In [None]:
# default_exp data.eda

In [None]:
# hide
# Notebook-only setup: reload imported modules automatically before each cell
# executes (autoreload mode 2) and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
! pip install sentencepiece

# Exploratory Data Analysis 
>
>@danaderp 11.17.20 This is an exploratory data analysis of the CodeSearchNet dataset. The goal of the exploration is to understand how token information is distributed across the code.
>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time

In [None]:
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
import ast
import datetime
import functools
from collections import Counter
from operator import or_

In [None]:
import sentencepiece as spm

In [None]:
import pickle

In [None]:
# export
import logging

# Configure the root logger: records at INFO level and above are emitted,
# each rendered as "<timestamp> : <level name> : <message>".
logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)

Fixing parameters

In [None]:
def params():
    """Build the configuration dictionary used throughout this notebook.

    Returns a fresh dict on every call containing:
      - "dummy_path", "checkpoint_dir", "dataset", "bpe", "eda": relative
        path strings under ``../dvc-icodegen/``.
      - "hyper": a nested dict of integer settings (``seq_length``,
        ``batch_size``, ``buffer_size``, ``rnn_units``, ``epochs``).
    """
    hyper = {
        "seq_length": 100,
        "batch_size": 64,
        "buffer_size": 10000,
        "rnn_units": 1024,
        "epochs": 10,
    }
    # Keys are inserted in the same order as the original literal.
    config = {"dummy_path": "../dvc-icodegen/dummy/", "hyper": hyper}
    config["checkpoint_dir"] = "../dvc-icodegen/models/"
    config["dataset"] = (
        "../dvc-icodegen/searchnet/[codesearchnet-java-1597073966.81902].csv"
    )
    config["bpe"] = "../dvc-icodegen/bpe/tokenizer-java"
    config["eda"] = "../dvc-icodegen/eda/"
    return config

In [None]:
# Materialize the configuration dictionary. This rebinds the name `params`
# from the factory function to the dict it returns, so the call is only made
# while `params` is still callable; re-running the cell is then a no-op
# instead of raising "TypeError: 'dict' object is not callable".
if callable(params):
    params = params()

In [None]:
# tst
# Sanity check: log a few configuration entries (the sequence length, the
# dataset path and the EDA output directory) to confirm `params` is loaded.
logging.info(params["hyper"]["seq_length"])
logging.info(params["dataset"])
logging.info(params["eda"])

2020-11-17 16:06:10,751 : INFO : 100
2020-11-17 16:06:10,752 : INFO : ../dvc-icodegen/searchnet/[codesearchnet-java-1597073966.81902].csv
2020-11-17 16:06:10,753 : INFO : ../dvc-icodegen/eda/


# Loading Datasets

In [None]:
# Load the full dataset from the path configured in params["dataset"];
# the file is parsed with "~" as the field separator.
java_df = pd.read_csv(params["dataset"], sep="~")

In [None]:
# Preview the first rows of the loaded dataset.
java_df.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
0,apache/spark,sql/hive-thriftserver/src/main/java/org/apache...,https://github.com/apache/spark/blob/25ee0474f...,<@>Override<n> <public> OperationHandle getSc...,"['@', 'Override', 'public', 'OperationHandle',...",/* (non-Javadoc)\n@see org.apache.hive.service...,"['/', '*', '(', 'non', '-', 'Javadoc', ')']",java,train,"['▁', '@', 'Override', '▁public', '▁Operation'...",31.0,43.0
1,BlueBrain/bluima,modules/bluima_typesystem/src/main/java/de/jul...,https://github.com/BlueBrain/bluima/blob/793ea...,<public> <void> setTextObjects<(>FSArray v<)> ...,"['public', 'void', 'setTextObjects', '(', 'FSA...",setter for textObjects - sets the text objects...,"['setter', 'for', 'textObjects', '-', 'sets', ...",java,train,"['▁public', '▁void', '▁setText', 'Objects', '(...",64.0,92.0
2,aws/aws-sdk-java,aws-java-sdk-athena/src/main/java/com/amazonaw...,https://github.com/aws/aws-sdk-java/blob/aa385...,<public> <void> marshall<(>WorkGroup workGroup...,"['public', 'void', 'marshall', '(', 'WorkGroup...",Marshall the given parameter object.,"['Marshall', 'the', 'given', 'parameter', 'obj...",java,train,"['▁public', '▁void', '▁marshall', '(', 'Work',...",117.0,140.0
3,softindex/datakernel,core-bytebuf/src/main/java/io/datakernel/byteb...,https://github.com/softindex/datakernel/blob/0...,<public> <void> put<(><@>NotNull <byte><[><]> ...,"['public', 'void', 'put', '(', '@', 'NotNull',...",Puts given byte array to the {@link ByteBuf} f...,"['Puts', 'given', 'byte', 'array', 'to', 'the'...",java,train,"['▁public', '▁void', '▁put', '(', '@', 'NotNul...",64.0,72.0
4,lunisolar/magma,magma-func-builder/src/main/java/eu/lunisolar/...,https://github.com/lunisolar/magma/blob/83809c...,<@>Non<null><n>\t<public> <static> <lesser>T<g...,"['@', 'Nonnull', 'public', 'static', '<', 'T',...",One of ways of creating builder. This is possi...,"['One', 'of', 'ways', 'of', 'creating', 'build...",java,train,"['▁', '@', 'Nonnull', '▁public', '▁static', '▁...",46.0,52.0


In [None]:
# Partitions
# Split the dataset by the value of its "partition" column. Each split is an
# explicit copy so that later cells can modify it (for example by adding a
# column) without pandas raising SettingWithCopyWarning and without any
# ambiguity about whether `java_df` is affected.
df_train = java_df[java_df["partition"] == "train"].copy()
df_valid = java_df[java_df["partition"] == "valid"].copy()
df_test = java_df[java_df["partition"] == "test"].copy()
df_bpe = java_df[java_df["partition"] == "bpe"].copy()

In [None]:
# Keep the four partition frames in a fixed order (train, valid, test, bpe)
# so summary cells can iterate over all of them.
list_all_partitions = [df_train, df_valid, df_test, df_bpe]

In [None]:
# Report the (rows, columns) shape of every partition.
logging.info(f"Train Partition Size {df_train.shape}")
logging.info(f"Valid Partition Size {df_valid.shape}")
logging.info(f"Test Partition Size {df_test.shape}")
logging.info(f"BPE Partition Size {df_bpe.shape}")

2020-11-17 12:10:39,058 : INFO : Train Partition Size (384868, 12)
2020-11-17 12:10:39,059 : INFO : Valid Partition Size (14605, 12)
2020-11-17 12:10:39,060 : INFO : Test Partition Size (25011, 12)
2020-11-17 12:10:39,062 : INFO : BPE Partition Size (42719, 12)


In [None]:
# Source code of the train partition as a Series. Plain column selection is
# used instead of `pop` so that `df_train` is left intact and this cell can be
# re-run without raising KeyError for the already-removed "code" column.
code_train = df_train["code"]

In [None]:
# Preview the first few code entries of the train partition.
code_train.head()

0    <@>Override<n>  <public> OperationHandle getSc...
1    <public> <void> setTextObjects<(>FSArray v<)> ...
2    <public> <void> marshall<(>WorkGroup workGroup...
3    <public> <void> put<(><@>NotNull <byte><[><]> ...
4    <@>Non<null><n>\t<public> <static> <lesser>T<g...
Name: code, dtype: object

# Data Transformations 

In [None]:
# Number of tokens per method in the train partition.
# Each "code_tokens" cell stores the string form of a Python list, so it is
# parsed with ast.literal_eval: unlike eval, it only accepts literals and
# never executes text coming from the dataset file.
count_subwords = [
    len(ast.literal_eval(mtd)) for mtd in df_train["code_tokens"].values
]

In [None]:
# Store the per-method token counts as a new "count_tokens" column of the
# train partition (one value per row, in row order).
df_train["count_tokens"] = count_subwords

In [None]:
# BPE

# Descriptive Statistics

All Java Set

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) for the whole dataset.
java_df.describe()

Unnamed: 0,code_len,bpe32_len
count,467203.0,467203.0
mean,113.079653,146.274557
std,189.121245,303.804009
min,20.0,20.0
25%,42.0,50.0
50%,67.0,81.0
75%,122.0,150.0
max,27192.0,52975.0


All Java Partitions

In [None]:
# Summary statistics for the train partition only.
df_train.describe()

Unnamed: 0,code_len,bpe32_len
count,384868.0,384868.0
mean,113.720826,147.151002
std,194.007951,313.904001
min,20.0,20.0
25%,42.0,50.0
50%,67.0,81.0
75%,122.0,151.0
max,27192.0,52975.0


In [None]:
# Log the summary statistics of every partition as a single list record.
logging.info(list(map(pd.DataFrame.describe, list_all_partitions)))

2020-11-17 12:31:54,629 : INFO : [            code_len      bpe32_len
count  384868.000000  384868.000000
mean      113.720826     147.151002
std       194.007951     313.904001
min        20.000000      20.000000
25%        42.000000      50.000000
50%        67.000000      81.000000
75%       122.000000     151.000000
max     27192.000000   52975.000000,            code_len     bpe32_len
count  14605.000000  14605.000000
mean      94.331736    120.097843
std      115.802231    171.079255
min       21.000000     21.000000
25%       39.000000     46.000000
50%       59.000000     71.000000
75%      104.000000    127.000000
max     3099.000000   5747.000000,            code_len     bpe32_len
count  25011.000000  25011.000000
mean     114.274599    148.204710
std      166.432695    245.938732
min       21.000000     22.000000
25%       43.000000     52.000000
50%       69.000000     84.000000
75%      125.000000    155.000000
max     5685.000000  10015.000000,            code_len     bpe

In [None]:
# Median absolute deviation of the code length, one value per partition.
# `stats.median_absolute_deviation` was deprecated in SciPy 1.5 and removed in
# later releases; `median_abs_deviation` is its replacement. scale="normal"
# applies the normal-consistency factor (about 1.4826) that the old function
# used by default, so the reported values stay comparable.
logging.info(
    [
        stats.median_abs_deviation(p["code_len"].values, scale="normal")
        for p in list_all_partitions
    ]
)

2020-11-17 12:29:30,408 : INFO : [45.9606, 37.065, 47.4432, 44.477999999999994]


Train Partition

In [None]:
# Frequent tokens: start from the raw "code_tokens" column of the train
# partition, taken as an array of its cell values.
train_tokens = df_train["code_tokens"].values

In [None]:
# Turn every stored token-list string back into a real Python list.
# ast.literal_eval is used instead of eval because the strings come from the
# dataset file: literal_eval only accepts literals and cannot run code.
train_vocab_tokens = [ast.literal_eval(method) for method in train_tokens]

In [None]:
# One Counter per method, mapping each token to its occurrences in that method.
train_counter_tokens = list(map(Counter, train_vocab_tokens))

In [None]:
# Merge the per-method counters into one counter for the whole partition.
# A single accumulator is updated in place, so the cost grows with the number
# of counted entries. Folding with `a + b` instead rebuilt the entire running
# total for every method, which is what made this cell so slow. With no
# methods at all the result is an empty Counter rather than the TypeError
# that functools.reduce raises on an empty sequence.
train_counter = Counter()
for method_counter in train_counter_tokens:
    train_counter.update(method_counter)

In [None]:
# Show only the most frequent tokens; displaying the whole Counter would dump
# the entire vocabulary into the cell output.
train_counter.most_common(20)

NameError: name 'train_counter' is not defined

In [None]:
# Save the merged token counter as a pickle inside the EDA directory; the
# file name embeds the current timestamp in square brackets.
with open(
    f"{params['eda']}[{datetime.datetime.now()}]-codesearchnet_token_counts.pickle",
    "wb",
) as outputfile:
    pickle.dump(train_counter, outputfile)

In [None]:
# hide
# nbdev export step, kept as the final cell of the notebook.
# NOTE(review): presumably this writes the `# export` cells to the module named
# by the `# default_exp` directive at the top — confirm against the nbdev version in use.
from nbdev.export import notebook2script

notebook2script()