# Data exploration (taken from CodeSearchNet)

In [None]:
import json

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint

## Preview dataset

In [None]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip

--2020-05-24 21:32:35--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.79.38
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.79.38|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1060569153 (1011M) [application/zip]
Saving to: ‘java.zip’


2020-05-24 21:34:08 (11.0 MB/s) - ‘java.zip’ saved [1060569153/1060569153]



In [None]:
!unzip java.zip

Archive:  java.zip
   creating: java/
   creating: java/final/
   creating: java/final/jsonl/
   creating: java/final/jsonl/train/
  inflating: java/final/jsonl/train/java_train_12.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_9.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_3.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_5.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_7.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_1.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_10.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_14.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_0.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_6.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_8.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_15.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_2.jsonl.gz  
  inflating: java/final/jsonl/train/java_train_4.jsonl.gz  
  inflating: java/final/j

In [None]:
!gzip -d java/final/jsonl/test/java_test_0.jsonl.gz

In [None]:
# Read the decompressed shard as raw text lines (one JSON record per line).
# The encoding is given explicitly: JSON text is UTF-8, whereas open()'s
# default depends on the platform locale.
with open('java/final/jsonl/test/java_test_0.jsonl', 'r', encoding='utf-8') as f:
    sample_file = f.readlines()
sample_file[0]

'{"repo": "ReactiveX/RxJava", "path": "src/main/java/io/reactivex/internal/observers/QueueDrainObserver.java", "func_name": "QueueDrainObserver.fastPathOrderedEmit", "original_string": "protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\\n        final Observer<? super V> observer = downstream;\\n        final SimplePlainQueue<U> q = queue;\\n\\n        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\\n            if (q.isEmpty()) {\\n                accept(observer, value);\\n                if (leave(-1) == 0) {\\n                    return;\\n                }\\n            } else {\\n                q.offer(value);\\n            }\\n        } else {\\n            q.offer(value);\\n            if (!enter()) {\\n                return;\\n            }\\n        }\\n        QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\\n    }", "language": "java", "code": "protected final void fastPathOrderedEmit(U value, boolean d

In [None]:
print(type(sample_file))
print(len(sample_file))

<class 'list'>
26909


In [None]:
pprint(json.loads(sample_file[0]))

{'code': 'protected final void fastPathOrderedEmit(U value, boolean '
         'delayError, Disposable disposable) {\n'
         '        final Observer<? super V> observer = downstream;\n'
         '        final SimplePlainQueue<U> q = queue;\n'
         '\n'
         '        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n'
         '            if (q.isEmpty()) {\n'
         '                accept(observer, value);\n'
         '                if (leave(-1) == 0) {\n'
         '                    return;\n'
         '                }\n'
         '            } else {\n'
         '                q.offer(value);\n'
         '            }\n'
         '        } else {\n'
         '            q.offer(value);\n'
         '            if (!enter()) {\n'
         '                return;\n'
         '            }\n'
         '        }\n'
         '        QueueDrainHelper.drainLoop(q, observer, delayError, '
         'disposable, this);\n'
         '    }',
 'code_tokens': ['pr

## Exploring the full dataset

In [None]:
!ls java/

_sp_bpe_modal.model  _sp_bpe_modal.vocab  final  text.txt


In [None]:
java_files = sorted(Path('java/').glob('**/*.gz'))

In [None]:
print('Total of related java files: {}'.format(len(java_files)))

Total of related java files: 17


In [None]:
# Full set of record fields kept when loading the dataset.
columns_long_list = ['repo', 'path', 'url', 'code',
                     'code_tokens', 'docstring', 'docstring_tokens',
                     'language', 'partition']

# Reduced set: the two token sequences plus the language/partition labels.
columns_short_list = ['code_tokens', 'docstring_tokens',
                      'language', 'partition']


def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    Parameters
    ----------
    file_list : iterable of str or pathlib.Path
        Gzip-compressed JSON-lines files to read.
    columns : list of str, optional
        Columns to keep from every file. Defaults to ``columns_long_list``.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in the order given and restricted to
        ``columns``. Each file keeps its own row index (the result is not
        renumbered). An empty ``file_list`` yields an empty frame with the
        requested columns.

    Raises
    ------
    KeyError
        If a file does not contain one of the requested columns.
    """
    frames = [pd.read_json(f,
                           orient='records',
                           compression='gzip',
                           lines=True)[columns]
              for f in file_list]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, sort=False)

In [None]:
java_df = jsonl_list_to_dataframe(java_files)

In [None]:
java_df.head(1)

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,spring-projects/spring-boot,spring-boot-project/spring-boot/src/main/java/org/springframework/boot/context/properties/bind/IndexedElementsBinder.java,https://github.com/spring-projects/spring-boot/blob/0b27f7c70e164b2b1a96477f1d9c1acba56790c1/spring-boot-project/spring-boot/src/main/java/org/springframework/boot/context/properties/bind/IndexedElementsBinder.java#L67-L77,"protected final void bindIndexed(ConfigurationPropertyName name, Bindable<?> target,\n\t\t\tAggregateElementBinder elementBinder, ResolvableType aggregateType,\n\t\t\tResolvableType elementType, IndexedCollectionSupplier result) {\n\t\tfor (ConfigurationPropertySource source : getContext().getSo...","[protected, final, void, bindIndexed, (, ConfigurationPropertyName, name, ,, Bindable, <, ?, >, target, ,, AggregateElementBinder, elementBinder, ,, ResolvableType, aggregateType, ,, ResolvableType, elementType, ,, IndexedCollectionSupplier, result, ), {, for, (, ConfigurationPropertySource, sou...","Bind indexed elements to the supplied collection.\n@param name the name of the property to bind\n@param target the target bindable\n@param elementBinder the binder to use for elements\n@param aggregateType the aggregate type, may be a collection or an array\n@param elementType the element type\n...","[Bind, indexed, elements, to, the, supplied, collection, .]",java,train


## Summary stats.

In [None]:
java_df.partition.value_counts()

train    454451
valid     15328
Name: partition, dtype: int64

In [None]:
java_df.groupby(['partition', 'language'])['code_tokens'].count()

partition  language
train      java        454451
valid      java         15328
Name: code_tokens, dtype: int64

In [None]:
# Number of tokens per example, for the code and for the docstring (query).
java_df['code_len'] = java_df['code_tokens'].map(len)
java_df['query_len'] = java_df['docstring_tokens'].map(len)

Code length percentiles by language

In [None]:
code_len_summary = java_df.groupby('language')['code_len'].quantile([.5, .7, .8, .9, .95])

In [None]:
display(pd.DataFrame(code_len_summary))

Unnamed: 0_level_0,Unnamed: 1_level_0,code_len
language,Unnamed: 1_level_1,Unnamed: 2_level_1
java,0.5,66.0
java,0.7,104.0
java,0.8,142.0
java,0.9,224.0
java,0.95,330.0


Query length percentile by language

In [None]:
query_len_summary = java_df.groupby('language')['query_len'].quantile([.5, .7, .8, .9, .95])
display(pd.DataFrame(query_len_summary))

Unnamed: 0_level_0,Unnamed: 1_level_0,query_len
language,Unnamed: 1_level_1,Unnamed: 2_level_1
java,0.5,11.0
java,0.7,18.0
java,0.8,25.0
java,0.9,39.0
java,0.95,60.0


## Data transformation

In [None]:
java_df.columns

Index(['repo', 'path', 'url', 'code', 'code_tokens', 'docstring',
       'docstring_tokens', 'language', 'partition', 'code_len', 'query_len'],
      dtype='object')

In [None]:
src_code_columns = ['code', 'code_tokens', 'code_len']

In [None]:
# Take an explicit copy: a bare column selection may be treated as a view of
# java_df, and adding a column to it later triggers SettingWithCopyWarning.
java_src_code_df = java_df[src_code_columns].copy()

Visualizing an example

In [None]:
java_src_code_df[:1]['code_tokens']

0    [protected, final, void, bindIndexed, (, ConfigurationPropertyName, name, ,, Bindable, <, ?, >, target, ,, AggregateElementBinder, elementBinder, ,, ResolvableType, aggregateType, ,, ResolvableType, elementType, ,, IndexedCollectionSupplier, result, ), {, for, (, ConfigurationPropertySource, sou...
Name: code_tokens, dtype: object

In [None]:
java_src_code_df.shape

(469779, 3)

In [None]:
# One 'src' label per row of java_src_code_df.
data_type_new_column = ['src'] * len(java_src_code_df)

In [None]:
len(data_type_new_column)

469779

In [None]:
java_src_code_df['data_type'] = data_type_new_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
java_src_code_df.head()

Unnamed: 0,code,code_tokens,code_len,data_type
0,"protected final void bindIndexed(ConfigurationPropertyName name, Bindable<?> target,\n\t\t\tAggregateElementBinder elementBinder, ResolvableType aggregateType,\n\t\t\tResolvableType elementType, IndexedCollectionSupplier result) {\n\t\tfor (ConfigurationPropertySource source : getContext().getSo...","[protected, final, void, bindIndexed, (, ConfigurationPropertyName, name, ,, Bindable, <, ?, >, target, ,, AggregateElementBinder, elementBinder, ,, ResolvableType, aggregateType, ,, ResolvableType, elementType, ,, IndexedCollectionSupplier, result, ), {, for, (, ConfigurationPropertySource, sou...",80,src
1,"public void setServletRegistrationBeans(\n\t\t\tCollection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) {\n\t\tAssert.notNull(servletRegistrationBeans,\n\t\t\t\t""ServletRegistrationBeans must not be null"");\n\t\tthis.servletRegistrationBeans = new LinkedHashSet<>(servletRegist...","[public, void, setServletRegistrationBeans, (, Collection, <, ?, extends, ServletRegistrationBean, <, ?, >, >, servletRegistrationBeans, ), {, Assert, ., notNull, (, servletRegistrationBeans, ,, ""ServletRegistrationBeans must not be null"", ), ;, this, ., servletRegistrationBeans, =, new, LinkedH...",37,src
2,"public void addServletRegistrationBeans(\n\t\t\tServletRegistrationBean<?>... servletRegistrationBeans) {\n\t\tAssert.notNull(servletRegistrationBeans,\n\t\t\t\t""ServletRegistrationBeans must not be null"");\n\t\tCollections.addAll(this.servletRegistrationBeans, servletRegistrationBeans);\n\t}","[public, void, addServletRegistrationBeans, (, ServletRegistrationBean, <, ?, >, ..., servletRegistrationBeans, ), {, Assert, ., notNull, (, servletRegistrationBeans, ,, ""ServletRegistrationBeans must not be null"", ), ;, Collections, ., addAll, (, this, ., servletRegistrationBeans, ,, servletReg...",33,src
3,"public void setServletNames(Collection<String> servletNames) {\n\t\tAssert.notNull(servletNames, ""ServletNames must not be null"");\n\t\tthis.servletNames = new LinkedHashSet<>(servletNames);\n\t}","[public, void, setServletNames, (, Collection, <, String, >, servletNames, ), {, Assert, ., notNull, (, servletNames, ,, ""ServletNames must not be null"", ), ;, this, ., servletNames, =, new, LinkedHashSet, <>, (, servletNames, ), ;, }]",32,src
4,"public void addServletNames(String... servletNames) {\n\t\tAssert.notNull(servletNames, ""ServletNames must not be null"");\n\t\tthis.servletNames.addAll(Arrays.asList(servletNames));\n\t}","[public, void, addServletNames, (, String, ..., servletNames, ), {, Assert, ., notNull, (, servletNames, ,, ""ServletNames must not be null"", ), ;, this, ., servletNames, ., addAll, (, Arrays, ., asList, (, servletNames, ), ), ;, }]",33,src


## Exploratory analysis

In [None]:
# export
# Imports
import dit
import math
import os
import logging

import matplotlib.pyplot as plt
import pandas as pd
import sentencepiece as sp

from collections import Counter
from pathlib import Path
from scipy.stats import sem, t
from statistics import mean, median, stdev
from tqdm.notebook import tqdm

# ds4se
from ds4se.mgmnt.prep.bpe import *
from ds4se.exp.info import *
from ds4se.desc.stats import *

In [None]:
path = Path('../data/traceability/semeru-format/LibEST_semeru_format'); path

PosixPath('../data/traceability/semeru-format/LibEST_semeru_format')

In [None]:
def simulate_getting_dataframes_from_mongo(path):
    """Build a corpus DataFrame from a dataset directory on disk.

    Reads every regular file found directly inside ``path/requirements``,
    ``path/source_code`` and ``path/test`` (in that order) and labels the
    rows ``'req'``, ``'src'`` and ``'test'`` respectively.

    Parameters
    ----------
    path : pathlib.Path
        Root directory holding the three sub-directories; it is joined to
        the sub-directory names with the ``/`` operator.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``file_name``, ``data_type`` and
        ``contents`` (the file's full text).
    """
    # (sub-directory, data_type label) pairs, in output order.
    sources = (("requirements", 'req'),
               ("source_code", 'src'),
               ("test", 'test'))
    corpus_data = {'file_name': [], 'data_type': [], 'contents': []}
    for subdir, data_type in sources:
        dir_path = path / subdir
        for file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, file)
            if not os.path.isfile(file_path):
                # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints),
                # which cannot be opened as text files.
                continue
            with open(file_path, "r") as f:
                contents = f.read()
            corpus_data['file_name'].append(file)
            corpus_data['data_type'].append(data_type)
            corpus_data['contents'].append(contents)
    return pd.DataFrame(data=corpus_data)

In [None]:
LIB_corpus_df = simulate_getting_dataframes_from_mongo(path)

In [None]:
LIB_corpus_df.head()

Unnamed: 0,file_name,data_type,contents
0,RQ17.txt,req,"REQUIREMENT 17: HTTP URIS FOR CONTROL\n \nThe EST server MUST support the use of the path-prefix of ""/.well- known/"" as defined in [RFC5785] and the registered name of ""est"". Thus, a valid EST server URI path begins with ""https://www.example.com/.well-known/est"". Each EST operation is indica..."
1,RQ46.txt,req,"REQUIREMENT 46: SERVER-SIDE KEY GENERATION RESPONSE\n \nIf the request is successful, the server response MUST have an HTTP 200 response code with a content-type of ""multipart/mixed"" consisting of two parts: one part is the private key data and the other part is the certificate data.\n\nThe ..."
2,RQ18.txt,req,"REQUIREMENT 18: HTTP-BASED CLIENT AUTHENTICATION\n \nThe EST server MAY request HTTP-based client authentication. This request can be in addition to successful TLS client authentication (Section 3.3.2) if EST server policy requires additional authentication. (For example, the EST server may ..."
3,RQ48.txt,req,"REQUIREMENT 48: CSR ATTRIBUTES REQUEST\n \nThe EST client requests a list of CA-desired CSR attributes from the CA by sending an HTTPS GET message to the EST server with an operations path of ""/csrattrs""."
4,RQ42.txt,req,"REQUIREMENT 42: SERVER-SIDE KEY GENERATION\n \nAn EST client may request a private key and associated certificate from an EST server using an HTTPS POST with an operation path value of ""/serverkeygen"". Support for the /serverkeygen function is OPTIONAL.\n\nA client MUST authenticate an EST s..."


In [None]:
LIB_corpus_df.shape

(87, 3)

In [None]:
java_path = Path('java/')

In [None]:
sp_model_from_df(java_src_code_df, output=java_path, model_name='_sp_bpe_modal', cols=['code'])

In [None]:
sp_processor = sp.SentencePieceProcessor()
sp_processor.Load(f"{java_path/'_sp_bpe_modal'}.model")

True

In [None]:
java_src_code_df.shape

(469779, 4)

In [None]:
# Fix the seed so the same 10,000 rows are drawn on every run and the
# results computed from java_code_df can be reproduced.
java_code_df = java_src_code_df.sample(n=10000, random_state=42)

In [None]:
java_code_df.shape

(10000, 4)

In [None]:
# Use the model to compute each file's entropy
java_doc_entropies = get_doc_entropies_from_df(java_code_df, 'code', java_path/'_sp_bpe_modal', ['src'])

In [None]:
print(java_doc_entropies)

[[5.689119141343584, 4.51839711669891, 5.129774456203049, 4.704511459715549, 4.988758439731456, 5.447961177183743, 5.244803663125517, 4.225619876671972, 5.848162822532568, 6.352343968852069, 4.865433135830284, 4.862644713288645, 5.173348868823392, 4.646072217435267, 5.038562939644918, 6.549913769124969, 4.639721323486284, 6.205991447736061, 5.8592632326780745, 4.229003731107053, 5.272846213149293, 4.656238668556686, 4.98418371977919, 5.234069531114784, 5.488423812848865, 4.8625759375402735, 5.525835888151542, 5.335986732962039, 4.538909765557392, 4.638522284065602, 5.841055561433243, 4.803055907333278, 4.885207329665017, 4.888788245606211, 6.115749936480819, 4.991348882946702, 4.949660533175186, 5.292758331677116, 5.70036093697389, 5.0931337391011064, 5.070159765557392, 4.538167351804303, 5.297022937710956, 5.763522926438308, 5.666110968098239, 4.961210371792345, 4.683364648336087, 5.117371992982752, 4.719349267862323, 5.026986833359288, 4.6119493340804425, 5.190656750337309, 5.1403441

In [None]:
# Use the model to compute the corpus-level entropy (a single value here,
# not one per file). The model prefix is built from java_path, the same
# location used for sp_processor and java_doc_entropies; `path` points at
# the LibEST dataset instead.
java_corpus_entropies = get_corpus_entropies_from_df(java_code_df, 'code', java_path/'_sp_bpe_modal', ['src'])

In [None]:
java_corpus_entropies

[7.861571334126392]

In [None]:
# Use the model to compute the entropy of the sampled system as a whole
# (a single value, not one per file). The model prefix is built from
# java_path, the same location used for sp_processor and java_doc_entropies;
# `path` points at the LibEST dataset instead.
java_system_entropy = get_system_entropy_from_df(java_code_df, 'code', java_path/'_sp_bpe_modal')

In [None]:
java_system_entropy

7.861571334126392

In [None]:
def flatten(l):
    """Concatenate the items of every sub-sequence of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
report_stats(flatten(java_doc_entropies))

In [None]:
java_doc_entropies

[[5.094551758782526,
  4.556752184325417,
  6.96774360935608,
  4.513153408920675,
  4.7718824009949685,
  5.489183191567506,
  4.528465488430548,
  4.44475497457937,
  4.810747068619437,
  4.427135636004301]]

In [None]:
# Histogram of the document entropy values, using 20 bins.
fig, ax = plt.subplots()
ax.hist(java_doc_entropies, bins=20)
ax.set_ylabel("Num Files")
ax.set_xlabel("Entropy")
plt.show()