In [0]:
# default_exp data.exploratory

# Exploration of your data

> This module comprises all the statistical and inference techniques to describe the inner properties of software data. The submodules might include:
>
> - Descriptive statistics
> - Software Metrics
> - Information Theory
> - Learning Principles Detection (Occam's Razor, Biased Data, and Data Snooping)
> - Inference: Probabilistic and Causal

In [0]:
!pip install dit
!pip install sencencepiece

In [0]:
# export
# Imports
import pandas as pd
import sentencepiece as sp
import dit

In [0]:
# #hide
# from nbdev.showdoc import *

In [0]:
#Import dataframe from MongoDB
import os
def _read_dir_as_df(path):
  """Load every regular file directly under `path` into a (file_name, contents) dataframe."""
  corpus = {'file_name': [], 'contents': []}
  # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
  for file_name in sorted(os.listdir(path)):
    full_path = os.path.join(path, file_name)
    # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as text files; skip them.
    if not os.path.isfile(full_path):
      continue
    with open(full_path, "r") as f:
      contents = f.read()
    corpus['file_name'].append(file_name)
    corpus['contents'].append(contents)
  return pd.DataFrame(data = corpus)


def simulate_getting_dataframes_from_mongo():
  """Build the requirements and source-code dataframes from local folders.

  Stand-in for the MongoDB import: reads the files found in ``./requirements``
  and ``./source_code`` (resolved against the current working directory).

  Returns:
    tuple: ``(req_df, src_df)``. Each dataframe has one row per file, sorted by
    file name, with the columns ``file_name`` and ``contents``.

  Raises:
    FileNotFoundError: if either folder does not exist.
  """
  req_df = _read_dir_as_df("./requirements")
  src_df = _read_dir_as_df("./source_code")
  return req_df, src_df

In [0]:
# export
def df_to_txt_file(df, output, cols=None):
    """Write dataframe columns to a text file that SentencePiece can use to train a BPE model.

    The selected columns are stacked one after another, missing values are
    dropped, every remaining value is converted to ``str``, and the values are
    joined with newlines.

    Args:
        df: pandas dataframe holding the text.
        output: path prefix; the file is written to ``output + 'text.txt'``.
        cols: column names to include. ``None`` (the default) uses every column.

    Returns:
        str: the path of the written file, ``output + 'text.txt'``.

    Raises:
        ValueError: from ``pd.concat`` when there are no columns to merge.
    """
    if cols is None: cols = list(df.columns)
    merged_df = pd.concat([df[col] for col in cols])

    # str.join rejects NaN/None and numeric cells, so drop the gaps and coerce the rest.
    lines = merged_df.dropna().astype(str)

    out_path = output + 'text.txt'
    # SentencePiece reads its training corpus as UTF-8, so do not rely on the platform default.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return out_path

In [0]:
# export
def gen_sp_model(df, output, model_name, cols = None):
    """Fit a SentencePiece BPE model on text taken from a pandas dataframe.

    The dataframe is first dumped to a text corpus via ``df_to_txt_file``; the
    trainer is then run on that file with ``model_type=bpe``.

    Args:
        df: dataframe holding the training text.
        output: path prefix shared by the corpus file and the model files.
        model_name: suffix appended to ``output`` to form the model prefix.
        cols: columns to use; forwarded unchanged to ``df_to_txt_file``.

    Returns:
        The model prefix, ``output + model_name``.
    """
    corpus_path = df_to_txt_file(df, output, cols)
    model_prefix = output + model_name
    trainer_flags = (
        f'--input={corpus_path} --model_prefix={model_prefix}'
        ' --hard_vocab_limit=false --model_type=bpe'
    )
    sp.SentencePieceTrainer.train(trainer_flags)
    return model_prefix

In [0]:
#Compute entropy of all the files per system and calculate mean, std, median, and std for median absolute deviation. The idea is to create confidence intervals for each system/dataset
# Load both corpora with the local-folder loader defined in this notebook
# (the previously referenced `get_dataframes_from_mongo` is not defined here),
# then train a BPE tokenizer on the requirements text only.
req_df, src_df = simulate_getting_dataframes_from_mongo()
model_prefix = gen_sp_model(req_df, output='requirements', model_name='_sp_bpe_modal', cols=['contents'])

In [69]:
# Load the BPE model trained above and tally how often each sub-word piece
# occurs across all requirement documents.
sp_proc = sp.SentencePieceProcessor()
sp_proc.Load(f"{model_prefix}.model")

# random_subset = bpe_df.sample(n=1000)

freq = {}
for file in req_df.contents:
    encoding = sp_proc.encode_as_pieces(file)
    for piece in encoding:
        freq[piece] = freq.get(piece, 0) + 1

# Order the (piece, count) pairs from least to most frequent.
sorted_freq = list(freq.items())
sorted_freq.sort(key=lambda pair: pair[1])
print(sorted_freq)



In [25]:
# Sanity check of dit's Shannon entropy on a small four-outcome distribution:
# the first list holds the outcomes, the second their probabilities.
d = dit.Distribution(
    ['1', '2', '3', '4'],
    [0.125, 0.125, 0.25, 0.5],
)
print(dit.shannon.entropy(d))

1.75


In [0]:
#Rank the system/datasets according to the confidence intervals
#Compute the confidence intervals for all cross-entropy values
#Rank the systems/datasets according to cross-entropy values
#Top 50 most frequent tokens of each system and corpus (one system has generally two corpora)
#Top 50 least frequent tokens of each system and corpus
#What are the tokens that are in the target and not in the source (and the other way around)? Compute the distribution for those tokens
#What are the mutual tokens (source and target)? please compute distribution
#Compute confidence intervals for the software metrics on source code (e.g., cyclomatic complexity, LOC, LCOM5)


In [0]:
# TODO: Visualize

In [0]:
# TODO: Push updated fields to Mongo