In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer #HugginFace
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.float_format = '{:.2f}'.format

In [3]:
#export
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import functools

In [4]:
sns.set_theme(style="darkgrid")


In [5]:
#export
from subprocess import *
import os
import shutil

In [6]:
Popen

subprocess.Popen

In [7]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [8]:
def param_default(corpus='fm_fc_ms_ff'):
    """
    Builds the dictionary of paths and settings used throughout the notebook.

    Params:
    # corpus: (str) Name of the dataset folder under '../athena-datasets/' (the scope of the
    #         analysis). Defaults to 'fm_fc_ms_ff', so calling without arguments returns the
    #         same values as before.

    Returns:

    Dict with the tokenizer path ('bpe_path'), the raw [methods, tests] file pairs of each
    split ('eval_raw', 'test_raw', 'train_raw'), the names of those splits ('data_labels'),
    the folder for serialized DataFrames ('output_pandas'), the CK jar location
    ('ck_jar_path') and the folder for generated .java files ('physical_files_path').
    """
    data_path = Path('../athena-datasets/' + corpus + '/')
    data_path_raw = Path('../athena-datasets/' + corpus + '/raw/')
    tokenizer_path = Path('../tokenizer/')

    def raw_pair(split):
        # Every split stores the focal methods and their tests in two parallel text files.
        return [data_path_raw / f'{split}/input.methods.txt',
                data_path_raw / f'{split}/output.tests.txt']

    return {
        'bpe_path' : tokenizer_path / 'universal_tokenizer/universal_tokenizer/roberta_aug_spaces',
        'eval_raw': raw_pair('eval'),
        'test_raw': raw_pair('test'),
        'train_raw': raw_pair('train'),
        'data_labels' : ['eval_raw','test_raw','train_raw'],
        'output_pandas' : data_path / 'pandas/',
        'ck_jar_path':  'tool/ck-metrics.jar',
        'physical_files_path' : 'test_metrics'
    }

In [9]:
params = param_default()
params['bpe_path']

PosixPath('../tokenizer/universal_tokenizer/universal_tokenizer/roberta_aug_spaces')

# Exploratory Code Analysis [Metrics]

In [11]:
# export
def write_dataset_to_files(df_series, destination_path):
    """
    Function to generate .java files, one per record of the series.

    Each record is wrapped in a `public class ClassRecord<idx>{ ... }` shell, where <idx> is
    the record's index label in the series, and written to
    `<destination_path>/ClassRecord<idx>.java`.

    Params:
    # df_series: Pandas Series (DataFrame column) with the source code records (strings).
    # destination_path: (str or Path) Directory for the generated files. It is created,
    #                   together with any missing parent directories, when it does not exist.

    Returns:

    List of paths (str) for the corresponding java files, in the order of the series.

    """
    java_template = 'public class <class_name>{\n    <code_snippet>\n}'

    if not os.path.exists(destination_path):
        logging.info('Creating directory.')
        # makedirs instead of mkdir so that nested destinations (a/b/c) can be created.
        os.makedirs(destination_path, exist_ok=True)

    logging.info("Generating physical .java files.")

    file_paths = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for idx, value in df_series.items():
        class_name = f'ClassRecord{idx}'
        code = java_template.replace('<class_name>', class_name)
        code = code.replace('<code_snippet>', value)
        file_path = f'{destination_path}/{class_name}.java'
        with open(file_path, 'w') as file:
            file.write(code)
        file_paths.append(file_path)

    return file_paths

In [12]:
# export
def jarWrapper(*args):
    """
    Runs `java -jar <args...>` and collects everything the process printed.

    Params:
    # args: Arguments placed after `java -jar` (the jar path first, then the jar's own arguments).

    Returns:

    List of the non-empty output lines as bytes, without line terminators: the stdout lines
    first, followed by the stderr lines.
    """
    process = Popen(['java', '-jar'] + list(args), stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes until the process exits. Reading only stdout line by
    # line while stderr is also piped can block forever once the child fills the stderr buffer.
    stdout, stderr = process.communicate()
    # The pipes are binary, so the lines are bytes (comparing them with '' never matches);
    # blank lines are dropped here.
    return [line for line in stdout.splitlines() + stderr.splitlines() if line]

In [13]:

# export
class JavaAnalyzer():
    """
    Class to get class-level source code metrics for java snippets through the CK jar.

    The snippets are written to disk as .java files, the jar is run over that directory and
    the `class.csv` report found in the current working directory is loaded into a DataFrame.
    """
    def __init__(self, ck_jar_path):
        # Path of the CK .jar file; it is passed as the first argument of `java -jar`.
        self.ck_jar_path = ck_jar_path
    
    def compute_metrics(self, df_series, files_destination_path):
        """
        Computes metrics for a pandas series of java source code snippets
        
        Params
        # df_series: Pandas series (df column) containing java source snippets
        # files_destination_path: Path indicating where the physical .java files are going to be created (for metrics computation)
        
        Returns:
        
        Pandas Dataframe containing metrics
        
        Raises:
        
        Whatever reading the report raises (e.g. FileNotFoundError when no `class.csv` was
        produced). The generated .java files and csv reports are removed in that case too.
        
        """
        file_paths = write_dataset_to_files(df_series, files_destination_path)
        try:
            self.__call_ck_package(files_destination_path)
            metrics_df = self.__get_metrics_df()
        finally:
            # Clean up even when the jar run or the report parsing fails, so a failed run
            # does not leave temporary .java files and stale csv reports behind.
            self.__remove_csv_files()
            self.__remove_tmp_java_files(file_paths)
        
        return metrics_df
        
    def __call_ck_package(self, files_path):
        """
        Performs call to external .jar package.
        """
        # NOTE(review): the three trailing values are positional options of the jar; their
        # meaning is not visible in this file — confirm against the CK documentation.
        args = [self.ck_jar_path, files_path, 'false', '0', 'True']
        result = jarWrapper(*args)
        logging.info(f'CK package produced this output:\n{result}')
        
    def __get_metrics_df(self):
        """
        Reads report files (csv) generated by the CK package.
        
        Only `class.csv` (looked up in the current working directory) is read.
        
        Returns:
        
        Pandas Dataframe containing appropriate metrics
        """
        class_metrics_df = pd.read_csv('class.csv')

        appropriate_columns = ['file','class', 'wmc', 'totalMethodsQty', 'staticMethodsQty', 'publicMethodsQty', 'privateMethodsQty',
                          'protectedMethodsQty', 'defaultMethodsQty', 'abstractMethodsQty', 'finalMethodsQty','synchronizedMethodsQty',
                          'totalFieldsQty', 'staticFieldsQty', 'publicFieldsQty', 'privateFieldsQty', 'protectedFieldsQty',
                          'defaultFieldsQty', 'visibleFieldsQty', 'finalFieldsQty', 'synchronizedFieldsQty',
                          'nosi', 'loc', 'returnQty', 'loopQty', 'comparisonsQty', 'tryCatchQty', 'parenthesizedExpsQty',
                          'stringLiteralsQty', 'numbersQty', 'assignmentsQty', 'mathOperationsQty', 'variablesQty', 'maxNestedBlocksQty',
                          'anonymousClassesQty', 'innerClassesQty', 'lambdasQty', 'uniqueWordsQty', 'modifiers']

        # Keep only the selected metric columns, in the order listed above.
        return class_metrics_df[appropriate_columns]
    
    def __remove_csv_files(self):
        """
        Removes files generated by CK package.
        """
        for report_name in ('class.csv', 'method.csv', 'field.csv'):
            if os.path.exists(report_name):
                os.remove(report_name)
            
    def __remove_tmp_java_files(self, paths):
        """
        Removes the temporary generated java files.
        """
        for file_path in paths:
            # Skip files that are already gone, mirroring the csv cleanup, so that cleanup
            # never hides the original error of a failed run.
            if os.path.exists(file_path):
                os.remove(file_path)

In [14]:
# Loading Json Sets
def load_checkpoint_1():
    """
    Reloads the serialized DataFrames of every dataset split.

    For each label in params['data_labels'] and each position i of params[label], the file
    params['output_pandas'] / '<label><i>.json' is read with pandas and a progress line is printed.

    Returns:

    Dict mapping '<label><i>' to the loaded DataFrame.
    """
    checkpoints = {}
    for split_label in params['data_labels']:
        for position, _source_file in enumerate(params[split_label]):
            key = split_label + str(position)
            json_file = params['output_pandas'] / (key + '.json')
            checkpoints[key] = pd.read_json(json_file)
            print("read:", key)
    return checkpoints

In [15]:
super_data = load_checkpoint_1()

read: eval_raw0
read: eval_raw1
read: test_raw0
read: test_raw1
read: train_raw0
read: train_raw1


In [16]:
java_analyzer = JavaAnalyzer( params['ck_jar_path'] )

In [17]:
df = super_data['eval_raw0']

In [18]:
df['eval_raw0']

0        RosetteAbstractProcessor extends AbstractProce...
1        DateHelper { public synchronized static String...
2        UiLesson implements Serializable { public bool...
3        NormalizationUtil { static public FieldValue n...
4        FunctionUtil { static public FieldValue evalua...
                               ...                        
78529    MsModelUtils { public MsModelUtils() { } MsMod...
78530    OnapPdpEngineFactory extends PDPEngineFactory ...
78531    PolicyScopeDictionaryController { @RequestMapp...
78532    ClosedLoopDictionaryController { @RequestMappi...
78533    MicroServiceDictionaryController { @RequestMap...
Name: eval_raw0, Length: 78534, dtype: object

In [19]:
# Creating files: write every record of the 'eval_raw0' column to
# <physical_files_path>/ClassRecord<idx>.java and keep the returned list of paths.
df_cross_v0_paths = write_dataset_to_files( super_data['eval_raw0']['eval_raw0'], params['physical_files_path'] )


2021-10-29 22:52:33,974 : INFO : Generating physical .java files.


In [21]:
len(df_cross_v0_paths)

78534

In [22]:
# Attach the generated file path of each record (the paths come back in row order).
# NOTE: df is the very object stored in super_data['eval_raw0'] (assigned without a copy),
# so this column is added to that frame as well.
df['file'] = df_cross_v0_paths

In [23]:
df.head()

Unnamed: 0,eval_raw0,eval_raw_bpe0,method_size0,file
0,RosetteAbstractProcessor extends AbstractProce...,"[Ros, ette, Abstract, Process, or, Ġextends, Ġ...",220,test_metrics/ClassRecord0.java
1,DateHelper { public synchronized static String...,"[Date, Helper, Ġ{, Ġpublic, Ġsynchronized, Ġst...",76,test_metrics/ClassRecord1.java
2,UiLesson implements Serializable { public bool...,"[U, i, Less, on, Ġimplements, ĠSerial, izable,...",126,test_metrics/ClassRecord2.java
3,NormalizationUtil { static public FieldValue n...,"[Normal, ization, Ut, il, Ġ{, Ġstatic, Ġpublic...",103,test_metrics/ClassRecord3.java
4,FunctionUtil { static public FieldValue evalua...,"[Function, Ut, il, Ġ{, Ġstatic, Ġpublic, ĠFiel...",162,test_metrics/ClassRecord4.java


In [24]:
# Running this cell requires a Java runtime (e.g. Java 11): jarWrapper launches `java -jar`,
# so the `java` executable must be resolvable on the PATH.
# compute_metrics writes the .java files again into the same folder, runs the CK jar on them
# and then deletes those files, so the paths stored in df['file'] no longer exist afterwards.
df_java_metrics = java_analyzer.compute_metrics( df['eval_raw0'], params['physical_files_path'] )


2021-10-29 22:54:35,069 : INFO : Generating physical .java files.
Bad pipe message: %s [b'$\x10\xd9\x9a\x84\x81\x0fy\x92CT\xbc\x92\xf6\xfa\xe3\x893 jU\xe0Q\xff\x10\x0c\x80\xa7AY\t\xaen/\x80\xd0\x85\xdf5\x9f\xc6\x12\xe8\x0b"G\xb0\xe3\x939|\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pipe message: %s [b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \x0b\xa9\xb9\x1b\x85\x87<3\xf8v\x84\xbd\x04.\x88\xc1\x9f\xf4\x13I\xf6\x85']
Bad pipe message: %s [b'\tb9g\x1e\xa2\t\xcdF\xa1v~\x11\xb1\x135\x07\xaa\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0', b"/\x00\xa2\x00\

In [25]:
df_java_metrics

Unnamed: 0,file,class,wmc,totalMethodsQty,staticMethodsQty,publicMethodsQty,privateMethodsQty,protectedMethodsQty,defaultMethodsQty,abstractMethodsQty,...,numbersQty,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers
0,/data/david/dummy/test_metrics/ClassRecord9048...,ClassRecord9048,15,10,10,1,0,0,9,10,...,5,2,2,1,1,0,0,0,21,1
1,/data/david/dummy/test_metrics/ClassRecord5462...,ClassRecord54623,11,10,0,0,0,1,9,11,...,0,0,0,1,0,0,0,0,23,1
2,/data/david/dummy/test_metrics/ClassRecord2884...,ClassRecord28847,14,5,0,1,0,1,3,5,...,0,9,1,5,1,0,0,0,46,1
3,/data/david/dummy/test_metrics/ClassRecord3612...,ClassRecord36123,20,19,0,1,0,0,18,19,...,0,0,0,0,0,0,0,0,44,1
4,/data/david/dummy/test_metrics/ClassRecord7646...,ClassRecord76461,7,3,0,1,0,0,2,3,...,1,4,0,3,1,0,0,0,24,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81270,/data/david/dummy/test_metrics/ClassRecord4446...,ClassRecord44463,8,4,0,1,0,0,3,4,...,0,2,1,2,2,0,0,0,37,1
81271,/data/david/dummy/test_metrics/ClassRecord2636...,ClassRecord26363,5,4,4,1,0,0,3,5,...,0,0,0,1,0,0,0,0,18,1
81272,/data/david/dummy/test_metrics/ClassRecord6585...,ClassRecord65856,84,79,0,1,0,0,78,79,...,0,3,0,2,2,0,0,0,103,1
81273,/data/david/dummy/test_metrics/ClassRecord1963...,ClassRecord19632,14,11,0,1,0,0,10,11,...,1,1,0,1,1,0,0,0,25,1


In [26]:
# Persist the CK metrics as eval_raw_metrics.json in the same folder load_checkpoint_1 reads from.
df_java_metrics.to_json( params['output_pandas'] / ('eval_raw_metrics' +'.json') ) #serialization

In [1]:
# NOTE: the recorded run of this cell has execution count 1, i.e. it ran before the cells that
# define df_java_metrics (presumably after a kernel restart), hence the NameError shown below.
# Run it after the metrics cell to get the summary statistics.
df_java_metrics.describe()

NameError: name 'df_java_metrics' is not defined