In [0]:
# default_exp data.exploratory.se.metrics

# Exploration of your data

> This module comprises some of the statistical and inference techniques to describe the inner properties of software data. The submodules might include:
>
> - Descriptive statistics
> - Software Metrics
> - Information Theory
> - Learning Principels Detection (Occams' Razor, Biased data, and Data Snooping)
> - Inference: Probabilistic and Causal

### Specifically in this module

> - Cyclomatic complexity (CYCLO)
> - Coupling Between Objects (CBO)
> - Lack of Cohesion of Methods 5 (LCOM5)
> - Top/Least 20 Occurring Tokens
> - Percentage of redundancy (just Java)
> - How many Type I and Type II clones are in Java datasets?

In [0]:
# #hide
# from nbdev.showdoc import *

In [3]:
!pip install sentencepiece
!pip install metrics

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.8MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.85
Collecting metrics
  Downloading https://files.pythonhosted.org/packages/01/ae/3ab18f2f3449f2e7931112c991ade9684eeddf96cea03ea7f662c01f0658/metrics-0.3.3.tar.gz
Collecting Pygments==2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/02/ee/b6e02dc6529e82b75bb06823ff7d005b141037cb1416b10c6f00fc419dca/Pygments-2.2.0-py2.py3-none-any.whl (841kB)
[K     |████████████████████████████████| 849kB 3.9MB/s 
[?25hCollecting pathspec==0.5.5
  Downloading https://files.pythonhosted.org/packages/9f/fb/5a901a3b1eeebf83af6da74ecca69d7daf5189e450f0f4cccf9c19132651/pathspec-0.5.5.tar.gz
Collecting pathlib2>=2.3.0
  Downloading

In [0]:
# export
# Imports
import pandas as pd
import sentencepiece as sp
from numpy import mean, std
from statistics import median
from scipy.stats import sem, t

# TODO: Remove when mongo call is implemented
import os

In [0]:
# TODO: Replace with actual mongo call
def simulate_getting_dataframes_from_mongo():
    corpus_data = {'file_name': [], 'data_type': [], 'contents': []}
    path = "./source_code"
    for file in os.listdir(path):
        corpus_data['file_name'].append(file)
        corpus_data['data_type'].append('src')
        with open (os.path.join(path, file), "r") as f:
            corpus_data['contents'].append(f.read())
    path = "./tests"
    for file in os.listdir(path):
        corpus_data['file_name'].append(file)
        corpus_data['data_type'].append('test')
        with open (os.path.join(path, file), "r") as f:
            corpus_data['contents'].append(f.read())
    corpus_df = pd.DataFrame(data = corpus_data)
    return corpus_df

In [0]:
# export
# TODO: Change based on format of input data from Mongo
def df_to_txt_file(df, output, cols):
    """Converts a dataframe into a text file that SentencePiece can use to train a BPE model"""
    if cols is None: cols = list(df.columns)
    merged_df = pd.concat([df[col] for col in cols])
    
    with open(output + '_text.txt', 'w') as f:
        f.write('\n'.join(list(merged_df)))
    return output + '_text.txt'

In [0]:
# export
# TODO: Change based on format of input data from Mongo
def gen_sp_model(df, output, model_name, cols=None):
    """Trains a SentencePiece BPE model from a pandas dataframe"""
    fname = df_to_txt_file(df, output, cols)
    sp.SentencePieceTrainer.train(f'--input={fname} --model_prefix={output + model_name} --hard_vocab_limit=false --model_type=bpe')
    return output + model_name

In [0]:
# export
def add_length_col(df, col, length_col_name='length'):
    """Adds a length column with the length of entries in the specified column of a dataframe"""
    df[length_col_name] = df[col].astype(str).apply(len)
    return df

In [0]:
# export
def stat_metrics(df, col, data_types=None, conf = 0.95, sig_figs = 4):
    """Computes statistical metrics about the entries in a dataframe column"""
    if data_types != None: 
        df = df[df['data_type'].isin(data_types)]
        print(f"~~Printing metrics for {data_types} in column [{col}]~~")
    else:
        print(f"~~Printing metrics for all data in column [{col}]~~")

    print("Min =", round(df[col].min(), sig_figs))
    print("Max =", round(df[col].max(), sig_figs))
    print("Average =", round(df[col].mean(), sig_figs))
    print("Median =", round(median(df[col]), sig_figs))
    print("Standard Deviation =", round(std(df[col]), sig_figs))
    
    n = len(df[col])
    m = mean(df[col])
    std_err = sem(df[col])
    h = std_err * t.ppf((1 + conf) / 2, n - 1)

    start = m - h
    end = m + h
    print(f"{conf} of data points fall between {round(start, sig_figs)} and {round(end, sig_figs)}")

In [0]:
import subprocess
import json
def add_mccabe_metrics(df, col):
    mccabe = []
    sloc = []
    comments = []
    ratio = []
    for i in range(len(df)):
        if i == 99:
            print("At file #100", end="")
        if ((i + 1) % 200) == 0:
            print(", file #" + str(i + 1), end="")
        ext = df["file_name"][i].split('.')[-1]
        if ext == 'h':
            ext = 'c'
        with open(f"temp_file.{ext}", "w", encoding='UTF-8') as fp:
            fp.write(df[col][i])
        out = subprocess.Popen(['metrics', '-q', '--format=json', f"temp_file.{ext}"], 
                               stdout=subprocess.PIPE, 
                               stderr=subprocess.STDOUT)
        stdout,stderr = out.communicate()
        metrics_data = json.loads(stdout)
        mccabe.append(metrics_data['files'][f"temp_file.{ext}"]['mccabe'])
        sloc.append(metrics_data['files'][f"temp_file.{ext}"]['sloc'])
        comments.append(metrics_data['files'][f"temp_file.{ext}"]['comments'])
        ratio.append(metrics_data['files'][f"temp_file.{ext}"]['ratio_comment_to_code'])
#         print(mccabe_data['files'][f'temp_file.{LANGUAGE}'].keys())
        os.remove(f"temp_file.{ext}")
    df["complexity"] = mccabe
    df["code_lines"] = sloc
    df["comments"] = comments
    df["ratio_comment_to_code"] = ratio
    return df

# Analysis & Visualization

In [10]:
# Retrieve the data
corpus_df = simulate_getting_dataframes_from_mongo()
src_df = corpus_df.loc[corpus_df['data_type'] == 'src']
test_df = corpus_df.loc[corpus_df['data_type'] == 'test']

total_file_count = len(corpus_df)
src_files_count = len(src_df)
test_files_count = len(test_df)
print("Source Code Files:", src_files_count, "(" + str(round(src_files_count/total_file_count, 4)*100) + "%)")
print("Test Code Files:", test_files_count, "(" + str(round(test_files_count/total_file_count, 4)*100) + "%)")
print("Total files:", total_file_count)

Source Code Files: 14 (40.0%)
Test Code Files: 21 (60.0%)
Total files: 35


In [0]:
# Generate the SP Models
model = gen_sp_model(corpus_df, output='LibEST', model_name='_sp_bpe_modal', cols=['contents'])

In [12]:
corpus_df = add_length_col(corpus_df, 'contents', 'code_length')
stat_metrics(corpus_df, 'code_length')

~~Printing metrics for all data in column [code_length]~~
Min = 178
Max = 149212
Average = 29901.8857
Median = 28111
Standard Deviation = 26290.3033
0.95 of data points fall between 20739.0036 and 39064.7678


In [13]:
print(corpus_df)

             file_name  ... code_length
0                est.c  ...       50650
1    est_server_http.h  ...         332
2    est_server_http.c  ...       58103
3      est_ossl_util.h  ...         214
4         est_server.c  ...       73443
5   est_client_proxy.h  ...         178
6    est_client_http.c  ...       33067
7           est_locl.h  ...        3189
8         est_client.c  ...      149212
9                est.h  ...       26660
10  est_client_proxy.c  ...       16838
11         est_proxy.c  ...       47546
12        est_server.h  ...        1237
13     est_ossl_util.c  ...        8931
14             us903.c  ...       21959
15            us1883.c  ...       34902
16             us897.c  ...       38280
17             us896.c  ...       10526
18            us3512.c  ...       32055
19             us894.c  ...       38371
20             us901.c  ...       26909
21            us1005.c  ...       19276
22             us893.c  ...       34107
23             us900.c  ...       32706


In [0]:
# print(corpus_df['file_name'][2].split('.')[-1])
corpus_df = add_mccabe_metrics(corpus_df, 'contents')

In [25]:
print(corpus_df.head(8)['file_name'])
print(corpus_df.head(8)['complexity'])

0                 est.c
1     est_server_http.h
2     est_server_http.c
3       est_ossl_util.h
4          est_server.c
5    est_client_proxy.h
6     est_client_http.c
7            est_locl.h
Name: file_name, dtype: object
0    174
1      0
2    217
3      0
4    249
5      0
6    191
7      0
Name: complexity, dtype: int64


In [0]:
# for i in range(len(corpus_df)):
#     if corpus_df["file_name"][i].split('.')[-1] == 'h':
#         print(corpus_df['contents'][i])

Collecting lizard
[?25l  Downloading https://files.pythonhosted.org/packages/b4/2c/d6f6a5507cfa685535731428ee3b32b4d7648a5b8c10b68b85de3cdbb649/lizard-1.17.3-py2.py3-none-any.whl (59kB)
[K     |█████▌                          | 10kB 15.9MB/s eta 0:00:01[K     |███████████                     | 20kB 1.8MB/s eta 0:00:01[K     |████████████████▍               | 30kB 2.1MB/s eta 0:00:01[K     |██████████████████████          | 40kB 1.7MB/s eta 0:00:01[K     |███████████████████████████▍    | 51kB 1.9MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 1.8MB/s 
[?25hInstalling collected packages: lizard
Successfully installed lizard-1.17.3


# Scratch Code (Testing)

In [0]:
print(round(1236721.8237468732, 3))

1236721.824


In [0]:
!pip install lizard

In [0]:
import subprocess
import json
def add_lizard_mccabe_metrics(df, col):
    mccabe = []
    sloc = []
    comments = []
    ratio = []
    for i in range(len(df)):
        if i == 99:
            print("At file #100", end="")
        if ((i + 1) % 200) == 0:
            print(", file #" + str(i + 1), end="")
        ext = df["file_name"][i].split('.')[-1]
        if ext == 'h':
            ext = 'c'
        with open(f"temp_file.{ext}", "w", encoding='UTF-8') as fp:
            fp.write(df[col][i])
        out = subprocess.Popen(['metrics', '-q', '--format=json', f"temp_file.{ext}"], 
                               stdout=subprocess.PIPE, 
                               stderr=subprocess.STDOUT)
        stdout,stderr = out.communicate()
        metrics_data = json.loads(stdout)
        mccabe.append(metrics_data['files'][f"temp_file.{ext}"]['mccabe'])
        sloc.append(metrics_data['files'][f"temp_file.{ext}"]['sloc'])
        comments.append(metrics_data['files'][f"temp_file.{ext}"]['comments'])
        ratio.append(metrics_data['files'][f"temp_file.{ext}"]['ratio_comment_to_code'])
#         print(mccabe_data['files'][f'temp_file.{LANGUAGE}'].keys())
        os.remove(f"temp_file.{ext}")
    df["complexity"] = mccabe
    df["code_lines"] = sloc
    df["comments"] = comments
    df["ratio_comment_to_code"] = ratio
    return df

In [34]:
!lizard source_code/est.c

  NLOC    CCN   token  PARAM  length  location  
------------------------------------------------
       7      1     35      2      11 est_logger_stderr@9-19@source_code/est.c
      11      2     53      2      16 est_log_msg@27-42@source_code/est.c
      14      3     66      3      24 est_log@46-69@source_code/est.c
      18      2    154      1      23 printStackTrace@73-95@source_code/est.c
      18      5     91      1      31 est_log_backtrace@99-129@source_code/est.c
       3      1      9      1       3 est_get_version@136-138@source_code/est.c
       3      1      9      1       3 est_get_api_level@148-150@source_code/est.c
       7      2     43      1       9 est_log_version@156-164@source_code/est.c
      10      2     46      2      16 est_init_logger@181-196@source_code/est.c
       4      1     11      1       4 est_enable_backtrace@213-216@source_code/est.c
      42      9    202      3      52 est_read_x509_request@239-290@source_code/est.c
      29      5    135     