In [0]:
# default_exp data.exploratory.se.metrics

# Exploration of your data

> This module comprises some of the statistical and inference techniques to describe the inner properties of software data. The submodules might include:
>
> - Descriptive statistics
> - Software Metrics
> - Information Theory
> - Learning Principels Detection (Occams' Razor, Biased data, and Data Snooping)
> - Inference: Probabilistic and Causal

### Specifically in this module

> - Cyclomatic complexity (CYCLO)
> - Coupling Between Objects (CBO)
> - Lack of Cohesion of Methods 5 (LCOM5)
> - Top/Least 20 Occurring Tokens
> - Percentage of redundancy (just Java)
> - How many Type I and Type II clones are in Java datasets?

In [0]:
# #hide
# from nbdev.showdoc import *

In [11]:
!pip install sentencepiece
!pip install metrics
!pip install lizard



In [0]:
# export
# Imports
import pandas as pd
import sentencepiece as sp
from numpy import mean, std
from statistics import median
from scipy.stats import sem, t
import lizard

# TODO: Remove when mongo call is implemented
import os

In [0]:
# TODO: Replace with actual mongo call
def simulate_getting_dataframes_from_mongo():
    corpus_data = {"system": [], "name": [], "ground_truth": [], "contents": []}
    path = "./source_code"
    for file in os.listdir(path):
        corpus_data["system"].append(None)
        corpus_data["name"].append(file)
        corpus_data["ground_truth"].append("src")
        with open (os.path.join(path, file), "r") as f:
            corpus_data['contents'].append(f.read())
    path = "./tests"
    for file in os.listdir(path):
        corpus_data["system"].append(None)
        corpus_data["name"].append(file)
        corpus_data["ground_truth"].append("test")
        with open (os.path.join(path, file), "r") as f:
            corpus_data["contents"].append(f.read())
    corpus_df = pd.DataFrame(data = corpus_data)
    return corpus_df

In [0]:
# export
# TODO: Change based on format of input data from Mongo
def df_to_txt_file(df, output, cols):
    """Converts a dataframe into a text file that SentencePiece can use to train a BPE model"""
    if cols is None: cols = list(df.columns)
    merged_df = pd.concat([df[col] for col in cols])
    
    with open(output + '_text.txt', 'w') as f:
        f.write('\n'.join(list(merged_df)))
    return output + '_text.txt'

In [0]:
# export
# TODO: Change based on format of input data from Mongo
def gen_sp_model(df, output, model_name, cols=None):
    """Trains a SentencePiece BPE model from a pandas dataframe"""
    fname = df_to_txt_file(df, output, cols)
    sp.SentencePieceTrainer.train(f'--input={fname} --model_prefix={output + model_name} --hard_vocab_limit=false --model_type=bpe')
    return output + model_name

In [0]:
# export
def add_length_col(df, col, length_col_name='length'):
    """Adds a length column with the length of entries in the specified column of a dataframe"""
    df[length_col_name] = df[col].astype(str).apply(len)
    return df

In [0]:
# export
def stat_metrics(df, col, data_types=None, conf = 0.95, sig_figs = 4):
    """Computes statistical metrics about the entries in a dataframe column"""
    if data_types != None: 
        df = df[df['ground_truth'].isin(data_types)]
        print(f"~~Printing metrics for {data_types} in column [{col}]~~")
    else:
        print(f"~~Printing metrics for all data in column [{col}]~~")

    print("Min =", round(df[col].min(), sig_figs))
    print("Max =", round(df[col].max(), sig_figs))
    print("Average =", round(df[col].mean(), sig_figs))
    print("Median =", round(median(df[col]), sig_figs))
    print("Standard Deviation =", round(std(df[col]), sig_figs))
    
    n = len(df[col])
    m = mean(df[col])
    std_err = sem(df[col])
    h = std_err * t.ppf((1 + conf) / 2, n - 1)

    start = m - h
    end = m + h
    print(f"{conf} of data points fall between {round(start, sig_figs)} and {round(end, sig_figs)}")

In [0]:
# export
def add_mccabe_metrics(df, col):
    """"""
    ccn = []
    nloc = []
    # comments = []
    # ratio = []
    for i in range(len(df)):
        metrics = lizard.analyze_file.analyze_source_code(df["name"][i], df["contents"][i])
        ccn.append(metrics.CCN)
        nloc.append(metrics.nloc)
        # comments.append
    df["ccn"] = ccn
    df["nloc"] = nloc
    # df["comments"] = comments
    # df["sc_comm_ratio"] = ratio
    return df

# Analysis & Visualization

In [18]:
# Retrieve the data
corpus_df = simulate_getting_dataframes_from_mongo()
src_df = corpus_df.loc[corpus_df["ground_truth"] == 'src']
test_df = corpus_df.loc[corpus_df["ground_truth"] == 'test']

total_file_count = len(corpus_df)
src_files_count = len(src_df)
test_files_count = len(test_df)
print("Source Code Files:", src_files_count, "(" + str(round(src_files_count/total_file_count, 4)*100) + "%)")
print("Test Code Files:", test_files_count, "(" + str(round(test_files_count/total_file_count, 4)*100) + "%)")
print("Total files:", total_file_count)

Source Code Files: 14 (40.0%)
Test Code Files: 21 (60.0%)
Total files: 35


In [0]:
# Generate the SP Models
model = gen_sp_model(corpus_df, output='LibEST', model_name='_sp_bpe_modal', cols=['contents'])

In [20]:
corpus_df = add_length_col(corpus_df, 'contents', 'code_length')
stat_metrics(corpus_df, 'code_length')

~~Printing metrics for all data in column [code_length]~~
Min = 178
Max = 149212
Average = 29901.8857
Median = 28111
Standard Deviation = 26290.3033
0.95 of data points fall between 20739.0036 and 39064.7678


In [0]:
corpus_df = add_mccabe_metrics(corpus_df, 'contents')

In [35]:
stat_metrics(corpus_df, 'ccn')
stat_metrics(corpus_df, 'nloc')

~~Printing metrics for all data in column [ccn]~~
Min = 0
Max = 516
Average = 83.0
Median = 47
Standard Deviation = 104.228
0.95 of data points fall between 46.6737 and 119.3263
~~Printing metrics for all data in column [nloc]~~
Min = 3
Max = 2353
Average = 516.4
Median = 411
Standard Deviation = 451.1161
0.95 of data points fall between 359.1738 and 673.6262


In [37]:
pd.set_option('display.expand_frame_repr', False)
print(corpus_df)

   system                name ground_truth                                           contents  code_length  ccn  nloc
0    None               est.c          src  static void (*est_log_func)(char *, va_list) =...        50650  204  1005
1    None   est_server_http.h          src  // MINGW typedefs pid_t to int. Using #define ...          332    0     3
2    None   est_server_http.c          src  static pthread_t pthread_self (void)\n{\n    r...        58103  302  1203
3    None     est_ossl_util.h          src  int ossl_verify_cb(int ok, X509_STORE_CTX *ctx...          214    0     4
4    None        est_server.c          src  /*\n * This function sends EST specific HTTP e...        73443  274  1348
5    None  est_client_proxy.h          src  tcw_err_t tcw_connect(tcw_sock_t *sock, tcw_op...          178    0     3
6    None   est_client_http.c          src  /*\n** signed long to signed int\n*/\nint curl...        33067  214   710
7    None          est_locl.h          src  LIBEST_TEST_

In [0]:
"https://github.com/priv-kweihmann/multimetric"


In [0]:
"https://books.google.com/books?id=DxuGi5h2-HEC&pg=PA140&lpg=PA140&dq=do+header+files+in+c+have+a+cyclomatic+complexity&source=bl&ots=0WxxjeX9rR&sig=ACfU3U1QnoKnwrsj4YbSQxATeaoYMvBWwg&hl=en&sa=X&ved=2ahUKEwijtqKxhJXoAhUSXa0KHctQD38Q6AEwAnoECAkQAQ#v=onepage&q=do%20header%20files%20in%20c%20have%20a%20cyclomatic%20complexity&f=false"
"The abover link goes to an article discussing cyclomatic complexity and c header files"

# Scratch Code (Testing)

In [0]:
print(round(1236721.8237468732, 3))

1236721.824


In [0]:
!pip install lizard

In [0]:
import subprocess
import json
def add_lizard_mccabe_metrics(df, col):
    mccabe = []
    sloc = []
    comments = []
    ratio = []
    for i in range(len(df)):
        if i == 99:
            print("At file #100", end="")
        if ((i + 1) % 200) == 0:
            print(", file #" + str(i + 1), end="")
        ext = df["file_name"][i].split('.')[-1]
        if ext == 'h':
            ext = 'c'
        with open(f"temp_file.{ext}", "w", encoding='UTF-8') as fp:
            fp.write(df[col][i])
        out = subprocess.Popen(['metrics', '-q', '--format=json', f"temp_file.{ext}"], 
                               stdout=subprocess.PIPE, 
                               stderr=subprocess.STDOUT)
        stdout,stderr = out.communicate()
        metrics_data = json.loads(stdout)
        mccabe.append(metrics_data['files'][f"temp_file.{ext}"]['mccabe'])
        sloc.append(metrics_data['files'][f"temp_file.{ext}"]['sloc'])
        comments.append(metrics_data['files'][f"temp_file.{ext}"]['comments'])
        ratio.append(metrics_data['files'][f"temp_file.{ext}"]['ratio_comment_to_code'])
#         print(mccabe_data['files'][f'temp_file.{LANGUAGE}'].keys())
        os.remove(f"temp_file.{ext}")
    df["complexity"] = mccabe
    df["code_lines"] = sloc
    df["comments"] = comments
    df["ratio_comment_to_code"] = ratio
    return df

In [0]:
!lizard source_code/est.c

  NLOC    CCN   token  PARAM  length  location  
------------------------------------------------
       7      1     35      2      11 est_logger_stderr@9-19@source_code/est.c
      11      2     53      2      16 est_log_msg@27-42@source_code/est.c
      14      3     66      3      24 est_log@46-69@source_code/est.c
      18      2    154      1      23 printStackTrace@73-95@source_code/est.c
      18      5     91      1      31 est_log_backtrace@99-129@source_code/est.c
       3      1      9      1       3 est_get_version@136-138@source_code/est.c
       3      1      9      1       3 est_get_api_level@148-150@source_code/est.c
       7      2     43      1       9 est_log_version@156-164@source_code/est.c
      10      2     46      2      16 est_init_logger@181-196@source_code/est.c
       4      1     11      1       4 est_enable_backtrace@213-216@source_code/est.c
      42      9    202      3      52 est_read_x509_request@239-290@source_code/est.c
      29      5    135     

In [0]:
!lizard thing.py

  NLOC    CCN   token  PARAM  length  location  
------------------------------------------------
       8      3     35      0      10 func@1-10@thing.py
1 file analyzed.
NLOC    Avg.NLOC  AvgCCN  Avg.token  function_cnt    file
--------------------------------------------------------------
      8       8.0     3.0       35.0         1     thing.py

No thresholds exceeded (cyclomatic_complexity > 15 or length > 1000 or nloc > 1000000 or parameter_count > 100)
------------------------------------------------------------------------------------------
         8       8.0     3.0       35.0        1            0      0.00    0.00


In [0]:
!metrics source_code/est.c

Metrics Summary:
Files                       Language        SLOC Comment McCabe 
----- ------------------------------ ----------- ------- ------ 
    1                         Python           9       0      3 
----- ------------------------------ ----------- ------- ------ 
    1                          Total           9       0      3 


In [0]:
import lizard

In [0]:
i = lizard.analyze_file("source_code/est.c")

In [0]:
print(i.CCN)

204


In [0]:
print(i.__dict__)

{'filename': 'source_code/est.c', 'nloc': 1005, 'function_list': [<lizard.FunctionInfo object at 0x7f137fc5fc18>, <lizard.FunctionInfo object at 0x7f137fc5fd68>, <lizard.FunctionInfo object at 0x7f137fc5fb00>, <lizard.FunctionInfo object at 0x7f137fc5ff28>, <lizard.FunctionInfo object at 0x7f137fc5ff98>, <lizard.FunctionInfo object at 0x7f137fc5ffd0>, <lizard.FunctionInfo object at 0x7f137fc5f978>, <lizard.FunctionInfo object at 0x7f137fc6b898>, <lizard.FunctionInfo object at 0x7f137fc6b080>, <lizard.FunctionInfo object at 0x7f137fc6b0f0>, <lizard.FunctionInfo object at 0x7f137fc6b160>, <lizard.FunctionInfo object at 0x7f137fc6b128>, <lizard.FunctionInfo object at 0x7f137fc6b2b0>, <lizard.FunctionInfo object at 0x7f137fc6b208>, <lizard.FunctionInfo object at 0x7f137fc6b3c8>, <lizard.FunctionInfo object at 0x7f137fc6b470>, <lizard.FunctionInfo object at 0x7f137fc6b550>, <lizard.FunctionInfo object at 0x7f137fc6b5c0>, <lizard.FunctionInfo object at 0x7f137fc6b668>, <lizard.FunctionInfo o

In [0]:
for i in range(len(i.function_list)):
  

SyntaxError: ignored

In [0]:
print(i.function_list[0].__dict__)

{'cyclomatic_complexity': 3, 'nloc': 8, 'token_count': 35, 'name': 'func', 'long_name': 'func( )', 'start_line': 1, 'end_line': 10, 'parameters': [], 'filename': 'thing.py', 'top_nesting_level': 0, 'length': 10, 'fan_in': 0, 'fan_out': 0, 'general_fan_out': 0}


In [0]:
print(len(i.function_list))

37


In [0]:
i

<lizard.FileInformation at 0x7f137fc5fc88>

In [0]:
!metrics thing.py

Metrics Summary:
Files                       Language        SLOC Comment McCabe 
----- ------------------------------ ----------- ------- ------ 
    1                         Python           3      18      1 
----- ------------------------------ ----------- ------- ------ 
    1                          Total           3      18      1 


In [25]:
thing = lizard.analyze_file("source_code/est.c")
print(thing.__dict__)
print(thing.CCN)

{'filename': 'source_code/est.c', 'nloc': 1005, 'function_list': [<lizard.FunctionInfo object at 0x7f5ed8fa9320>, <lizard.FunctionInfo object at 0x7f5ed8fa9f60>, <lizard.FunctionInfo object at 0x7f5ed8fa9518>, <lizard.FunctionInfo object at 0x7f5ed8fa9550>, <lizard.FunctionInfo object at 0x7f5ed8fa9630>, <lizard.FunctionInfo object at 0x7f5ed8fa94a8>, <lizard.FunctionInfo object at 0x7f5ed8fa9710>, <lizard.FunctionInfo object at 0x7f5ed8fa9668>, <lizard.FunctionInfo object at 0x7f5ed8fa95f8>, <lizard.FunctionInfo object at 0x7f5ed8fa9160>, <lizard.FunctionInfo object at 0x7f5ed8fa97f0>, <lizard.FunctionInfo object at 0x7f5ed8fa98d0>, <lizard.FunctionInfo object at 0x7f5ed8fa99e8>, <lizard.FunctionInfo object at 0x7f5ed8fa9860>, <lizard.FunctionInfo object at 0x7f5ed8fa9ac8>, <lizard.FunctionInfo object at 0x7f5ed8fa9b70>, <lizard.FunctionInfo object at 0x7f5ed8fa9c88>, <lizard.FunctionInfo object at 0x7f5ed8fa9cc0>, <lizard.FunctionInfo object at 0x7f5ed8fa9c50>, <lizard.FunctionInfo o

In [26]:
thing.filename

'source_code/est.c'

In [28]:
thing.nloc

1005

In [32]:
print(thing.functions_average)

<bound method FileInformation.functions_average of <lizard.FileInformation object at 0x7f5ed935c978>>


In [0]:
i = lizard.analyze_file.analyze_source_code("AllTests.cpp", "int foo(){}")

In [0]:
i.__dict__

{'filename': 'AllTests.cpp',
 'function_list': [<lizard.FunctionInfo at 0x7f137fbacc18>],
 'nloc': 1,
 'token_count': 6}

In [0]:
i.CCN

1

In [0]:
lizard.analyze_file.analyze_source_code("AllTests.py", "def foo():\n\tif (True): \n\tprint('HI')").CCN

2

In [0]:
import subprocess
import json
def add_mccabe_metrics(df, col):
    mccabe = []
    sloc = []
    comments = []
    ratio = []
    for i in range(len(df)):
        if i == 99:
            print("At file #100", end="")
        if ((i + 1) % 200) == 0:
            print(", file #" + str(i + 1), end="")
        ext = df["file_name"][i].split('.')[-1]
        if ext == 'h':
            ext = 'c'
        with open(f"temp_file.{ext}", "w", encoding='UTF-8') as fp:
            fp.write(df[col][i])
        out = subprocess.Popen(['metrics', '-q', '--format=json', f"temp_file.{ext}"], 
                               stdout=subprocess.PIPE, 
                               stderr=subprocess.STDOUT)
        stdout,stderr = out.communicate()
        metrics_data = json.loads(stdout)
        mccabe.append(metrics_data['files'][f"temp_file.{ext}"]['mccabe'])
        sloc.append(metrics_data['files'][f"temp_file.{ext}"]['sloc'])
        comments.append(metrics_data['files'][f"temp_file.{ext}"]['comments'])
        ratio.append(metrics_data['files'][f"temp_file.{ext}"]['ratio_comment_to_code'])
#         print(mccabe_data['files'][f'temp_file.{LANGUAGE}'].keys())
        os.remove(f"temp_file.{ext}")
    df["complexity"] = mccabe
    df["code_lines"] = sloc
    df["comments"] = comments
    df["ratio_comment_to_code"] = ratio
    return df