In [None]:
# default_exp traceability.unsupervised.eda

# Exploratory Data Analysis for Software Traceability [EDA]
> Adapted from CodeSearchNet Challenge

In [None]:
import json

import pandas as pd
from pathlib import Path
pd.set_option('max_colwidth',300)
from pprint import pprint
import re

In [None]:
#hide
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
#export
# Record fields kept by default when loading the jsonl files.
columns_long_list = ['repo', 'path', 'url', 'code',
                     'code_tokens', 'docstring', 'docstring_tokens',
                     'language', 'partition']

# Reduced projection: only the tokenised fields plus language and partition.
columns_short_list = ['code_tokens', 'docstring_tokens',
                      'language', 'partition']


def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    param file_list: iterable of paths to gzip-compressed JSON-lines files
    param columns: names of the columns to keep from every file, in this
        order (defaults to `columns_long_list`)
    returns: one DataFrame holding the rows of every file, in file order.
        Each file keeps its own row labels, so index values repeat across
        files. An empty `file_list` gives an empty DataFrame that still has
        the requested columns.
    raises: KeyError if a file does not contain one of the requested columns
    """
    frames = [
        pd.read_json(path, orient='records', compression='gzip', lines=True)[columns]
        for path in file_list
    ]
    if not frames:
        # pd.concat refuses an empty list ("No objects to concatenate");
        # hand back an empty frame with the expected schema instead.
        return pd.DataFrame(columns=list(columns))
    return pd.concat(frames, sort=False)

In [None]:
#export

def valid_register(code):
    """Check whether a code value is a usable (string) register.

    param code: the value to check; may be of any type
    returns: True if `code` is a string (including subclasses of `str`),
        False otherwise, e.g. for None or a float NaN
    """
    # isinstance instead of an exact type comparison, so that str subclasses
    # are accepted as well.
    return isinstance(code, str)

In [None]:
#export
def get_valid_code_df(code_df, column):
    """Keep only the rows of a DataFrame whose value in `column` is valid.

    param code_df: pandas DataFrame to filter
    param column: name of the column whose values are checked with
        `valid_register`
    returns: the rows of `code_df` for which `valid_register` returned True
        (original row labels and columns are kept)
    """
    keep_row = code_df[column].apply(valid_register)
    return code_df[keep_row]

In [None]:
!pip install fastprogress

Collecting fastprogress
  Downloading fastprogress-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: fastprogress
Successfully installed fastprogress-1.0.0


## Exploratory analysis

In [None]:
# export
# Imports
import dit
import math
import os
import logging

import matplotlib.pyplot as plt
import pandas as pd
import sentencepiece as sp

from collections import Counter
from pathlib import Path
from scipy.stats import sem, t
from statistics import mean, median, stdev
from tqdm.notebook import tqdm

# ds4se
from ds4se.mgmnt.prep.bpe import *
from ds4se.exp.info import *
from ds4se.desc.stats import *

## Descriptive metrics

In [None]:
#export
from ds4se.desc.metrics import *
from ds4se.desc.metrics.java import *
import lizard
import chardet

In [None]:
#export
def add_method_mccabe_metrics_to_code_df(src_code_df, code_column):
    """Compute method-level McCabe metrics with lizard and add them as columns.

    Every value of `code_column` is passed to lizard, and the metrics of the
    first function lizard reports for that value are stored.

    param src_code_df: the DataFrame to modify; the columns are added in place
    param code_column: name of the column that holds the source code
    returns: the same `src_code_df` object with the added columns
        'cyclomatic_complexity', 'nloc', 'parameter_count', 'method_name'
        and 'token_count'. Rows for which lizard reports no function get
        missing values in all five columns.
    """
    # (column added to the DataFrame, attribute read from lizard's function info)
    metric_fields = (
        ('cyclomatic_complexity', 'cyclomatic_complexity'),
        ('nloc', 'nloc'),
        ('parameter_count', 'parameter_count'),
        ('method_name', 'name'),
        ('token_count', 'token_count'),
    )
    collected = {column: [] for column, _ in metric_fields}

    for source in src_code_df[code_column]:
        # NOTE(review): the '.java' file name is presumably what makes lizard
        # parse the snippet as Java - confirm against the lizard docs.
        analysis = lizard.analyze_file.analyze_source_code('java_file.java', source)
        functions = analysis.function_list
        # A snippet without any detectable function used to raise IndexError
        # here; record missing values for it instead.
        first_function = functions[0] if functions else None
        for column, attribute in metric_fields:
            if first_function is None:
                collected[column].append(None)
            else:
                collected[column].append(getattr(first_function, attribute))

    # Lists are assigned positionally, one entry per row, in the original column order.
    for column, _ in metric_fields:
        src_code_df[column] = collected[column]

    return src_code_df

In [None]:
#export
import seaborn as sns
import numpy as np

In [None]:
#export
'''
Generates a heatmap
param 1: x values of data to map
param 2: y values of data to map
param 3: kwargs
'''
def heatmap(x, y, **kwargs):
    """Draw a heatmap as a grid of markers whose colour and size encode values.

    Each (x[i], y[i]) pair is one cell. Cells are placed on an integer grid
    and drawn with a single `scatter` call on a new subplot created through
    pyplot; a colour legend bar is added to the right when the colour range
    is not degenerate.

    param x: labels of the cells along the horizontal axis (one per cell)
    param y: labels of the cells along the vertical axis (one per cell)
    param kwargs: optional settings
        color       - one value per cell, mapped onto `palette` (default: all 1)
        palette     - list of colours (default: 256 colours of seaborn "Blues")
        color_range - (min, max) of the values mapped onto the palette
                      (default: min and max of `color`)
        size        - one value per cell controlling the marker size (default: all 1)
        size_range  - (min, max) of the values mapped onto marker sizes
                      (default: min and max of `size`)
        size_scale  - scatter size given to a value at the top of `size_range` (default 500)
        marker      - marker style passed to scatter (default 's')
        x_order     - labels in the order they should appear on the x axis
                      (default: sorted unique values of `x`)
        y_order     - same for the y axis
        xlabel      - x axis label (default '')
        ylabel      - y axis label (default '')
        Any other keyword is forwarded unchanged to `ax.scatter`.
    returns: None
    raises: KeyError if `x` / `y` contain a label missing from `x_order` / `y_order`
    """
    if 'color' in kwargs:
        color = kwargs['color']
    else:
        color = [1]*len(x)

    if 'palette' in kwargs:
        palette = kwargs['palette']
        n_colors = len(palette)
    else:
        n_colors = 256 # Use 256 colors for the default "Blues" palette
        palette = sns.color_palette("Blues", n_colors) 

    if 'color_range' in kwargs:
        color_min, color_max = kwargs['color_range']
    else:
        color_min, color_max = min(color), max(color) # Range of values that will be mapped to the palette; defaults to the observed min and max of `color`

    # Map a colour value to an entry of the palette.
    def value_to_color(val):
        if color_min == color_max:
            # Degenerate range: every cell gets the last palette colour
            return palette[-1]
        else:
            val_position = float((val - color_min)) / (color_max - color_min) # position of value in the input range, relative to the length of the input range
            val_position = min(max(val_position, 0), 1) # bound the position between 0 and 1
            ind = int(val_position * (n_colors - 1)) # target index in the color palette
            return palette[ind]

    if 'size' in kwargs:
        size = kwargs['size']
    else:
        size = [1]*len(x)

    if 'size_range' in kwargs:
        size_min, size_max = kwargs['size_range'][0], kwargs['size_range'][1]
    else:
        size_min, size_max = min(size), max(size)

    size_scale = kwargs.get('size_scale', 500)

    # Map a size value to the `s` argument of scatter.
    def value_to_size(val):
        if size_min == size_max:
            # Degenerate range: every marker gets the full size_scale
            return 1 * size_scale
        else:
            # The 0.99 factor and 0.01 offset keep the smallest value at 1% of size_scale instead of 0
            val_position = (val - size_min) * 0.99 / (size_max - size_min) + 0.01 # position of value in the input range, relative to the length of the input range
            val_position = min(max(val_position, 0), 1) # bound the position between 0 and 1
            return val_position * size_scale
    # Translate the x / y labels into integer grid positions
    if 'x_order' in kwargs: 
        x_names = [t for t in kwargs['x_order']]
    else:
        x_names = [t for t in sorted(set([v for v in x]))]
    x_to_num = {p[1]:p[0] for p in enumerate(x_names)}

    if 'y_order' in kwargs: 
        y_names = [t for t in kwargs['y_order']]
    else:
        y_names = [t for t in sorted(set([v for v in y]))]
    y_to_num = {p[1]:p[0] for p in enumerate(y_names)}

    plot_grid = plt.GridSpec(1, 15, hspace=0.2, wspace=0.1) # Setup a 1x15 grid
    ax = plt.subplot(plot_grid[:,:-1]) # Use the left 14/15ths of the grid for the main plot

    marker = kwargs.get('marker', 's')

    # Everything that is not one of this function's own options goes straight to scatter
    kwargs_pass_on = {k:v for k,v in kwargs.items() if k not in [
         'color', 'palette', 'color_range', 'size', 'size_range', 'size_scale', 'marker', 'x_order', 'y_order', 'xlabel', 'ylabel'
    ]}

    ax.scatter(
        x=[x_to_num[v] for v in x],
        y=[y_to_num[v] for v in y],
        marker=marker,
        s=[value_to_size(v) for v in size], 
        c=[value_to_color(v) for v in color],
        **kwargs_pass_on
    )
    # Major ticks sit on the cell centres and carry the labels
    ax.set_xticks([v for k,v in x_to_num.items()])
    ax.set_xticklabels([k for k in x_to_num], rotation=45, horizontalalignment='right')
    ax.set_yticks([v for k,v in y_to_num.items()])
    ax.set_yticklabels([k for k in y_to_num])

    # Grid lines are drawn on minor ticks, shifted by 0.5 so they fall between cells
    ax.grid(False, 'major')
    ax.grid(True, 'minor')
    ax.set_xticks([t + 0.5 for t in ax.get_xticks()], minor=True)
    ax.set_yticks([t + 0.5 for t in ax.get_yticks()], minor=True)

    # Half a cell of margin around the outermost cells
    ax.set_xlim([-0.5, max([v for v in x_to_num.values()]) + 0.5])
    ax.set_ylim([-0.5, max([v for v in y_to_num.values()]) + 0.5])
    ax.set_facecolor('#F1F1F1')

    ax.set_xlabel(kwargs.get('xlabel', ''))
    ax.set_ylabel(kwargs.get('ylabel', ''))

    # Add color legend on the right side of the plot
    if color_min < color_max:
        ax = plt.subplot(plot_grid[:,-1]) # Use the rightmost column of the plot

        col_x = [0]*len(palette) # Fixed x coordinate for the bars
        bar_y=np.linspace(color_min, color_max, n_colors) # y coordinates for each of the n_colors bars

        bar_height = bar_y[1] - bar_y[0]
        ax.barh(
            y=bar_y,
            width=[5]*len(palette), # Make bars 5 units wide
            left=col_x, # Make bars start at 0
            height=bar_height,
            color=palette,
            linewidth=0
        )
        ax.set_xlim(1, 2) # Bars are going from 0 to 5, so lets crop the plot somewhere in the middle
        ax.grid(False) # Hide grid
        ax.set_facecolor('white') # Make background white
        ax.set_xticks([]) # Remove horizontal ticks
        ax.set_yticks(np.linspace(min(bar_y), max(bar_y), 3)) # Show vertical ticks for min, middle and max
        ax.yaxis.tick_right() # Show vertical ticks on the right 

    

In [None]:
#export
def corrplot(data, size_scale=500, marker='s'):
    """Plot a correlation matrix with `heatmap`.

    Cell colour encodes the signed value on a fixed [-1, 1] range and cell
    size encodes its absolute value on a fixed [0, 1] range. Missing values
    are drawn as 0.

    param data: the matrix to plot, as a DataFrame whose row and column
        labels are the same set of names (e.g. the result of `DataFrame.corr()`)
    param size_scale: marker size used for a value of magnitude 1
        (forwarded to `heatmap`)
    param marker: marker style of the cells (forwarded to `heatmap`)
    returns: None; the figure is drawn by `heatmap`
    """
    n_rows, n_cols = data.shape
    # Long form of the matrix, one row per cell, column by column (the order
    # pd.melt would produce). Built positionally so the result does not depend
    # on the index being unnamed or on no column being called 'index'/'value',
    # which made melt(reset_index(), id_vars='index') fail.
    corr = pd.DataFrame({
        'x': np.tile(data.index.to_numpy(), n_cols),
        'y': np.repeat(data.columns.to_numpy(), n_rows),
        'value': data.to_numpy().ravel(order='F'),
    }).replace(np.nan, 0)
    heatmap(
        corr['x'], corr['y'],
        color=corr['value'], color_range=[-1, 1],
        palette=sns.diverging_palette(20, 220, n=256),
        size=corr['value'].abs(), size_range=[0,1],
        marker=marker,
        x_order=data.columns,
        # reversed so that the first label ends up at the top of the y axis
        y_order=data.columns[::-1],
        size_scale=size_scale
    )