In [1]:
#Needed Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import re

import os
import sys

from tqdm import tqdm

import json
from pathlib import Path

import matplotlib.pyplot as plt

# Printed once the imports above have run.
print("Starting Notebook.")

# Seaborn defaults applied globally: font scale 1.25 and the "whitegrid" style.
sns.set(font_scale = 1.25)
sns.set_style("whitegrid")

Starting Notebook.


## Loading Data

In [2]:
# Project names. They are lower-cased to build the crawl and result file names.
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 
           'JFrog', 'Jira', 'JiraEcosystem', 
           'MariaDB', 'MongoDB', 'Qt', 
           'RedHat', 'Sakai', 'SecondLife', 
           'Sonatype', 'Spring']
# 'Mindville' is currently excluded from SOURCES.

# Configuration identifiers embedded in the result file names.
# NOTE(review): presumably LT = link type, NL = non-links, OL = other links - confirm.
CONFIG = ['R_LTvNL', 'R_LTvNLOL', 'R_LTOLvNL']
# Link-type tag embedded in the result file names.
LT = 'Duplication'

In [3]:
def print_linktypes(SOURCE):
    """Summarise the crawled issues and links of one project.

    Reads ``../data/crawl/issues_<source>.csv`` (``;``-separated) and
    ``../data/crawl/clean_links_<source>.csv`` (first column used as index),
    where ``<source>`` is ``SOURCE`` lower-cased. Despite its name the function
    prints nothing; it only returns the summary.

    Parameters
    ----------
    SOURCE : str
        Project name used to build both file names.

    Returns
    -------
    tuple
        ``(n_issues, n_links, n_linktypes, linked_share, n_duplicates)``:
        row count of the issues file, row count of the links file, number of
        distinct ``linktype`` values, the number of distinct ids found in
        ``issue_id_1``/``issue_id_2`` divided by the number of distinct
        ``issue_id`` values (rounded to 3 decimals; ``0.0`` when the issues
        file has no rows), and the number of links whose ``linktype`` equals
        ``'Duplicate'``.
    """
    # Loading issues
    filename = '../data/crawl/issues_'+SOURCE.lower()+'.csv'
    issues = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';')
    issue_set = set(issues['issue_id'])

    # Loading links
    filename = '../data/crawl/clean_links_'+SOURCE.lower()+'.csv'
    links = pd.read_csv(filename, encoding="UTF-8", low_memory=False, index_col=0)

    # Every issue id that takes part in at least one link (either end).
    # NOTE(review): link_set is not intersected with issue_set, so ids that
    # occur only in the links file would inflate the share - confirm that the
    # cleaned links reference crawled issues only.
    link_set = set(links['issue_id_1']).union(set(links['issue_id_2']))

    num_dups = len(links[links['linktype']=='Duplicate'])

    # An empty issues file would otherwise raise ZeroDivisionError.
    if issue_set:
        linked_share = round(len(link_set)/len(issue_set), 3)
    else:
        linked_share = 0.0

    return len(issues), len(links), len(links.linktype.unique()), linked_share, num_dups

In [5]:
# Build the per-project overview table.
# One record per project is collected first and the DataFrame is created once;
# this avoids enlarging the frame row by row (which leaves every column with
# object dtype) and keeps the loop variable untouched.
overview_rows = []
for s in SOURCES:
    # i: #issues, l: #links, ltu: #distinct link types,
    # pi: share of issues with links, nd: #duplicate links
    i, l, ltu, pi, nd = print_linktypes(s)

    # Shortened display label for the table only.
    label = 'JiraEco.' if s == 'JiraEcosystem' else s

    overview_rows.append([label, i, l, ltu, pi, nd])

overview = pd.DataFrame(
    overview_rows,
    columns=['Project', '#Issues', '#Links', '#Linktypes', '%IssuesWithLinks', '#NumDups'],
)

In [6]:
overview

Unnamed: 0,Project,#Issues,#Links,#Linktypes,%IssuesWithLinks,#NumDups
0,Apache,970929,242823,21,0.283,24868
1,Hyperledger,27914,16225,8,0.551,634
2,IntelDAOS,5557,3222,10,0.555,117
3,JFrog,14769,3206,11,0.298,639
4,Jira,265343,98122,19,0.477,21350
5,JiraEco.,40602,10911,18,0.328,1721
6,MariaDB,31229,14618,8,0.445,1374
7,MongoDB,90629,37545,13,0.426,6548
8,Qt,140237,35855,8,0.289,3827
9,RedHat,315797,106200,18,0.389,5436


## Loading Model Results

In [None]:
# Keep a project only if none of its metric files (one per configuration)
# contains a NaN in the three inspected rows.
valid_projects = []

for s in SOURCES:
    valid = True
    for c in CONFIG:
        filename = 'results_v1/sccnn_'+s.lower()+'_'+LT+'_'+c+'_metrics.csv'
        metrics_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False)

        # Rows 0, 1 and 2 of the file, each without its first column.
        LT_mets, NL_mets, OL_mets = [
            metrics_df.iloc[row].values.tolist()[1:] for row in (0, 1, 2)
        ]

        # Once a project has failed, later files are still read but no longer checked.
        if valid:
            valid = not np.isnan(LT_mets + OL_mets + NL_mets).any()

    if valid:
        valid_projects.append(s)

In [10]:
valid_projects

['Apache',
 'Hyperledger',
 'IntelDAOS',
 'JFrog',
 'Jira',
 'JiraEco.',
 'MariaDB',
 'MongoDB',
 'Qt',
 'RedHat',
 'Sakai',
 'Sonatype',
 'Spring']

In [None]:
def get_tpfptnfn(conf_mat):
    """Extract the four binary confusion counts from a confusion matrix.

    Only the rows labelled ``"DUPLICATION"`` and ``"NON-LINKS"`` and the
    entries ``0`` and ``1`` of each are read; any further rows are ignored.

    Returns
    -------
    tuple
        ``(tp, fp, tn, fn)`` where ``tp``/``fn`` are entries 1/0 of the
        ``"DUPLICATION"`` row and ``fp``/``tn`` are entries 1/0 of the
        ``"NON-LINKS"`` row.
    """
    duplication_row = conf_mat.loc["DUPLICATION"]
    non_link_row = conf_mat.loc["NON-LINKS"]

    fn, tp = duplication_row[0], duplication_row[1]
    tn, fp = non_link_row[0], non_link_row[1]

    return tp, fp, tn, fn

In [None]:
def _collapse_confmat(confmat_df, c):
    """Merge the OTHER-LINKS row of a three-row confusion matrix into a neighbour row.

    For ``c == 'R_LTOLvNL'`` the row is added to DUPLICATION, for every other
    configuration it is added to NON-LINKS. Only columns 0 and 1 are kept.
    """
    dup = confmat_df.loc["DUPLICATION"]
    other = confmat_df.loc["OTHER-LINKS"]
    non = confmat_df.loc["NON-LINKS"]

    if c == 'R_LTOLvNL':
        positive, negative = dup + other, non
    else:
        positive, negative = dup, other + non

    return pd.DataFrame([[positive.loc[0], positive.loc[1]],
                         [negative.loc[0], negative.loc[1]]],
                        index=['DUPLICATION', 'NON-LINKS'])


def _binary_metrics(tp, fp, tn, fn):
    """Compute accuracy plus per-class and averaged precision/recall/F1 from four counts."""
    d_pre = tp/(tp+fp)
    d_rec = tp/(tp+fn)
    d_f1 = 2*(d_pre*d_rec)/(d_pre+d_rec)

    nl_pre = tn/(tn+fn)
    nl_rec = tn/(tn+fp)
    nl_f1 = 2*(nl_pre*nl_rec)/(nl_pre+nl_rec)

    # Averaged scores: mean of the two class precisions / recalls; the F1 is
    # the harmonic mean of those two averages (not the mean of the class F1s).
    pre = (d_pre+nl_pre)/2
    rec = (d_rec+nl_rec)/2
    f1 = 2*(pre*rec)/(pre+rec)

    acc = (tp+tn)/(tp+fp+fn+tn)

    return {
        'ACC': acc,
        'Pre': pre,
        'Rec': rec,
        'F1': f1,
        'D_Pre': d_pre,
        'D_Rec': d_rec,
        'D_F1': d_f1,
        'NL_Pre': nl_pre,
        'NL_Rec': nl_rec,
        'NL_F1': nl_f1,
    }


def get_results(c, trad):
    """Score one configuration for every project in ``valid_projects``.

    For each project the confusion matrix is read from
    ``results_v1/sccnn_latenight_<project>_<LT>_<c>_confmat.csv`` and relabelled
    with the rows DUPLICATION / OTHER-LINKS / NON-LINKS. The means of all
    scores over the projects are printed, and the per-project values are
    returned.

    Parameters
    ----------
    c : str
        Configuration identifier used in the file name. ``'R_LTOLvNL'`` is the
        only value treated specially (see ``trad``).
    trad : bool
        If truthy, the matrix is scored as loaded, so the OTHER-LINKS row does
        not enter the counts. If falsy, the OTHER-LINKS row is first merged
        into DUPLICATION (``c == 'R_LTOLvNL'``) or into NON-LINKS (otherwise).

    Returns
    -------
    pandas.DataFrame
        One row per project (indexed by project name) with the columns
        ``ACC, Pre, Rec, F1, D_Pre, D_Rec, D_F1, NL_Pre, NL_Rec, NL_F1`` and
        ``OL_Corr`` (share of the OTHER-LINKS row that falls in column 0).
    """
    scores = {name: [] for name in ('ACC', 'Pre', 'Rec', 'F1',
                                    'D_Pre', 'D_Rec', 'D_F1',
                                    'NL_Pre', 'NL_Rec', 'NL_F1')}
    avg_ol_0 = []
    avg_ol_1 = []

    for s in valid_projects:
        filename = 'results_v1/sccnn_latenight_'+s.lower()+'_'+LT+'_'+c+'_confmat.csv'
        confmat_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, index_col='Class')
        confmat_df = pd.DataFrame(confmat_df.values, index=['DUPLICATION', 'OTHER-LINKS', 'NON-LINKS'])

        # Each row divided by its row total; computed once and read twice.
        row_shares = confmat_df.transpose()/confmat_df.sum(axis=1)
        avg_ol_0.append(row_shares.loc[0, "OTHER-LINKS"])
        avg_ol_1.append(row_shares.loc[1, "OTHER-LINKS"])

        if not trad:
            confmat_df = _collapse_confmat(confmat_df, c)

        tp, fp, tn, fn = get_tpfptnfn(confmat_df)

        for name, value in _binary_metrics(tp, fp, tn, fn).items():
            scores[name].append(value)

    def _report(label, values):
        # Mean over all projects, rounded to two decimals.
        print(label+": "+str(round(np.mean(values),2)))

    separator = "+++++++++++++++++"

    print(separator)
    _report("ACC", scores['ACC'])
    print(separator)
    _report("PRE", scores['Pre'])
    _report("REC", scores['Rec'])
    _report("F1", scores['F1'])
    print(separator)
    _report("D PRE", scores['D_Pre'])
    _report("D REC", scores['D_Rec'])
    _report("D F1", scores['D_F1'])
    print(separator)
    _report("NL PRE", scores['NL_Pre'])
    _report("NL REC", scores['NL_Rec'])
    _report("NL F1", scores['NL_F1'])
    print(separator)
    _report("OL 0", avg_ol_0)
    _report("OL 1", avg_ol_1)

    print("OL STD: "+str(round(np.std(avg_ol_0),2)))

    res_dict = dict(scores)
    res_dict['OL_Corr'] = avg_ol_0

    # A flat list gives a plain project index; wrapping it in another list
    # (as before) produced a one-level MultiIndex.
    res_data = pd.DataFrame(res_dict, index=list(valid_projects))

    return res_data

In [None]:
# 'R_LTvNL' scored on the matrix as loaded (trad=True: the OTHER-LINKS row is not merged).
get_results('R_LTvNL', True)

In [None]:
# 'R_LTvNL' with trad=False: the OTHER-LINKS row is merged into NON-LINKS before scoring.
get_results('R_LTvNL', False)

In [None]:
# 'R_LTvNLOL' scored on the matrix as loaded (trad=True: the OTHER-LINKS row is not merged).
get_results('R_LTvNLOL', True)

In [None]:
# 'R_LTvNLOL' with trad=False: the OTHER-LINKS row is merged into NON-LINKS before scoring.
get_results('R_LTvNLOL', False)

In [None]:
# 'R_LTOLvNL' scored on the matrix as loaded (trad=True: the OTHER-LINKS row is not merged).
get_results('R_LTOLvNL', True)

In [None]:
# 'R_LTOLvNL' with trad=False: the OTHER-LINKS row is merged into DUPLICATION before scoring.
get_results('R_LTOLvNL', False)