# Load Modules

In [13]:
#Needed Modules
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

# Marker line so the cell output shows that the notebook kernel has started running.
print("Starting Notebook.")

# Global seaborn appearance for every plot in this notebook:
# font scale 1 and the "whitegrid" style.
sns.set(font_scale = 1)
sns.set_style("whitegrid")

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

from tld.linktypes import fine_linktype_map

Starting Notebook.


# Repositories

In [7]:
# Repositories to evaluate. Each name is lower-cased to build the path of its
# CSV file ('data/processed/links_plus_<name>.csv') when the link data is loaded,
# and is used as the key of the per-repository link tables.
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 
           'JiraEcosystem', 'MariaDB', 'Mojang', 'MongoDB', 
           'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

# 'Mindville' is not part of SOURCES.
# NOTE(review): presumably left out on purpose -- confirm why and record the reason here.

In [8]:
# Per-repository link tables, keyed by the repository name from SOURCES.
link_df_dict = {}
for source in SOURCES:
    csv_path = f"data/processed/links_plus_{source.lower()}.csv"
    links = pd.read_csv(csv_path, sep=';', encoding="UTF-8", low_memory=False).drop_duplicates()

    # Translate the raw link types into the labels defined by fine_linktype_map.
    links['mappedtype'] = links['linktype'].map(fine_linktype_map)

    # Keep only labels that cover at least 1% of this repository's rows.
    # value_counts() ignores missing labels, so rows without a mapped type
    # can never be "valid" and are filtered out as well.
    type_counts = links['mappedtype'].value_counts()
    frequent_enough = type_counts >= len(links)*0.01
    valid_types = set(type_counts[frequent_enough].index)

    link_df_dict[source] = links[links['mappedtype'].isin(valid_types)]

In [11]:
# Spot check: label distribution that remains for the Mojang repository after filtering.
link_df_dict['Mojang']['mappedtype'].value_counts()

Duplicate    193989
Non-Link      43105
Relate        20377
Name: mappedtype, dtype: int64

## Majority Classifier
Always predicts the majority label.

In [18]:
def _majority_baseline(labels):
    """Score a classifier that always outputs the most frequent label.

    Parameters
    ----------
    labels : pandas.Series
        True labels of one repository.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` of the constant prediction against ``labels``.
    """
    # value_counts() sorts by descending frequency, so the first index entry
    # is the majority label.
    top_label = labels.value_counts().index[0]
    constant_pred = [top_label] * len(labels)
    macro_f1 = f1_score(y_true=labels, y_pred=constant_pred, average='macro')
    acc = accuracy_score(y_true=labels, y_pred=constant_pred)
    return acc, macro_f1


# One (accuracy, macro F1) pair per repository, in SOURCES order.
repos = list(SOURCES)
maj_scores = [_majority_baseline(link_df_dict[repo]["mappedtype"]) for repo in repos]
maj_accs = [scores[0] for scores in maj_scores]
maj_f1s = [scores[1] for scores in maj_scores]

majority_dict = {
    'Repository': repos,
    'Maj. Acc.': maj_accs,
    'Maj. F1': maj_f1s,
}

majority_df = pd.DataFrame(majority_dict)
majority_df

Unnamed: 0,Repository,Maj. Acc.,Maj. F1
0,Apache,0.317582,0.037082
1,Hyperledger,0.353782,0.074665
2,IntelDAOS,0.3949,0.070776
3,JFrog,0.333907,0.071521
4,Jira,0.623825,0.096043
5,JiraEcosystem,0.235094,0.034608
6,MariaDB,0.454485,0.078118
7,Mojang,0.75344,0.286462
8,MongoDB,0.376801,0.060817
9,Qt,0.226175,0.046114


In [19]:
# Average the majority-baseline scores over all repositories.
# numeric_only=True skips the string 'Repository' column explicitly: older pandas
# dropped it with a deprecation warning, pandas >= 2.0 raises a TypeError instead.
majority_df.mean(numeric_only=True)

  majority_df.mean()


Maj. Acc.    0.398855
Maj. F1      0.081256
dtype: float64

## Random Classifier
Predicts a label based on its occurrence in the dataset.
k is the number of classes to be predicted,

N is the total number of samples,

n_1, ... , n_k are the number of samples per class, thus n_i is the number of samples for class i. Then,

p_i, the probability of assigning i as label is equal to n_i/N.

We then have the following calculations:

Expected Precision: $\frac{TP}{TP+FP} = \frac{p_i \, n_i}{p_i \, n_i + p_i \, (N - n_i)} = \frac{\frac{n_i}{N} \, n_i}{\frac{n_i}{N} \, n_i + \frac{n_i}{N} \, (N - n_i)} = \frac{n_i}{N} = p_i$

Expected Recall: $\frac{TP}{TP+FN} = \frac{p_i \, n_i}{p_i \, n_i + (1 - p_i) \, n_i} = \frac{\frac{n_i}{N} \, n_i}{\frac{n_i}{N} \, n_i + \left(1 - \frac{n_i}{N}\right) n_i} = \frac{n_i}{N} = p_i$

Expected F1-Score $= \frac{2 \cdot \text{Expected Precision} \cdot \text{Expected Recall}}{\text{Expected Precision} + \text{Expected Recall}} = \frac{2 \, p_i^2}{2 \, p_i} = p_i$


In [22]:
# Expected scores of a classifier that draws each label i with probability
# p_i = n_i / N (its relative frequency in the repository).
repos = []
ran_accs = []
ran_f1s = []

for s in SOURCES:
    repos.append(s)

    link_df = link_df_dict[s]

    # Relative frequency p_i of every class. Computed once per repository
    # instead of once per class inside an inner loop.
    class_probs = link_df['mappedtype'].value_counts() / len(link_df)

    # Expected accuracy: the prediction matches the truth with probability sum_i p_i^2.
    ran_acc = sum(class_probs**2)
    # Expected macro F1: the per-class expected F1 is p_i, averaged over all classes.
    ran_f1 = sum(class_probs) / len(class_probs)

    ran_f1s.append(ran_f1)
    ran_accs.append(ran_acc)

# NOTE(review): the keys below say 'Maj.' although these are random-classifier
# scores (looks like a copy-paste from the majority baseline). They are kept
# unchanged so that any code indexing these columns keeps working; rename them
# together with those consumers.
random_dict = {
            'Repository': repos,
            'Maj. Acc.' : ran_accs,
            'Maj. F1': ran_f1s,
          }

random_df = pd.DataFrame(random_dict)   
random_df

Unnamed: 0,Repository,Maj. Acc.,Maj. F1
0,Apache,0.198386,0.076923
1,Hyperledger,0.229175,0.142857
2,IntelDAOS,0.250312,0.125
3,JFrog,0.225797,0.142857
4,Jira,0.439577,0.125
5,JiraEcosystem,0.172245,0.090909
6,MariaDB,0.253292,0.125
7,Mojang,0.601964,0.333333
8,MongoDB,0.232483,0.111111
9,Qt,0.153652,0.125


In [21]:
# Average the random-baseline scores over all repositories.
# numeric_only=True skips the string 'Repository' column explicitly: older pandas
# dropped it with a deprecation warning, pandas >= 2.0 raises a TypeError instead.
random_df.mean(numeric_only=True)

  random_df.mean()


Maj. Acc.    0.261691
Maj. F1      0.136586
dtype: float64