In [21]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import make_pipeline, Pipeline
import seaborn as sns,matplotlib.pyplot as plt

from library.configs import CLFS, IMBS, CV, SCORERS
from library.utils import evaluate, read_data

In [3]:
DATASETS = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
len(DATASETS)

32

In [8]:
NOISE_TYPES = ['No','FN-Only','FP-Only','Both']
CV = RepeatedStratifiedKFold(n_splits=5,n_repeats=2,random_state=99)

In [9]:
models = {}
for im,samp in IMBS.items():
    for c,clf in CLFS.items():
        models[(im,c)] = Pipeline([('samp',samp),('clf',clf)])
models.keys(),len(models)

(dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'svm'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'svm'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'svm'), ('wilson', 'knn'), ('wilson', 'rf'), ('tomek', 'dt'), ('tomek', 'lr'), ('tomek', 'nb'), ('tomek', 'svm'), ('tomek', 'knn'), ('tomek', 'rf'), ('None', 'dt'), ('None', 'lr'), ('None', 'nb'), ('None', 'svm'), ('None', 'knn'), ('None', 'rf')]),
 30)

In [19]:
cols = pd.MultiIndex.from_product([NOISE_TYPES,IMBS.keys(),CLFS.keys(),[f.__name__ for f in SCORERS]],names=['noise','imb','clf','metric'])
df = pd.DataFrame(index=DATASETS,columns=cols)

In [None]:
for k in models:
    print(k)
    st = perf_counter()
    for it,d in enumerate(DATASETS):
        print(it)
        X,y_noisy,y_real = read_data(d,stats=True)
        for noise_type in NOISE_TYPES:
            y = y_noisy.copy()
            if noise_type=='No':
                y = y_real.copy()
            elif noise_type=='Both':
                pass
            elif noise_type=='FP-Only':  #So recall 1.0
                y[y_real==1] = 1
            elif noise_type=='FN-Only':  #So precision 1.0
                y[y_real==0] = 0
            p = precision_score(y_real,y)
            r = recall_score(y_real,y)
            print(noise_type,p,r)
                
            r = evaluate(models[k],X,y,y_real,CV,SCORERS)
            for f in r:
                df.loc[d,(noise_type,k[0],k[1],f)] = r[f].mean()
            #print(noise_type,df.loc[d,(noise_type,k[0],k[1],slice(None))],'\n')
        print()
        df.to_csv("FP vs FN Noise Impact.csv")
    print(perf_counter()-st,'\n')

('smote', 'dt')
0
activemq-5.8.0.csv noise:0.058, imb:15.847,203,3217, Shape:(3420, 65)
No 1.0 1.0
FN-Only 1.0 0.5145631067961165
FP-Only 0.6798679867986799 1.0
Both 0.5221674876847291 0.5145631067961165

1
groovy-1_6_BETA_1.csv noise:0.128, imb:6.017,117,704, Shape:(821, 65)
No 1.0 1.0
FN-Only 1.0 0.5857142857142857
FP-Only 0.4794520547945205 1.0
Both 0.3504273504273504 0.5857142857142857

2
activemq-5.3.0.csv noise:0.094, imb:15.669,142,2225, Shape:(2367, 65)
No 1.0 1.0
FN-Only 1.0 0.3449612403100775
FP-Only 0.8295819935691319 1.0
Both 0.6267605633802817 0.3449612403100775

3
wicket-1.3.0-incubating-beta-1.csv noise:0.164, imb:4.806,288,1384, Shape:(1672, 65)
No 1.0 1.0
FN-Only 1.0 0.5643564356435643
FP-Only 0.3042168674698795 1.0
Both 0.19791666666666666 0.5643564356435643

4
jruby-1.1.csv noise:0.175, imb:3.540,161,570, Shape:(731, 65)
No 1.0 1.0
FN-Only 1.0 0.6896551724137931
FP-Only 0.4627659574468085 1.0
Both 0.37267080745341613 0.6896551724137931

5
jruby-1.4.0.csv noise:0.190,

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


FP-Only 0.8301886792452831 1.0
Both 0.3076923076923077 0.09090909090909091

19
camel-2.10.0.csv noise:0.053, imb:24.447,311,7603, Shape:(7914, 65)
No 1.0 1.0
FN-Only 1.0 0.26521739130434785
FP-Only 0.4791666666666667 1.0
Both 0.19614147909967847 0.26521739130434785

20
derby-10.2.1.6.csv noise:0.290, imb:9.906,180,1783, Shape:(1963, 65)
No 1.0 1.0
FN-Only 1.0 0.20574886535552195
FP-Only 0.9375886524822695 1.0
Both 0.7555555555555555 0.20574886535552195

21
jruby-1.5.0.csv noise:0.218, imb:3.098,276,855, Shape:(1131, 65)
No 1.0 1.0
FN-Only 1.0 0.6829268292682927
FP-Only 0.271523178807947 1.0
Both 0.2028985507246377 0.6829268292682927

22
derby-10.3.1.4.csv noise:0.267, imb:13.051,157,2049, Shape:(2206, 65)
No 1.0 1.0
FN-Only 1.0 0.17787742899850523
FP-Only 0.9462517680339463 1.0
Both 0.7579617834394905 0.17787742899850523

23
lucene-2.9.0.csv noise:0.226, imb:3.921,278,1090, Shape:(1368, 65)
No 1.0 1.0
FN-Only 1.0 0.4432234432234432
FP-Only 0.6348837209302326 1.0
Both 0.4352517985611511

## Analysis

## Notes
1. Both `pr_rec_score` & `matthews_corrcoef`: As expected, "No" noise performs best, while "Both" worst, in terms of whole avg.
2. Interestingly, "Bug" i.e. (So recall=1.0) always performs better than "Non-Bug" i.e. (So precision=1.0)
3. Avg Rank: pr_rec_score- `[1.74, 1.95, 2.82, 3.48]`, Mathew- `[1.78 , 2.46, 2.59, 3.17]`   (['No','Bug','Non-Bug','Both'] in that serial)
4. So for Mathew, in terms of Avg Rank, Non-bug is better, contradicting point 2

In [None]:
import numpy as np
import pandas as pd,os
from scipy.stats import wilcoxon, friedmanchisquare, rankdata
import scikit_posthocs as sp

In [None]:
df = pd.read_csv("Impact of Types of Noise.csv",header=[0,1,2,3],index_col=0)
df.isna().sum().sum()

In [None]:
tmp = df.drop(columns=['pr_rec_score'],level=3,axis=1).droplevel(3,axis=1)
tmp

In [None]:
dic = {}
for noise_type in NOISE_TYPES:
    dic[noise_type] = tmp[noise_type].values.reshape(-1)
    print(noise_type,dic[noise_type].mean())

In [None]:
friedmanchisquare(*(dic[a] for a in dic))

In [None]:
res = np.concatenate([dic[ntype].reshape(-1,1) for ntype in NOISE_TYPES],axis=1)
res.shape

In [None]:
ranks = res.copy()
for i in range(res.shape[0]):
    ranks[i] = rankdata(1-res[i])

In [None]:
ranks.mean(axis=0)

In [None]:
dic.keys()

In [None]:
df = pd.DataFrame(dic)
df.columns = ['No','PN','NP','Both']

In [None]:
f = sns.boxplot(data=df);

In [None]:
f.get_figure().savefig("figures/Noise Type.svg",format='svg')