In [2]:
from time import perf_counter
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import make_pipeline, Pipeline
import seaborn as sns,matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')

from library.configs import CLFS, IMBS, CV, SCORERS
from library.utils import evaluate, read_data

In [3]:
# Hand-written list of ten dataset files. The original cell assigned this list
# to DATASETS and overwrote it on the very next line, so it never took effect;
# it is kept here under its own name for reference.
DATASETS_SUBSET = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']
# Every CSV file found in JIRA/.
# - sorted(): os.listdir() returns entries in arbitrary order, which would make
#   the row order of the result tables depend on the platform.
# - endswith('.csv'): match the extension, not any file name containing "csv".
DATASETS = sorted(f for f in os.listdir("JIRA/") if f.endswith('.csv'))
len(DATASETS)

32

In [4]:
# Names of the label-noise scenarios (not referenced by the cells visible in this section).
NOISE_TYPES = ['No', 'FN-Only', 'FP-Only', 'Both']

# Rebinds the CV name imported from library.configs:
# 5 stratified folds, repeated twice, with a fixed seed.
CV = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=2,
    random_state=99,
)

In [5]:
# One Pipeline per (sampler, classifier) combination, keyed by the pair of
# config names. Samplers vary slowest, classifiers fastest.
# NOTE: each sampler / classifier object from IMBS / CLFS is placed into several
# pipelines, i.e. the pipelines share the same underlying instances.
models = {
    (imb_name, clf_name): Pipeline([('samp', sampler), ('clf', classifier)])
    for imb_name, sampler in IMBS.items()
    for clf_name, classifier in CLFS.items()
}
models.keys(),len(models)

(dict_keys([('smote', 'dt'), ('smote', 'lr'), ('smote', 'nb'), ('smote', 'svm'), ('smote', 'knn'), ('smote', 'rf'), ('rus', 'dt'), ('rus', 'lr'), ('rus', 'nb'), ('rus', 'svm'), ('rus', 'knn'), ('rus', 'rf'), ('wilson', 'dt'), ('wilson', 'lr'), ('wilson', 'nb'), ('wilson', 'svm'), ('wilson', 'knn'), ('wilson', 'rf'), ('tomek', 'dt'), ('tomek', 'lr'), ('tomek', 'nb'), ('tomek', 'svm'), ('tomek', 'knn'), ('tomek', 'rf'), ('None', 'dt'), ('None', 'lr'), ('None', 'nb'), ('None', 'svm'), ('None', 'knn'), ('None', 'rf')]),
 30)

In [None]:
from sklearn.metrics import confusion_matrix
# Fractions of the eligible labels that get flipped from 1 to 0.
FRACS = [0,.25,.5,.75,1.0]
# Fixed seed for the noise draws so that PN10.csv can be regenerated exactly
# (given the same DATASETS order). The original used the unseeded global RNG.
NOISE_SEED = 99
noise_rng = np.random.default_rng(NOISE_SEED)
cols = pd.MultiIndex.from_product([FRACS,IMBS.keys(),CLFS.keys(),[f.__name__ for f in SCORERS]],names=['frac','imb','clf','metric'])
# One row per dataset, one column per (frac, imb, clf, metric); filled in below.
df = pd.DataFrame(index=DATASETS,columns=cols)
for it,d in enumerate(DATASETS):
    print(it)
    X,y_noisy,y_real = read_data(d,stats=True)
    C = np.argwhere(y_real==1).ravel()   # positions where y_real is 1
    N = np.argwhere(y_noisy==0).ravel()  #PN: positions where y_noisy is 0
    idx = np.intersect1d(N,C)            # 1 in y_real but 0 in y_noisy
    tn, fp, fn, tp = confusion_matrix(y_real,y_noisy).ravel()
    # len(idx) and fn count the same cases, so the first two numbers should agree.
    print("idx:",len(idx),fn,len(N),len(C))
    for frac in FRACS:
        print("frac:",frac)
        # Start from the real labels and flip a random `frac` share of idx to 0.
        y = y_real.copy()
        size = int(frac*len(idx))
        to_pollute = noise_rng.choice(idx,size=size,replace=False)
        y[to_pollute] = 0
        print(len(idx),len(to_pollute))
        # Sanity check: precision/recall of the polluted labels against the real ones,
        # followed by the number of labels that differ.
        print(f"{precision_score(y_real,y):.3f},{recall_score(y_real,y):.3f}",(y_real!=y).sum())
        
        for k in models:
            r = evaluate(models[k],X,y,y_real,CV,SCORERS)
            for f in r:
                # Store the mean of each metric's values under (frac, imb, clf, metric).
                df.loc[d,(frac,k[0],k[1],f)] = r[f].mean()
    # Rewritten after every dataset, so finished datasets are on disk
    # even if a later one fails.
    df.to_csv("PN10.csv")
    print()

In [None]:
from sklearn.metrics import confusion_matrix
# Fractions of the eligible labels that get flipped from 0 to 1.
FRACS = [0,.25,.5,.75,1.0]
# Fixed seed for the noise draws so that NP10.csv can be regenerated exactly
# (given the same DATASETS order). The original used the unseeded global RNG.
NOISE_SEED = 99
noise_rng = np.random.default_rng(NOISE_SEED)
cols = pd.MultiIndex.from_product([FRACS,IMBS.keys(),CLFS.keys(),[f.__name__ for f in SCORERS]],names=['frac','imb','clf','metric'])
# One row per dataset, one column per (frac, imb, clf, metric); filled in below.
df = pd.DataFrame(index=DATASETS,columns=cols)
for it,d in enumerate(DATASETS):
    X,y_noisy,y_real = read_data(d,stats=True)
    C = np.argwhere(y_real==0).ravel()   # positions where y_real is 0
    N = np.argwhere(y_noisy==1).ravel()  #NP: positions where y_noisy is 1
    idx = np.intersect1d(N,C)            # 0 in y_real but 1 in y_noisy
    tn, fp, fn, tp = confusion_matrix(y_real,y_noisy).ravel()
    # idx holds the false positives of y_noisy w.r.t. y_real, so the matching
    # confusion-matrix cell is fp (the original printed fn, copied from the PN cell).
    print("idx:",len(idx),fp,len(N),len(C))
    for frac in FRACS:
        print("frac:",frac)
        # Start from the real labels and flip a random `frac` share of idx to 1.
        y = y_real.copy()
        size = int(frac*len(idx))
        to_pollute = noise_rng.choice(idx,size=size,replace=False)
        y[to_pollute] = 1
        print(len(idx),len(to_pollute))
        # Sanity check: precision/recall of the polluted labels against the real ones,
        # followed by the number of labels that differ.
        print(f"{precision_score(y_real,y):.3f},{recall_score(y_real,y):.3f}",(y_real!=y).sum())
        
        for k in models:
            r = evaluate(models[k],X,y,y_real,CV,SCORERS)
            for f in r:
                # Store the mean of each metric's values under (frac, imb, clf, metric).
                df.loc[d,(frac,k[0],k[1],f)] = r[f].mean()
    # Rewritten after every dataset, so finished datasets are on disk
    # even if a later one fails.
    df.to_csv("NP10.csv")

## Why is Under-sampling Bad?

In [6]:
from sklearn.base import BaseEstimator, ClassifierMixin
class FilterPN(BaseEstimator, ClassifierMixin):
    """Fit ``est`` after removing samples that are 1 in ``y_real`` but 0 in ``y_noisy``.

    The remaining samples are randomly under-sampled before ``est`` is fitted
    on them with their noisy labels.

    Parameters
    ----------
    est : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``. It is fitted
        in place (no copy is made).
    random_state : optional, default None
        Passed to ``RandomUnderSampler`` so the under-sampling can be made
        reproducible. ``None`` keeps the previous unseeded behaviour.
    """
    def __init__(self,est,random_state=None):
        self.est = est
        self.random_state = random_state

    def fit(self,X,y_noisy,y_real):
        """Drop the real-1/noisy-0 samples, under-sample the rest, fit ``est``.

        X, y_noisy and y_real are indexed with integer position arrays here
        (assumes numpy-style arrays of equal length; TODO confirm with callers).
        Returns ``self``.
        """
        C = np.argwhere(y_real==1).ravel()
        N = np.argwhere(y_noisy==0).ravel()  #PN
        idx = np.intersect1d(N,C)
        # Positions of every sample that is NOT in idx.
        remaining = np.setdiff1d(np.arange(len(X)),idx)
        X,y = X[remaining], y_noisy[remaining]
        rus = RandomUnderSampler(random_state=self.random_state)
        # The resampled output is discarded; the call is made for the
        # sample_indices_ attribute it sets, which is used for the indexing below.
        rus.fit_resample(X,y)
        chosen = rus.sample_indices_
        self.est.fit(X[chosen],y[chosen])
        return self

    def predict(self,X):
        """Delegate to the wrapped estimator's ``predict``."""
        return self.est.predict(X)

    def predict_proba(self,X):
        """Delegate to the wrapped estimator's ``predict_proba``."""
        return self.est.predict_proba(X)

In [7]:
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
cols= pd.MultiIndex.from_product([IMBS.keys(),CLFS.keys(),[f.__name__ for f in SCORERS]],names=['imb','clf','metric'])
# Results are checkpointed to this file after every dataset. Resume from it when
# it exists; otherwise start from an empty table. (The original always read the
# file, so a first run without it failed and the empty table was never used.)
CHECKPOINT = "CorrectNeg.csv"
if os.path.exists(CHECKPOINT):
    df = pd.read_csv(CHECKPOINT, header=[0,1,2],index_col=0)
else:
    df = pd.DataFrame(index=DATASETS,columns=cols)
for it,d in enumerate(DATASETS):
    print(it)
    # Skip datasets whose row is already complete in the checkpoint. A dataset
    # that is missing from the checkpoint index is computed instead of raising KeyError.
    if d in df.index and df.loc[d].isna().sum()==0:
        print("Skipping:",it)
        continue
    X,y_noisy,y_real = read_data(d,stats=True)
    for k in models:
        clf = FilterPN(models[k])
        r = evaluate(clf,X,y_noisy,y_real,CV,SCORERS,sample_weight=y_real)
        for f in r:
            # Store the mean of each metric's values under (imb, clf, metric).
            df.loc[d,(k[0],k[1],f)] = r[f].mean()
    df.to_csv(CHECKPOINT)
    print()

0
Skipping: 0
1
Skipping: 1
2
Skipping: 2
3
Skipping: 3
4
Skipping: 4
5
Skipping: 5
6
Skipping: 6
7
Skipping: 7
8
Skipping: 8
9
Skipping: 9
10
Skipping: 10
11
Skipping: 11
12
Skipping: 12
13
Skipping: 13
14
Skipping: 14
15
Skipping: 15
16
Skipping: 16
17
Skipping: 17
18
Skipping: 18
19
hive-0.12.0.csv noise:0.087, imb:56.870,46,2616, Shape:(2662, 65)

20
hive-0.9.0.csv noise:0.179, imb:25.717,53,1363, Shape:(1416, 65)

21
jruby-1.1.csv noise:0.175, imb:3.540,161,570, Shape:(731, 65)

22
jruby-1.4.0.csv noise:0.190, imb:3.890,200,778, Shape:(978, 65)

23
jruby-1.5.0.csv noise:0.218, imb:3.098,276,855, Shape:(1131, 65)

24
jruby-1.7.0.preview1.csv noise:0.099, imb:8.902,163,1451, Shape:(1614, 65)

25
lucene-2.3.0.csv noise:0.204, imb:4.031,160,645, Shape:(805, 65)

26
lucene-2.9.0.csv noise:0.226, imb:3.921,278,1090, Shape:(1368, 65)

27
lucene-3.0.0.csv noise:0.185, imb:6.037,190,1147, Shape:(1337, 65)

28
lucene-3.1.csv noise:0.120, imb:7.477,331,2475, Shape:(2806, 65)

29
wicket-1.3.0

## Analysis

## Notes
1. For both `pr_rec_score` and `matthews_corrcoef`: as expected, "No" noise performs best and "Both" performs worst in terms of the overall average.
2. Interestingly, "Bug" (i.e., recall = 1.0) always performs better than "Non-Bug" (i.e., precision = 1.0).
3. Average rank: `pr_rec_score` – `[1.74, 1.95, 2.82, 3.48]`; `matthews_corrcoef` – `[1.78, 2.46, 2.59, 3.17]` (in the order `['No', 'Bug', 'Non-Bug', 'Both']`).
4. So for `matthews_corrcoef`, in terms of average rank, "Non-Bug" is better, contradicting point 2.

In [None]:
import numpy as np
import pandas as pd,os
from scipy.stats import wilcoxon, friedmanchisquare, rankdata, trim_mean, linregress
import scikit_posthocs as sp
import pingouin as pg
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Result tables saved as PN10.csv / NP10.csv: four header rows form the column
# MultiIndex, the first column is the row index.
PN = pd.read_csv("PN10.csv", header=[0, 1, 2, 3], index_col=0)
NP = pd.read_csv("NP10.csv", header=[0, 1, 2, 3], index_col=0)

# Reference table (three header rows): remove the 'UBag' group from column
# level 0 and keep only the rows present in PN, in PN's order.
clean = (
    pd.read_csv("Results/Clean.csv", header=[0, 1, 2], index_col=0)
    .drop(columns=['UBag'], level=0)
    .loc[PN.index]
)
clean.shape,PN.shape,NP.shape

In [None]:
# Metric(s) to exclude. Original note listing metric names: pr_rec_score, matthews_corrcoef
METRIC_DROP = ['matthews_corrcoef']

# Remove the excluded metric columns, then drop the metric level itself from the
# column index (level 3 for PN/NP, level 2 for clean).
# NOTE: PN, NP and clean are rebound in place, so this cell is not re-runnable
# without first re-running the loading cell.
PN, NP = (
    frame.drop(columns=METRIC_DROP, level=3).droplevel(3, axis=1)
    for frame in (PN, NP)
)
clean = clean.drop(columns=METRIC_DROP, level=2).droplevel(2, axis=1)
NP.shape,PN.shape, clean.shape

In [None]:
# Distinct labels in the outermost column level of `clean`.
set(clean.columns.unique(level=0))

In [None]:
FRACS = [0.,.25,.5,.75,1.0]
pn,np = {},{}
for c in FRACS:
    cpn = PN[str(c)] - clean 
    cnp = NP[str(c)] - clean
    pn[c] = trim_mean(cpn.values.reshape(-1),.05)
    np[c] = trim_mean(cnp.values.reshape(-1),.05)

In [None]:
X = list(np.keys())
metr = 'dAPRC'
np_df = pd.DataFrame({'Noise Level':X,metr:[np[c] for c in X]})
pn_df = pd.DataFrame({'Noise Level':X,metr:[pn[c] for c in X]})
plt.xticks(FRACS)
sns.regplot(x='Noise Level',y=metr,data=np_df,ci=95,scatter=True,label='NP')
fig = sns.regplot(x='Noise Level',y=metr,data=pn_df,ci=95,scatter=True,label='PN')
plt.legend();

In [None]:
form = 'pdf'
fig.get_figure().savefig(f'figures/PN_NP_{metr}.{form}',format=form)

In [None]:
meth = 'spearman'
pg.corr(X,list(np.values()),method=meth)

In [None]:
pg.corr(X,list(pn.values()),method=meth)

In [None]:
linregress(X,list(pn.values()))

In [None]:
linregress(X,list(np.values()))