In [2]:
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, power_transform, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, auc, accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.pipeline import make_pipeline
import seaborn as sns,matplotlib.pyplot as plt

from library.utils import evaluate, read_data
from library.cleaners import kDN, ih_prob

In [3]:
# Hand-picked subset of releases. The original cell assigned this list to
# DATASETS and then immediately overwrote it with the directory listing below,
# so it never took effect; it is kept under its own name for reference only.
DATASETS_SUBSET = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']

# Every CSV file found in JIRA/.
#  - endswith('.csv') instead of a substring test, so names that merely contain
#    "csv" (backups, notes, ...) are not picked up.
#  - sorted(), because os.listdir() returns entries in arbitrary order; without
#    it DATASETS[1] and the row order of the results table vary between machines.
DATASETS = sorted(f for f in os.listdir("JIRA/") if f.endswith('.csv'))
len(DATASETS)

32

In [None]:
# Load a single dataset for a quick manual look: the feature matrix together
# with the noisy and the real label vectors.
# NOTE(review): the second positional argument is presumably the same `stats`
# flag that the experiment loop passes as stats=False -- confirm in library.utils.
sample_dataset = DATASETS[1]
X, y_noisy, y_real = read_data(sample_dataset, True)

In [None]:
# Quality of the noisy labels themselves, scored against the real labels:
# the real labels play the role of y_true and the noisy labels of y_pred.
precision, recall = (
    metric(y_real, y_noisy) for metric in (precision_score, recall_score)
)
precision, recall

In [None]:
# kDN values (K=5) computed once against the noisy labels (n) and once against
# the real labels (c); the 'w' string only separates means from stds in the output.
n, c = (kDN(X, labels, K=5) for labels in (y_noisy, y_real))
(
    n.mean(),
    c.mean(),
    'w',
    n.std(),
    c.std(),
)

In [None]:
# Sorted kDN values of the noisy-label run, drawn in red. Axis labels and a
# title are set so the figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(range(len(n)), np.sort(n), 'r')
ax.set(
    xlabel="instance (sorted by kDN)",
    ylabel="kDN (K=5)",
    title="Sorted kDN values, noisy labels",
);

In [None]:
# Resampling strategies under comparison.
# NOTE(review): SMOTE, RandomUnderSampler and RandomForestClassifier are left
# unseeded here, so repeated runs will not give bit-identical scores.
imbs = {
    'smote': SMOTE(k_neighbors=5),
    # sampling_strategy must be passed by keyword: recent imbalanced-learn
    # releases declare it keyword-only and reject the positional form.
    'rus': RandomUnderSampler(sampling_strategy='not minority'),
    'wilson':EditedNearestNeighbours(n_neighbors=5),  #Default was 3
}
# Classifiers under comparison.
clfs = {
    'nb': GaussianNB(),
    'rf': RandomForestClassifier(n_estimators=50),
}

# One imblearn pipeline per (sampler, classifier) pair, keyed by the pair of
# short names. The loop variables are deliberately not called `c`, so the kDN
# array `c` computed in an earlier cell is not overwritten.
models = {}
for imb_name, sampler in imbs.items():
    for clf_name, clf in clfs.items():
        models[(imb_name, clf_name)] = make_pipeline(sampler, clf)

In [14]:
# 10-fold stratified cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=42,
)


def pr_rec_score(y, yp):
    """Return the area under the precision-recall curve of `yp` against `y`.

    `y` holds the reference labels and `yp` the predictions handed over by the
    caller; both are forwarded unchanged to `precision_recall_curve`.
    """
    curve = precision_recall_curve(y, yp)
    precisions, recalls = curve[0], curve[1]
    # auc() takes the x-coordinates first, hence recall before precision.
    return auc(recalls, precisions)


# Metric functions; their __name__ is used as the column label of the results.
scorers = [matthews_corrcoef, pr_rec_score]
# Label-noise scenarios evaluated for every dataset.
NOISE_TYPES = ['No', 'Bug', 'Non-Bug', 'Both']


In [None]:
# Empty results table: one row per dataset, one column per
# (noise, sampler, classifier, metric) combination.
metric_names = [scorer.__name__ for scorer in scorers]
column_levels = [NOISE_TYPES, imbs.keys(), clfs.keys(), metric_names]
cols = pd.MultiIndex.from_product(
    column_levels,
    names=['noise', 'imb', 'clf', 'metric'],
)
df = pd.DataFrame(index=DATASETS, columns=cols)

In [None]:
# Main experiment: for every (sampler, classifier) pipeline, every dataset and
# every noise scenario, build the label vector for that scenario, run
# `evaluate`, and store the mean of each returned metric in `df`.
for k in models:
    print(k)
    for d in DATASETS:
        X,y_noisy,y_real = read_data(d,stats=False)
        for noise_type in NOISE_TYPES:
            # Start from the noisy labels and repair them according to the scenario.
            y = y_noisy.copy()
            if noise_type=='No':
                # No noise at all: use the real labels.
                y = y_real.copy()
            elif noise_type=='Bug':  #So recall 1.0
                # Every real bug is labelled as a bug, so the only remaining
                # mislabels are real non-bugs marked as bugs.
                y[y_real==1] = 1
            elif noise_type=='Non-Bug': #So precision 1.0
                # Every real non-bug is labelled as a non-bug, so the only
                # remaining mislabels are real bugs marked as non-bugs.
                y[y_real==0] = 0
            elif noise_type!='Both':
                # 'Both' keeps the noisy labels untouched; anything else is a typo
                # that would otherwise be evaluated silently as 'Both'.
                raise ValueError(f"Unknown noise type: {noise_type!r}")

            # The original cell also computed precision/recall of `y` here, but never
            # used them (and the recall variable was overwritten on the next line),
            # so those two calls were removed.
            # `evaluate` comes from library.utils; the code below only relies on its
            # result being iterable over metric names, with results[name].mean() defined.
            results = evaluate(models[k],X,y,y_real,cv,scorers)
            for f in results:
                df.loc[d,(noise_type,k[0],k[1],f)] = results[f].mean()

In [None]:
# Persist the filled results table so the analysis can be re-run without
# repeating the experiment.
results_path = "Impact of Types of Noise.csv"
df.to_csv(results_path)

## Analysis

## Notes
1. For both `pr_rec_score` and `matthews_corrcoef`: as expected, "No" noise performs best and "Both" performs worst in terms of the overall average.
2. Interestingly, "Bug" (i.e. label recall = 1.0) always performs better than "Non-Bug" (i.e. label precision = 1.0).
3. Average rank (lower is better), in the order `['No', 'Bug', 'Non-Bug', 'Both']`: `pr_rec_score` — `[1.74, 1.95, 2.82, 3.48]`; `matthews_corrcoef` — `[1.78, 2.46, 2.59, 3.17]`.
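4. So for Matthews, in terms of average rank, "Non-Bug" is better, contradicting point 2. (TODO: verify — the Matthews ranks quoted in point 3 give "Bug" 2.46 vs "Non-Bug" 2.59, which, with lower rank being better, still favours "Bug".)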

In [4]:
import numpy as np
import pandas as pd,os
from scipy.stats import wilcoxon, friedmanchisquare, rankdata
import scikit_posthocs as sp

In [11]:
# Reload the saved results; the first four CSV rows form the column MultiIndex
# and the first column holds the dataset names.
df = pd.read_csv(
    "Impact of Types of Noise.csv",
    header=[0, 1, 2, 3],
    index_col=0,
)
# Total number of missing cells across the whole table.
df.isna().sum().sum()

0

In [20]:
# Drop the Matthews columns, then remove the metric level (level 3) from the
# column index so the remaining columns are keyed by (noise, imb, clf).
without_mcc = df.drop(columns=['matthews_corrcoef'], level=3)
tmp = without_mcc.droplevel(3, axis=1)
tmp

noise,No,No,No,No,No,No,Bug,Bug,Bug,Bug,...,Non-Bug,Non-Bug,Non-Bug,Non-Bug,Both,Both,Both,Both,Both,Both
imb,smote,smote,rus,rus,wilson,wilson,smote,smote,rus,rus,...,rus,rus,wilson,wilson,smote,smote,rus,rus,wilson,wilson
clf,nb,rf,nb,rf,nb,rf,nb,rf,nb,rf,...,nb,rf,nb,rf,nb,rf,nb,rf,nb,rf
activemq-5.8.0.csv,0.395876,0.322205,0.393498,0.299466,0.422668,0.370918,0.397364,0.305907,0.3998,0.311946,...,0.380137,0.307914,0.394135,0.360081,0.394758,0.277383,0.390946,0.293403,0.41389,0.330305
groovy-1_6_BETA_1.csv,0.419579,0.623572,0.409148,0.507401,0.416746,0.561341,0.422555,0.532286,0.412621,0.510249,...,0.377759,0.501343,0.427294,0.593162,0.415522,0.414697,0.385841,0.477746,0.403195,0.478058
activemq-5.3.0.csv,0.466876,0.499462,0.463518,0.463055,0.478188,0.4787,0.469061,0.494934,0.462018,0.458214,...,0.450472,0.44243,0.464267,0.499088,0.45449,0.470456,0.447931,0.423382,0.464425,0.502056
wicket-1.3.0-incubating-beta-1.csv,0.448159,0.452276,0.420954,0.439376,0.465441,0.513826,0.448219,0.34606,0.458267,0.351514,...,0.402573,0.448893,0.437103,0.510831,0.446318,0.305458,0.44326,0.298553,0.488007,0.332677
jruby-1.1.csv,0.628827,0.6832,0.622085,0.68237,0.617425,0.69677,0.625469,0.629403,0.619495,0.629065,...,0.647244,0.660948,0.623833,0.704163,0.633782,0.574448,0.621708,0.558252,0.565624,0.619919
jruby-1.4.0.csv,0.606393,0.632672,0.603336,0.591495,0.598154,0.639718,0.603682,0.613493,0.590694,0.605625,...,0.599948,0.61974,0.590843,0.612786,0.613928,0.558533,0.606308,0.569491,0.604341,0.610999
lucene-2.3.0.csv,0.735496,0.891819,0.696705,0.863578,0.723203,0.842385,0.681853,0.874443,0.666622,0.859787,...,0.668665,0.817607,0.689206,0.823006,0.618986,0.746091,0.586611,0.702735,0.611685,0.722452
hbase-0.95.2.csv,0.528984,0.576269,0.524307,0.580407,0.546821,0.578973,0.529158,0.56668,0.525008,0.573902,...,0.514473,0.521795,0.519988,0.51505,0.518057,0.448615,0.51435,0.476605,0.51536,0.478108
lucene-3.0.0.csv,0.335744,0.547999,0.333666,0.473437,0.373376,0.482133,0.32706,0.421296,0.318986,0.390199,...,0.350614,0.404012,0.381171,0.420267,0.301629,0.284711,0.311482,0.321902,0.348917,0.315272
camel-2.9.0.csv,0.359455,0.281933,0.354588,0.240935,0.363022,0.275444,0.365244,0.254994,0.364441,0.22031,...,0.34653,0.208159,0.355141,0.234482,0.357613,0.175531,0.360169,0.190442,0.362684,0.239732


In [21]:
# For each noise scenario, flatten its block of columns (all datasets x all
# sampler/classifier pairs) into one 1-D score vector, then print its mean.
dic = {
    noise_type: tmp[noise_type].to_numpy().ravel()
    for noise_type in NOISE_TYPES
}
for noise_type, scores in dic.items():
    print(noise_type, scores.mean())

No 0.4964820945632966
Bug 0.48525796620882905
Non-Bug 0.4615927690560883
Both 0.4367618136000903


In [22]:
# Friedman test across the noise scenarios; each scenario contributes one
# score vector, passed in the dict's insertion order.
friedmanchisquare(*dic.values())

FriedmanchisquareResult(statistic=223.41250000000036, pvalue=3.6729655381200596e-48)

In [23]:
# Score matrix with one column per noise scenario (in NOISE_TYPES order) and
# one row per flattened (dataset, sampler, classifier) entry.
res = np.column_stack([dic[ntype] for ntype in NOISE_TYPES])
res.shape

(192, 4)

In [24]:
# Rank the noise scenarios within every row. Ranking 1 - score means the
# highest score receives rank 1.
ranks = np.empty_like(res)
for row_idx, row_scores in enumerate(res):
    ranks[row_idx] = rankdata(1 - row_scores)

In [25]:
# Average rank of each noise scenario over all rows (columns follow NOISE_TYPES).
np.mean(ranks, axis=0)

array([1.74479167, 1.953125  , 2.81770833, 3.484375  ])