In [1]:
import numpy as np
import pandas as pd,os
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedStratifiedKFold, GridSearchCV, ShuffleSplit
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, roc_auc_score
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier
import matplotlib.pyplot as plt
from helpers import RobustKNN
from robust import RobustLSB, WeightedBagging, kDN, robust_kDN

In [2]:
df = pd.read_csv("wicket-1.5.3.csv")
df.shape

(2578, 70)

In [3]:
file = """AvgCyclomatic, AvgCyclomaticModified, AvgCyclomaticStrict, AvgEssential, AvgLine, AvgLineBlank, AvgLineCode, AvgLineComment, CountDeclClass, CountDeclClassMethod, CountDeclClassVariable, CountDeclFunction, CountDeclInstanceMethod,
CountDeclInstanceVariable, CountDeclMethod, CountDeclMethodDefault, CountDeclMethodPrivate, CountDeclMethodProtected,
CountDeclMethodPublic, CountLine, CountLineBlank, CountLineCode, CountLineCodeDecl, CountLineCodeExe, CountLineComment, CountSemicolon, CountStmt, CountStmtDecl, CountStmtExe, MaxCyclomatic, MaxCyclomaticModified, MaxCyclomaticStrict, RatioCommentToCode, SumCyclomatic, SumCyclomaticModified, SumCyclomaticStrict, SumEssential"""
cls = """CountClassBase, CountClassCoupled, CountClassDerived, MaxInheritanceTree, PercentLackOfCohesion"""
meth_prefix = ["CountInput","CountOutput","CountPath","MaxNesting"]

In [4]:
file_metrics = [c.strip() for c in file.split(',')]
cls_metrics = [c.strip() for c in cls.split(',')]
meth_metrics = [c for c in df.columns for m in meth_prefix if m in c]
code_metrics = set(file_metrics) | set(cls_metrics) | set(meth_metrics)
process_metrics = ["COMM","Added_lines","Del_lines","ADEV","DDEV"]
own_metrics = ["OWN_LINE","OWN_COMMIT","MINOR_LINE","MINOR_COMMIT","MAJOR_COMMIT","MAJOR_LINE"]
all_metrics = set(code_metrics) | set(own_metrics) | set(process_metrics)
len(all_metrics)

65

In [5]:
meth_metrics

['CountInput_Max',
 'CountInput_Mean',
 'CountInput_Min',
 'CountOutput_Max',
 'CountOutput_Mean',
 'CountOutput_Min',
 'CountPath_Max',
 'CountPath_Mean',
 'CountPath_Min',
 'MaxNesting_Max',
 'MaxNesting_Mean',
 'MaxNesting_Min']

In [6]:
FP = (df.HeuBug==True) & (df.RealBug==False)
FN = (df.HeuBug==False) & (df.RealBug==True)

In [7]:
tot = (df.HeuBug!=df.RealBug).sum()
FP.sum(),FN.sum(),FP.sum()+FN.sum(),tot,tot/len(df)

(77, 89, 166, 166, 0.0643910007757952)

In [8]:
X = df[all_metrics].values.astype('float32')
y_noisy = df.HeuBug.values.astype('int8')
y_real = df.RealBug.values.astype('int8')
np.unique(y_noisy,return_counts=True)

(array([0, 1], dtype=int8), array([2485,   93]))

## Original Performance

In [9]:
def evaluate(clf,X,y_noisy,y_real,cv):
    scores = defaultdict(list)
    for train_id, test_id in cv.split(X,y_noisy):
        clf = clf.fit(X[train_id],y_noisy[train_id])
        pred = clf.predict(X[test_id])
        scores['auc'].append(roc_auc_score(pred,y_real[test_id]))
        scores['f1'].append(f1_score(pred,y_real[test_id]))
        #print(scores['auc'][-1],scores['f1'][-1])
    scores['auc'] = np.array(scores['auc'])
    scores['f1'] = np.array(scores['f1'])
    return scores['auc'].mean(),scores['auc'].std(),scores['f1'].mean(),scores['f1'].std()

In [13]:
%%time
rf = RandomForestClassifier(n_estimators=500,n_jobs=-1)
cv = RepeatedStratifiedKFold(n_repeats=5,n_splits=3,random_state=42)
print(evaluate(rf,X,y_noisy,y_real,cv))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

## Performance After Imbalance-learning

In [14]:
cv = RepeatedStratifiedKFold(n_repeats=2,n_splits=10,random_state=42)

In [None]:
%%time
bal_rf = BalancedRandomForestClassifier(n_estimators=500,n_jobs=-1)
print(evaluate(bal_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
bal_rf = BalancedRandomForestClassifier(n_estimators=500,sampling_strategy='not majority',n_jobs=-1)
print(evaluate(bal_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
sm_rf = make_pipeline(SMOTE(),RandomForestClassifier(n_estimators=500,n_jobs=-1))
print(evaluate(sm_rf,X,y_noisy,y_real,cv))

## Handle Dataset Noise

In [None]:
%%time
rf = GridSearchCV(RobustKNN(n_estimators=500,n_jobs=-1),{'K':[1,5,10,20]},iid=False,cv=ShuffleSplit(n_splits=2,test_size=.3))
rob_rf = make_pipeline(SMOTE(),rf)
print(evaluate(sm_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
wb = GridSearchCV(WeightedBagging(kDN,n_estimators=500,n_jobs=-1),{'K':[1,5,10,20]},iid=False,cv=ShuffleSplit(n_splits=2,test_size=.33))
rob_wb = make_pipeline(SMOTE(),wb)
print(evaluate(sm_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
wb = GridSearchCV(WeightedBagging(robust_kDN,n_estimators=500,n_jobs=-1),{'K':[1,5,10,20]},iid=False,cv=ShuffleSplit(n_splits=2,test_size=.33))
rob_wb = make_pipeline(SMOTE(),wb)
print(evaluate(sm_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
wb = GridSearchCV(RobustLSB(kDN,n_estimators=500,n_jobs=-1),{'K':[1,5,10,20]},iid=False,cv=ShuffleSplit(n_splits=2,test_size=.33))
rob_wb = make_pipeline(SMOTE(),wb)
print(evaluate(sm_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
wb = GridSearchCV(RobustLSB(robust_kDN,n_estimators=500,n_jobs=-1),{'K':[1,5,10,20]},iid=False,cv=ShuffleSplit(n_splits=2,test_size=.33))
rob_wb = make_pipeline(SMOTE(),wb)
print(evaluate(sm_rf,X,y_noisy,y_real,cv))

In [None]:
%%time
wb = GridSearchCV(RobustLSB(robust_kDN,n_estimators=1000,n_jobs=-1),{'K':[1,5,10,20]},iid=False,cv=ShuffleSplit(n_splits=2,test_size=.33))
rob_wb = make_pipeline(SMOTE(),wb)
print(evaluate(sm_rf,X,y_noisy,y_real,cv))