In [None]:
%matplotlib widget

In [1]:
import numpy as np,os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from scipy.io import arff
from scipy.stats import spearmanr, pearsonr, linregress

import matplotlib.pyplot as plt, seaborn as sns
from mpl_toolkits.mplot3d import axes3d, Axes3D

from library.utils import evaluate, read_data

## Imbalance & Noise Ratio
+ IR is very for some datasets. Mean, std:14.96,12.39. Top 3: 34,43,56
+ For noise, Mean, std:15.00,7.28
+ Correlation between these two is statistically significant (r = -.52). So when noise is severe, IR isn't, vice-versa. So we don't have to tackle severe version of both at the same time.  
+ Correlation between IR and size is statistically significant (r = +.62). Again, good news. 
+ Very, very weird structure when datasets are plotted in 3d using PCA. No visible structure with tSNE

In [2]:
DATASETS = [f for f in os.listdir("JIRA/") if 'csv' in f]
SHORT = ['groovy-1_5_7.csv','jruby-1.4.0.csv','lucene-2.9.0.csv','jruby-1.7.0.preview1.csv','groovy-1_6_BETA_1.csv',
        'derby-10.2.1.6.csv','wicket-1.5.3.csv','camel-2.9.0.csv','camel-1.4.0.csv','activemq-5.8.0.csv']

In [9]:
df = pd.DataFrame(columns=['size','IR','noise','#bug_n'],index=DATASETS,dtype='float')
tmp = []
for d in DATASETS:
    X,y_noisy,y_real = read_data(d,stats=False)
    tn, fp, fn, tp = confusion_matrix(y_real,y_noisy).ravel()
    print(f"Real:{y_real.sum()}, Heu:{y_noisy.sum()}, Actual % Bugs,Clean: {tp/(tp+fp):.3f},{tn/(tn+fn):.3f}")
    if tp/(tp+fp) > .55:
        print(d,tp/(tp+fp))
        tmp.append(d)
    imb = np.unique(y_noisy,return_counts=True)[1]
    df.loc[d,'size'] = len(X)
    df.loc[d,'IR'] = imb.max()/imb.min()
    df.loc[d,'noise'] = (y_noisy!=y_real).sum()/len(X)
    df.loc[d,'#bug_n'] = y_noisy.sum()
df.to_csv("data_info.csv")

Real:206, Heu:203, Actual % Bugs,Clean: 0.522,0.969
Real:70, Heu:117, Actual % Bugs,Clean: 0.350,0.959
Real:258, Heu:142, Actual % Bugs,Clean: 0.627,0.924
activemq-5.3.0.csv 0.6267605633802817
Real:101, Heu:288, Actual % Bugs,Clean: 0.198,0.968
Real:87, Heu:161, Actual % Bugs,Clean: 0.373,0.953
Real:180, Heu:200, Actual % Bugs,Clean: 0.485,0.893
Real:196, Heu:160, Actual % Bugs,Clean: 0.600,0.845
lucene-2.3.0.csv 0.6
Real:483, Heu:114, Actual % Bugs,Clean: 0.526,0.754
Real:155, Heu:190, Actual % Bugs,Clean: 0.258,0.908
Real:199, Heu:200, Actual % Bugs,Clean: 0.215,0.977
Real:105, Heu:93, Actual % Bugs,Clean: 0.172,0.964
Real:107, Heu:331, Actual % Bugs,Clean: 0.154,0.977
Real:76, Heu:103, Actual % Bugs,Clean: 0.456,0.963
Real:219, Heu:154, Actual % Bugs,Clean: 0.461,0.922
Real:26, Heu:80, Actual % Bugs,Clean: 0.263,0.993
Real:383, Heu:91, Actual % Bugs,Clean: 0.462,0.784
Real:192, Heu:200, Actual % Bugs,Clean: 0.460,0.988
Real:87, Heu:163, Actual % Bugs,Clean: 0.276,0.971
Real:176, Heu

In [8]:
tmp

['activemq-5.3.0.csv',
 'lucene-2.3.0.csv',
 'derby-10.2.1.6.csv',
 'derby-10.3.1.4.csv',
 'activemq-5.0.0.csv',
 'derby-10.5.1.1.csv',
 'hive-0.9.0.csv']

In [None]:
sns.pairplot(df);

In [None]:
pearsonr(df['IR'],df['size']),spearmanr(df['IR'],df['size']),linregress(df['IR'],df['size'])

## Visualization

In [None]:
X,y_noisy,y_real = read_data(DATASETS[15],stats=True)

In [None]:
pca = PCA(n_components=3).fit(X)
Xp = pca.transform(X)
pca.explained_variance_ratio_.sum(),pca.explained_variance_ratio_

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Xp[:,0],Xp[:,1],Xp[:,2],c=y_real);

In [None]:
yp = cross_val_predict(RandomForestClassifier(n_estimators=100),X,y_real,cv=10)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Xp[:,0],Xp[:,1],Xp[:,2],c=yp);

In [None]:
Xs = TSNE(n_components=3,perplexity=10).fit_transform(X)

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Xs[:,0],Xs[:,1],Xs[:,2],c=y_real);

In [None]:
path = "defect_pred'13/Original/AEEEM/PDE.arff"
df = pd.DataFrame(arff.loadarff(path)[0])
label = 'class'
enc = LabelEncoder().fit(df[label])
df[label] = enc.transform(df[label])
df.shape,np.unique(df.dtypes,return_counts=True),df.isna().sum().sum()

In [None]:
np.unique(df[label],return_counts=True)

In [None]:
scaled = pd.DataFrame(StandardScaler().fit_transform(df.drop(columns=[label])))
scaled[label] = df[label]
scaled.shape

In [None]:
log = pd.DataFrame(np.log1p(df.drop(columns=[label]).values))
log[label] = df[label]

In [None]:
fig = plt.figure(figsize = (18,20))
ax = fig.gca()
df.hist(ax = ax);

In [None]:
X = log.drop(columns=[label]).values
Y = df[label].values
X,Y = shuffle(X,Y)

In [None]:
dt = GridSearchCV(DecisionTreeClassifier(),{'max_leaf_nodes':[10,50,None]},cv=4,iid=False)
rf = RandomForestClassifier(n_estimators=500)
svm = GridSearchCV(SVC(gamma='scale'),{'C':[0.1, 1, 10]},cv=4,iid=False)
knn = GridSearchCV(KNeighborsClassifier(),{'n_neighbors':[3,5,10,20]},cv=4,iid=False)

In [None]:
cv = RepeatedStratifiedKFold(n_repeats=5,n_splits=10)
res = cross_val_score(knn,X,Y,cv=cv,scoring='f1',n_jobs=-1)
res.mean(),res.std(), res

In [None]:
a = np.arange(15).reshape(5,3)
a[:,:-2]