In [10]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import  precision_score
from sklearn.metrics import  f1_score
from statistics import mean
import random

In [11]:
# HUMAN LABELED TEST SET
input_file1 = 'data/samsung_cmp_test_human.csv'
human_test = pd.read_csv(input_file1, error_bad_lines=False)

input_file2 = 'data/samsung_cmp_test.csv'
test = pd.read_csv(input_file2, error_bad_lines=False)

base_df = pd.read_csv("data/new_data_processed.csv")

# Expert Labeler Performance

In [12]:
Y_test = (test[['label']].values.T[0]==3)*1
Y_manual = (human_test[['label']].values.T[0]==3)*1
pred = Y_manual

In [13]:
precision=precision_score(Y_test,pred, pos_label=1) #average="micro")
recall = recall_score(Y_test, pred, pos_label=1) #average="micro")
f1score = f1_score(Y_test, pred, pos_label=1) #average="micro")
minority_recall = recall_score(Y_test, pred, pos_label=0)
minority_precision = precision_score(Y_test, pred, pos_label=0)
minority_f1 = f1_score(Y_test, pred, pos_label=0)
accuracy=accuracy_score(Y_test,pred)

In [14]:
print("\t Acc  | Useful                | Not Useful")
print("\t A    | P,      R,      F1    | P,      R,      F1")
print("Human:\t", round(accuracy*100, 2), " | ", round(precision*100, 2)," ", round(recall*100, 2)," ", round(f1score*100, 2), " | ", round(minority_precision*100, 2)," ", round(minority_recall*100, 2)," ", round(minority_f1*100, 2))

	 Acc  | Useful                | Not Useful
	 A    | P,      R,      F1    | P,      R,      F1
Human:	 61.88  |  86.81   61.72   72.15  |  28.99   62.5   39.6


# Model Performance

In [15]:
feature_names = ['author_file_experience',
 'author_responded',
 'code_word_ratio',
 'dev_commits',
 'gratitude',
 'is_confirmatory_response',
 'is_last_patch',
 'line_change',
 'message_sentiment',
 'num_participate',
 'num_patches',
 'num_prev_comment_same_file',
 'num_review_comment',
 'patch_id',
 'programming_words',
 'question_ratio',
 'readability',
 'rev_commits',
 'review_interval',
 'similarity',
 'status',
 'stop_word_ratio',
 'word_count']

In [16]:
df = pd.read_csv('data/new_data_processed_cmp_rc.csv') # WHOLE DATASET
neat_data = pd.read_csv("data/neat_data_processed.csv")
neat_data['comment_id'] = df['comment_id']
samsung_train_data = pd.read_csv('data/samsung_cmp_train_with_1484.csv') # DATA THAT WAS KEPT SEPARATE FOR TRAININGG
train = pd.merge(neat_data, samsung_train_data['comment_id'], on='comment_id', how='inner')

In [17]:
# WHOLE DATA MINUS TRAIN DATA = TEST DATA
test1 = neat_data[feature_names+['comment_id', 'label']]
test2 = train[feature_names+['comment_id', 'label']]
test = pd.concat([test1, test2])
test = test.drop_duplicates(keep=False)[feature_names+['label']]
train = train[feature_names+['label']]

In [18]:
X_train = train[feature_names]
Y_train = train['label']
X_test = test[feature_names]
Y_test = test['label']

In [20]:
#################[ CLASSIFIERS ]#############


import numpy as np
import pandas as pd
from sklearn import  tree
import  pydotplus

from datetime import datetime

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import  precision_score
from sklearn.metrics import  f1_score
from statistics import mean
import random

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import  RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import  LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier


from imblearn.over_sampling import SMOTE, RandomOverSampler

import warnings
warnings.filterwarnings("ignore")


def normalize_df(df):
    df_norm = (df - df.mean()) / (df.max() - df.min())
    return df_norm

def row_norm(df):
    norm = df/np.sqrt(np.square(df).sum(axis=1))
    return norm

def standardize_df(df):
    a = 1
    df_norm = (df - df.mean(axis=a)) / df.std(axis=a)
    return df_norm

def benchmark_classifier(clf, X_train, Y_train, X_test, Y_test):
    clf.fit(X_train, Y_train)
    pred = clf.predict(X_test)

    precision=precision_score(Y_test,pred, pos_label=1) #average="micro")
    recall = recall_score(Y_test, pred, pos_label=1) #average="micro")
    f1score = f1_score(Y_test, pred, pos_label=1) #average="micro")
    minority_recall = recall_score(Y_test, pred, pos_label=0)
    minority_precision = precision_score(Y_test, pred, pos_label=0)
    minority_f1 = f1_score(Y_test, pred, pos_label=0)
    accuracy=accuracy_score(Y_test,pred)

    return (precision,recall,f1score,accuracy,minority_f1,minority_recall, minority_precision)

dt = tree.DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=100 )
svm = LinearSVC()
nn  = MLPClassifier(alpha=1e-3, hidden_layer_sizes=(500,5), random_state=1)
xgb = XGBClassifier()
log = LogisticRegression()
#lin = LinearRegression()

clfs = [dt, rf, svm, nn, xgb, log]
clf_names = ['Decision Tree', 'Random Forest', 'SVM', 'Neural Network', 'XGBoost', 'Logistic Regression']

# def run_classifier(clf, iteration, resample, normalize):
sampling_model = SMOTE(random_state=None, k_neighbors=15, sampling_strategy=0.65,  n_jobs=4)

X_train_np=np.array(X_train)
Y_train_np=np.array(Y_train)
X_test_np=np.array(X_test)
Y_test_np=np.array(Y_test)

print("\t Acc  | Useful                | Not Useful")
print("\t A    | P,      R,      F1    | P,      R,      F1")

Precision=[]
Recall=[]
Fmean=[]
Accuracy=[]
Minority_recall = []
Minority_precision = []
Minority_F1 = []
  
for (clf, name) in zip(clfs, clf_names):
    for i in range(20):
        X_resampled, Y_resampled = sampling_model.fit_sample(X_train_np, Y_train_np)
        (precision,recall,f1score,accuracy,minority_f1,
         minority_recall,minority_precision)=benchmark_classifier(clf,X_resampled, Y_resampled,X_test_np,Y_test_np)
        
        Precision.append(precision)
        Recall.append(recall)
        Fmean.append(f1score)
        Accuracy.append(accuracy)
        Minority_recall.append(minority_recall)
        Minority_precision.append(minority_precision)
        Minority_F1.append(minority_f1)
        
    (precision,recall,f1score,accuracy,minority_precision,minority_recall,minority_f1) = (
        mean(Precision),
        mean(Recall),
        mean(Fmean),
        mean(Accuracy),
        mean(Minority_precision ),
        mean(Minority_recall ),
        mean(Minority_F1)
    )
    print(name,"&\t", round(accuracy*100, 2), " & ", round(precision*100, 2)," & ", round(recall*100, 2)," & ", 
          round(f1score*100, 2), " & ", round(minority_precision*100, 2)," & ", round(minority_recall*100, 2)," & ", 
          round(minority_f1*100, 2))


	 Acc  | Useful                | Not Useful
	 A    | P,      R,      F1    | P,      R,      F1
Decision Tree &	 77.94  &  88.82  &  82.85  &  85.71  &  46.32  &  58.28  &  51.47
Random Forest &	 80.89  &  88.42  &  87.62  &  87.93  &  54.2  &  53.98  &  53.32
SVM &	 73.43  &  86.67  &  78.57  &  80.9  &  45.45  &  52.86  &  45.93
Neural Network &	 74.75  &  86.15  &  81.36  &  82.45  &  45.66  &  48.32  &  44.33
XGBoost &	 76.28  &  86.48  &  83.2  &  83.79  &  47.91  &  48.59  &  46.06
Logistic Regression &	 75.46  &  86.44  &  82.06  &  83.33  &  45.79  &  49.04  &  45.34
