In [52]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

In [54]:
datasets = [ 
    'poi-1.5.csv', 
    'poi-2.0.csv', 
    'poi-2.5.csv', 
    'poi-3.0.csv',
    'velocity-1.4.csv',
    'velocity-1.5.csv',
    'velocity-1.6.csv',
    'lucene-2.0.csv', 
    'lucene-2.2.csv', 
    'lucene-2.4.csv',
]

In [70]:
def decisionTree(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    #print(X)
    # instantiate the model (using the default parameters)
    dtreeclf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
    
    scores = cross_validate(
        estimator=dtreeclf, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def logisticClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    #print(X)
    # instantiate the model (using the default parameters)
    logreg = LogisticRegression(random_state=16)
    
    scores = cross_validate(
        estimator=logreg, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )
    
    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def naiveBayesClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    
    # instantiate the model (using the default parameters)
    NBclf = GaussianNB()
    scores = cross_validate(
        estimator=NBclf,  # model to evaluate
        X=features,  # input features
        y=label,  # output labels
        cv=10,  # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def randomForest(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    rfclassifier = RandomForestClassifier(random_state=59, n_jobs=-1, max_depth=5,
    n_estimators=100, oob_score=True)
    
    scores = cross_validate(
        estimator=rfclassifier, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'recall'],
    )
    
    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def knnClassifier(X, y):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    y_pred = knn.predict(X)
    recall = recall_score(y, y_pred)
    return recall


def svmClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    
    # instantiate the model (using the default parameters)
    svmclf = svm.SVC(kernel='linear', C=1, random_state=0)
    scores = cross_validate(
        estimator=svmclf,  # model to evaluate
        X=features,  # input features
        y=label,  # output labels
        cv=10,  # how many folds
        # list of model evaluation metrics
        scoring=['accuracy','precision' , 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

In [71]:
def exp5(dataset):
    df = pd.read_csv("../dataset/" + dataset)
    df.dropna(how='all', inplace=True)
    print(f"Dataset: {dataset} with shape {df.shape}")
    y = df.iloc[:,-1]
    X = df.iloc[:,:-1]
    
    recall_scores = {
        'Decision Tree': decisionTree(X, y),
        'Logistic Regression': logisticClassifier(X, y),
        'Naive Bayes': naiveBayesClassifier(X, y),
        'Random Forest': randomForest(X, y),
        'K-NN': knnClassifeer(X, y),
        'SVM': svmClassifier(X, y)
    }
    
    return recall_scores

In [72]:
# recall_table = pd.DataFrame(columns=['Dataset'] + list(recall_scores.keys()))

for dataset in datasets:
    recall_scores = exp5(dataset)
    print(recall_scores)
    # recall_scores['Dataset'] = dataset
    # recall_table = recall_table.append(recall_scores, ignore_index=True)

# recall_table.set_index('Dataset', inplace=True)
# print(recall_table)

Dataset: poi-1.5.csv with shape (237, 21)
{'Decision Tree': 0.7666666666666666, 'Logistic Regression': 0.8157142857142856, 'Naive Bayes': 0.3109523809523809, 'Random Forest': 0.8509523809523809, 'K-NN': nan, 'SVM': 0.7942857142857143}
Dataset: poi-2.0.csv with shape (314, 22)
{'Decision Tree': 1.0, 'Logistic Regression': 1.0, 'Naive Bayes': 1.0, 'Random Forest': 1.0, 'K-NN': nan, 'SVM': 1.0}
Dataset: poi-2.5.csv with shape (385, 21)
{'Decision Tree': 0.8788333333333334, 'Logistic Regression': 0.8501666666666667, 'Naive Bayes': 0.33566666666666667, 'Random Forest': 0.8793333333333333, 'K-NN': nan, 'SVM': 0.8341666666666667}
Dataset: poi-3.0.csv with shape (442, 22)
{'Decision Tree': 1.0, 'Logistic Regression': 1.0, 'Naive Bayes': 1.0, 'Random Forest': 1.0, 'K-NN': nan, 'SVM': 1.0}
Dataset: velocity-1.4.csv with shape (196, 22)
{'Decision Tree': 1.0, 'Logistic Regression': 1.0, 'Naive Bayes': 1.0, 'Random Forest': 1.0, 'K-NN': nan, 'SVM': 1.0}
Dataset: velocity-1.5.csv with shape (214, 2

In [66]:
df = pd.read_csv("../dataset/" + datasets[0])
df.dropna(how='all', inplace=True)
print(f"Dataset: {datasets[0]} with shape {df.shape}")
y = df.iloc[:,-1]
X = df.iloc[:,:-1]

recall_scores = {
    # 'Decision Tree': decisionTree(X, y),
    # 'Logistic Regression': logisticClassifier(X, y),
    # 'Naive Bayes': naiveBayesClassifier(X, y),
    # 'Random Forest': randomForest(X, y),
    'K-NN': knnClassifeer(X, y),
    # 'SVM': svmClassifier(X, y)
}
print(recall_scores)

Dataset: poi-1.5.csv with shape (237, 21)
{'K-NN': nan}
