In [22]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

import warnings
warnings.filterwarnings("ignore")

In [23]:
datasets = [ 
    'poi-1.5.csv', 
    'poi-2.0.csv', 
    'poi-2.5.csv', 
    'poi-3.0.csv',
    'velocity-1.4.csv',
    'velocity-1.5.csv',
    'velocity-1.6.csv',
    'lucene-2.0.csv', 
    'lucene-2.2.csv', 
    'lucene-2.4.csv',
]

In [24]:
def decisionTree(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    #print(X)
    # instantiate the model (using the default parameters)
    dtreeclf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
    
    scores = cross_validate(
        estimator=dtreeclf, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def logisticClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    #print(X)
    # instantiate the model (using the default parameters)
    logreg = LogisticRegression(random_state=16)
    
    scores = cross_validate(
        estimator=logreg, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )
    
    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def naiveBayesClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    
    # instantiate the model (using the default parameters)
    NBclf = GaussianNB()
    scores = cross_validate(
        estimator=NBclf,  # model to evaluate
        X=features,  # input features
        y=label,  # output labels
        cv=10,  # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def randomForest(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    rfclassifier = RandomForestClassifier(random_state=59, n_jobs=-1, max_depth=5,
    n_estimators=100, oob_score=True)
    
    scores = cross_validate(
        estimator=rfclassifier, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'recall'],
    )
    
    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def knnClassifier(X, y):
    X = np.array(X)
    y = np.array(y)
    
    # Check data contiguity
    if not X.flags.c_contiguous:
        X = np.ascontiguousarray(X)
    if not y.flags.c_contiguous:
        y = np.ascontiguousarray(y)
        
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X, y)
    y_pred = knn.predict(X)
    recall = metrics.recall_score(y, y_pred)
    return recall


def svmClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    
    # instantiate the model (using the default parameters)
    svmclf = svm.SVC(kernel='linear', C=1, random_state=0)
    scores = cross_validate(
        estimator=svmclf,  # model to evaluate
        X=features,  # input features
        y=label,  # output labels
        cv=10,  # how many folds
        # list of model evaluation metrics
        scoring=['accuracy','precision' , 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

In [25]:
def exp5(dataset):
    df = pd.read_csv("../dataset/" + dataset)
    df.dropna(inplace=True)
    # print(f"Dataset: {dataset} with shape {df.shape}")
    y = df.iloc[:,-1]
    X = df.iloc[:,:-1]
    
    recall_scores = {
        'Decision Tree': decisionTree(X, y),
        'Logistic Regression': logisticClassifier(X, y),
        'Naive Bayes': naiveBayesClassifier(X, y),
        'Random Forest': randomForest(X, y),
        'K-NN': knnClassifier(X, y),
        'SVM': svmClassifier(X, y)
    }
    
    return recall_scores

In [29]:
result_df = pd.DataFrame()

# Iterate over each dataset
for dataset in datasets:
    recall_scores = exp5(dataset)
    df = pd.DataFrame.from_dict(recall_scores, orient='index', columns=[dataset])
    result_df = pd.concat([result_df, df], axis=1)

result_df = result_df.T
print(result_df.to_string())

                  Decision Tree  Logistic Regression  Naive Bayes  Random Forest      K-NN       SVM
poi-1.5.csv            0.766667             0.815714     0.310952       0.850952  0.801418  0.794286
poi-2.0.csv            1.000000             1.000000     1.000000       1.000000  0.135135  1.000000
poi-2.5.csv            0.887167             0.850167     0.335667       0.879333  0.907258  0.834167
poi-3.0.csv            1.000000             1.000000     1.000000       1.000000  0.903915  1.000000
velocity-1.4.csv       1.000000             1.000000     1.000000       1.000000  0.945578  1.000000
velocity-1.5.csv       0.723810             0.866190     0.311429       0.880000  0.901408  0.844762
velocity-1.6.csv       0.546429             0.471429     0.271429       0.523214  0.602564  0.444643
lucene-2.0.csv         1.000000             0.878889     1.000000       1.000000  0.703297  1.000000
lucene-2.2.csv         0.689524             0.756667     0.400952       0.819524  0.868056 

In [30]:
from tabulate import tabulate
print(tabulate(result_df, headers='keys', tablefmt='grid'))

+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
|                  |   Decision Tree |   Logistic Regression |   Naive Bayes |   Random Forest |     K-NN |      SVM |
| poi-1.5.csv      |        0.766667 |              0.815714 |      0.310952 |        0.850952 | 0.801418 | 0.794286 |
+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
| poi-2.0.csv      |        1        |              1        |      1        |        1        | 0.135135 | 1        |
+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
| poi-2.5.csv      |        0.887167 |              0.850167 |      0.335667 |        0.879333 | 0.907258 | 0.834167 |
+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
| poi-3.0.csv      |        1        |          