In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

import warnings
warnings.filterwarnings("ignore")

In [2]:
datasets = [ 
    'poi-1.5.csv', 
    'poi-2.0.csv', 
    'poi-2.5.csv', 
    'poi-3.0.csv',
    'velocity-1.4.csv',
    'velocity-1.5.csv',
    'velocity-1.6.csv',
    'lucene-2.0.csv', 
    'lucene-2.2.csv', 
    'lucene-2.4.csv',
]

In [3]:
def decisionTree(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    #print(X)
    # instantiate the model (using the default parameters)
    dtreeclf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
    
    scores = cross_validate(
        estimator=dtreeclf, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def logisticClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    #print(X)
    # instantiate the model (using the default parameters)
    logreg = LogisticRegression(random_state=16)
    
    scores = cross_validate(
        estimator=logreg, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )
    
    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def naiveBayesClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    
    # instantiate the model (using the default parameters)
    NBclf = GaussianNB()
    scores = cross_validate(
        estimator=NBclf,  # model to evaluate
        X=features,  # input features
        y=label,  # output labels
        cv=10,  # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'precision', 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def randomForest(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    rfclassifier = RandomForestClassifier(random_state=59, n_jobs=-1, max_depth=5,
    n_estimators=100, oob_score=True)
    
    scores = cross_validate(
        estimator=rfclassifier, # model to evaluate
        X=features, # inputs features
        y=label, # output labels
        cv=10, # how many folds
        # list of model evaluation metrics
        scoring=['accuracy', 'recall'],
    )
    
    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

def knnClassifier(X, y):
    X = np.array(X)
    y = np.array(y)
    
    # Check data contiguity
    if not X.flags.c_contiguous:
        X = np.ascontiguousarray(X)
    if not y.flags.c_contiguous:
        y = np.ascontiguousarray(y)
        
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X, y)
    y_pred = knn.predict(X)
    recall = metrics.recall_score(y, y_pred)
    return recall


def svmClassifier(X,y):
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    X=pd.DataFrame(X)
    features=X
    label=y
    
    # instantiate the model (using the default parameters)
    svmclf = svm.SVC(kernel='linear', C=1, random_state=0)
    scores = cross_validate(
        estimator=svmclf,  # model to evaluate
        X=features,  # input features
        y=label,  # output labels
        cv=10,  # how many folds
        # list of model evaluation metrics
        scoring=['accuracy','precision' , 'recall'],
    )

    scores = pd.DataFrame(scores)
    scores.round(4)
    scores.mean().round(4)
    return scores.mean()['test_recall']

In [4]:
def exp5(dataset):
    df = pd.read_csv("../dataset/" + dataset)
    df.dropna(inplace=True)
    # print(f"Dataset: {dataset} with shape {df.shape}")
    y = df.iloc[:,-1]
    X = df.iloc[:,:-1]
    
    recall_scores = {
        'Decision Tree': decisionTree(X, y),
        'Logistic Regression': logisticClassifier(X, y),
        'Naive Bayes': naiveBayesClassifier(X, y),
        'Random Forest': randomForest(X, y),
        'K-NN': knnClassifier(X, y),
        'SVM': svmClassifier(X, y)
    }
    
    return recall_scores

In [5]:
result_df = pd.DataFrame()

# Iterate over each dataset
for dataset in datasets:
    recall_scores = exp5(dataset)
    df = pd.DataFrame.from_dict(recall_scores, orient='index', columns=[dataset])
    result_df = pd.concat([result_df, df], axis=1)

result_df = result_df.T
print(result_df.to_string())

                  Decision Tree  Logistic Regression  Naive Bayes  Random Forest      K-NN       SVM
poi-1.5.csv            0.766667             0.815714     0.310952       0.850952  0.801418  0.794286
poi-2.0.csv            1.000000             1.000000     1.000000       1.000000  0.135135  1.000000
poi-2.5.csv            0.878833             0.850167     0.335667       0.879333  0.907258  0.834167
poi-3.0.csv            1.000000             1.000000     1.000000       1.000000  0.903915  1.000000
velocity-1.4.csv       1.000000             1.000000     1.000000       1.000000  0.945578  1.000000
velocity-1.5.csv       0.723810             0.866190     0.311429       0.880000  0.901408  0.844762
velocity-1.6.csv       0.546429             0.471429     0.271429       0.523214  0.602564  0.444643
lucene-2.0.csv         1.000000             0.878889     1.000000       1.000000  0.703297  1.000000
lucene-2.2.csv         0.696190             0.756667     0.400952       0.819524  0.868056 

In [6]:
from tabulate import tabulate
print(tabulate(result_df, headers='keys', tablefmt='grid'))

+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
|                  |   Decision Tree |   Logistic Regression |   Naive Bayes |   Random Forest |     K-NN |      SVM |
| poi-1.5.csv      |        0.766667 |              0.815714 |      0.310952 |        0.850952 | 0.801418 | 0.794286 |
+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
| poi-2.0.csv      |        1        |              1        |      1        |        1        | 0.135135 | 1        |
+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
| poi-2.5.csv      |        0.878833 |              0.850167 |      0.335667 |        0.879333 | 0.907258 | 0.834167 |
+------------------+-----------------+-----------------------+---------------+-----------------+----------+----------+
| poi-3.0.csv      |        1        |          

In [7]:
from scipy.stats import mannwhitneyu

uims_df = pd.read_csv("../dataset/UIMS.csv")
# print(uims_df)
dit = uims_df['dit'].tolist()
noc = uims_df['noc'].tolist()

stat, p_value = mannwhitneyu(dit, noc)
print(f"Mann-Whitney U-test statistic: {stat}, p-value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("There is a significant difference between DIT and NOC.")
else:
    print("There is no significant difference between DIT and NOCs.")

Mann-Whitney U-test statistic: 1221.0, p-value: 1.6140451335993335e-06
There is a significant difference between DIT and NOC.


In [8]:
from scipy.stats import wilcoxon

logistic_regression = result_df['Logistic Regression'].values
knn = result_df['K-NN'].values

diff = logistic_regression - knn
abs_diff = np.abs(diff)
ranks = pd.Series(abs_diff).rank()

wilcoxon_result_df = pd.DataFrame({
    'Dataset': result_df.index,
    'ML1 Logistic Regression (Recall)': logistic_regression,
    'ML2 K-NN Classifier (Recall)': knn,
    'di': diff,
    'Abs(di)': abs_diff,
    'rank': ranks
})

stat, p_value = wilcoxon(logistic_regression, knn)

print(f"Wilcoxon signed-rank test statistic: {stat}, p-value: {p_value}")
print(wilcoxon_result_df)

Wilcoxon signed-rank test statistic: 21.0, p-value: 0.556640625
            Dataset  ML1 Logistic Regression (Recall)  \
0       poi-1.5.csv                          0.815714   
1       poi-2.0.csv                          1.000000   
2       poi-2.5.csv                          0.850167   
3       poi-3.0.csv                          1.000000   
4  velocity-1.4.csv                          1.000000   
5  velocity-1.5.csv                          0.866190   
6  velocity-1.6.csv                          0.471429   
7    lucene-2.0.csv                          0.878889   
8    lucene-2.2.csv                          0.756667   
9    lucene-2.4.csv                          0.971190   

   ML2 K-NN Classifier (Recall)        di   Abs(di)  rank  
0                      0.801418  0.014296  0.014296   1.0  
1                      0.135135  0.864865  0.864865  10.0  
2                      0.907258 -0.057091  0.057091   4.0  
3                      0.903915  0.096085  0.096085   5.0  
4       

In [10]:
print(tabulate(wilcoxon_result_df, headers='keys', tablefmt='grid'))

+----+------------------+------------------------------------+--------------------------------+------------+-----------+--------+
|    | Dataset          |   ML1 Logistic Regression (Recall) |   ML2 K-NN Classifier (Recall) |         di |   Abs(di) |   rank |
|  0 | poi-1.5.csv      |                           0.815714 |                       0.801418 |  0.0142958 | 0.0142958 |      1 |
+----+------------------+------------------------------------+--------------------------------+------------+-----------+--------+
|  1 | poi-2.0.csv      |                           1        |                       0.135135 |  0.864865  | 0.864865  |     10 |
+----+------------------+------------------------------------+--------------------------------+------------+-----------+--------+
|  2 | poi-2.5.csv      |                           0.850167 |                       0.907258 | -0.0570914 | 0.0570914 |      4 |
+----+------------------+------------------------------------+----------------------------

In [9]:
from scipy.stats import friedmanchisquare

decision_tree = result_df['Decision Tree'].tolist()
logistic_regression = result_df['Logistic Regression'].tolist()
naive_bayes = result_df['Naive Bayes'].tolist()
random_forest = result_df['Random Forest'].tolist()
knn = result_df['K-NN'].tolist()
svm = result_df['SVM'].tolist()

stat, p_value = friedmanchisquare(decision_tree, logistic_regression, naive_bayes, random_forest, knn, svm)
print(f"Friedman test statistic: {stat}, p-value: {p_value}")

alpha = 0.05
if p_value < alpha: 
    print('Reject Null Hypothesis (At least two classifiers have significantly different performance)') 
else: 
    print('Do not Reject Null Hypothesis (There is no difference in the performance among 6 classifiers. )')

Friedman test statistic: 6.962962962962957, p-value: 0.2234094054231663
Do not Reject Null Hypothesis (There is no difference in the performance among 6 classifiers. )
