In [1]:
import pickle as pkl
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, r2_score
from datetime import timedelta
import time
import csv
from sklearnex import patch_sklearn
patch_sklearn()

X_train = pkl.load(open('CleanDataset/X_train.pkl', 'rb'))
y_train = pkl.load(open('CleanDataset/y_train.pkl', 'rb'))

def test_model(model, X_train=X_train, y_train=y_train):
    cv = KFold(n_splits = 4, shuffle=True, random_state=1)
    acc = make_scorer(accuracy_score)
    prc = make_scorer(precision_score)
    rec = make_scorer(recall_score)
    auc = make_scorer(roc_auc_score)
    f1 = make_scorer(f1_score)
    r2 = make_scorer(r2_score)

    acc_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring=acc)
    prc_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring=prc)
    rec_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring=rec)
    auc_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring=auc)
    f1_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring =f1)
    r2_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring=r2)
    score = [acc_val_score.mean(), prc_val_score.mean(), rec_val_score.mean(), auc_val_score.mean(),\
             f1_val_score.mean(), r2_val_score.mean()]
    return score

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
def insert_csv(model_name, result, duration):
    msg = [{'Model':f'{model_name}','Training time': f"{duration}",'Accuracy': f"{round(result[0],4)*100}", \
    'Precision': f'{round(result[1],4)*100}', 'Recall': f'{round(result[2],4)*100}', 'AUC': f'{round(result[3],4)*100}',\
    'F1': f'{round(result[4],4)*100}', 'R2': f'{round(result[5],3)}'}]

    with open('record_result', 'a', newline='\n') as csvfile:
        fieldnames = ['Model', 'Training time', 'Accuracy', 'Precision', 'Recall', 'AUC', 'F1', 'R2']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(msg)

In [3]:
X_train

array([[ 0.8948647 ,  0.50373453, -0.59467758, ..., -1.55668752,
         0.42370398,  0.16459136],
       [-1.11748736, -1.16937726,  1.6815835 , ..., -1.55668752,
         0.42370398,  0.16459136],
       [ 0.8948647 ,  0.92201247, -0.59467758, ...,  0.64238968,
        -1.34400414,  0.16459136],
       ...,
       [ 0.8948647 ,  1.41000341, -0.59467758, ...,  0.64238968,
        -1.34400414, -6.07565293],
       [-1.11748736, -0.33282137, -0.59467758, ...,  0.64238968,
        -0.46015008,  0.16459136],
       [ 0.8948647 ,  0.01574359, -0.59467758, ...,  0.64238968,
        -0.46015008,  0.16459136]])

## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression as LogReg
LR = LogReg()
start = time.perf_counter()
ResultLogReg = test_model(LR)
print(ResultLogReg)
duration = timedelta(seconds=time.perf_counter()-start)

insert_csv('Logistic Regression', ResultLogReg, duration)

[0.7844819563815715, 0.7070601203312665, 0.9714277227548006, 0.7844817205135293, 0.8184229395178693, 0.13791040809997762]


## Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200)
start = time.perf_counter()
ResultRF = test_model(clf)
print(ResultRF)
duration = timedelta(seconds=time.perf_counter()-start)

insert_csv('Random Forest', ResultRF, duration)

[0.8133371221940827, 0.7567761887540185, 0.923143471889857, 0.8131861205247757, 0.8316842208990151, 0.25316722483054677]


## XGBoost

In [6]:
import xgboost
xgb = xgboost.XGBClassifier()
start = time.perf_counter()
ResultXGB = test_model(xgb)
print(ResultXGB)
duration = timedelta(seconds=time.perf_counter()-start)

insert_csv('XGBoost', ResultXGB, duration)

[0.8057294886367412, 0.7460462352778905, 0.9270041284841634, 0.8057287023792248, 0.8267378479143874, 0.22290225319865128]


## Support Vector Machine (SVM)

In [7]:
from sklearn.svm import SVC
svc = SVC()
start = time.perf_counter()
ResultSVM = test_model(svc)
print(ResultSVM)
duration = timedelta(seconds=time.perf_counter()-start)

insert_csv('SVM', ResultSVM, duration)

## Sklearn MLP

In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=None, max_iter=75, activation='logistic', learning_rate='adaptive', early_stopping=True)
start = time.perf_counter()
ResultMLP = test_model(mlp)
print(ResultMLP)
duration = timedelta(seconds=time.perf_counter()-start)

insert_csv('MLP', ResultMLP, duration)