In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, roc_auc_score, auc, plot_roc_curve, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold

In [2]:
inner_fold = 4
outer_fold = 5

In [3]:
data = pd.read_csv("./../result/data_created.csv")
groups = data.PID
X = data.loc[:, ["mvm", "sdvm", "df", "p625", "fpdf", "mangle", "sdangle"]]
y = data["ActivityNumber"]

In [4]:
# nested cross validation for Random Forest model
# nested cross validation for Random Forest model
# configure the cross-validation procedure
inner_cv = GroupKFold(n_splits=inner_fold)
outer_cv = GroupKFold(n_splits=outer_fold)

# define the model
model = RandomForestClassifier(random_state=1)
params = {
    'n_estimators' : [50, 100, 150, 200],
    'max_features' : ["auto", "sqrt", "log2"],
    'min_samples_leaf' : [1, 3, 5]
}

acc = []
f1 = []
precision = []
recall = []
auc_score = []


for train_index, test_index in outer_cv.split(X, y, groups=groups):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    grid = GridSearchCV(estimator=model,
                        param_grid=params,
                        cv=inner_cv,
                        scoring="accuracy",
                        refit=True,
                        verbose=1,
                        n_jobs=-1)
    
    grid.fit(x_train, y_train, groups=groups[train_index])
    prediction = grid.predict(x_test)
    
    _acc = accuracy_score(y_test, prediction)
    _f1 = f1_score(y_test, prediction, average="macro")
    _precision = precision_score(y_test, prediction, average="macro")
    _recall = recall_score(y_test, prediction, average="macro")
    #_auc = roc_auc_score(y_test, prediction, multi_class="ovr", average="macro")
    
    acc.append(_acc)
    f1.append(_f1)
    precision.append(_precision)
    recall.append(_recall)
    #auc_score.append(_auc)

print("Average accuracy on test set: ", np.mean(acc))
print("Average F1-Score on test set: ", np.mean(f1))
print("Average precision on test set: ", np.mean(precision))
print("Average recall on test set: ", np.mean(recall))
#print("Average auc on test set: ", np.mean(auc_score))

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   40.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   27.2s finished


Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   24.8s finished


Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   28.2s finished


Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   31.4s finished


Average accuracy on test set:  0.7924388185654009
Average F1-Score on test set:  0.7841471636703927
Average precision on test set:  0.806282801477848
Average recall on test set:  0.7858590576090577


In [5]:
# Logistic Regression
# configure the cross-validation procedure
inner_cv = GroupKFold(n_splits=inner_fold)
outer_cv = GroupKFold(n_splits=outer_fold)

model = Pipeline([('normalizer', StandardScaler()),  # normalize data
                  ('clf', LogisticRegression(random_state=1))  # fit Logistic regression model
])


params = {
    'clf__solver' : ['newton-cg', 'lbfgs', 'liblinear'],
    'clf__penalty' : ["l2"],
    'clf__C' : [100, 10, 1.0, 0.1, 0.01]
}

acc = []
f1 = []
precision = []
recall = []
auc_score = []


for train_index, test_index in outer_cv.split(X, y, groups=groups):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    grid = GridSearchCV(estimator=model,
                        param_grid=params,
                        cv=inner_cv,
                        scoring="accuracy",
                        refit=True,
                        verbose=1,
                        n_jobs=-1)
    
    grid.fit(x_train, y_train, groups=groups[train_index])
    prediction = grid.predict(x_test)
    
    _acc = accuracy_score(y_test, prediction)
    _f1 = f1_score(y_test, prediction, average="macro")
    _precision = precision_score(y_test, prediction, average="macro")
    _recall = recall_score(y_test, prediction, average="macro")
    #_auc = roc_auc_score(y_test, prediction, multi_class="ovr", average="macro")
    
    acc.append(_acc)
    f1.append(_f1)
    precision.append(_precision)
    recall.append(_recall)
    #auc_score.append(_auc)

print("Average accuracy on test set: ", np.mean(acc))
print("Average F1-Score on test set: ", np.mean(f1))
print("Average precision on test set: ", np.mean(precision))
print("Average recall on test set: ", np.mean(recall))
#print("Average auc on test set: ", np.mean(auc_score))

Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.8s finished


Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    2.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.4s finished


Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.6s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.7s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 15 candidates, totalling 60 fits
Average accuracy on test set:  0.7653670886075951
Average F1-Score on test set:  0.7571441760631814
Average precision on test set:  0.7863828622358033
Average recall on test set:  0.7658183483183483


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.6s finished


In [7]:
# Support Vector Machine
# configure the cross-validation procedure
inner_cv = GroupKFold(n_splits=inner_fold)
outer_cv = GroupKFold(n_splits=outer_fold)

# define the model
model = Pipeline([('normalizer', StandardScaler()),  # normalize data
                  ('clf', SVC(random_state=1))  # fit Logistic regression model
])


params = {
    'clf__kernel' : ['poly', 'rbf', 'sigmoid'],
    'clf__gamma' : ["scale"],
    'clf__C' : [50, 10, 1.0, 0.1, 0.01]
}

acc = []
f1 = []
precision = []
recall = []
auc_score = []


for train_index, test_index in outer_cv.split(X, y, groups=groups):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    grid = GridSearchCV(estimator=model,
                        param_grid=params,
                        cv=inner_cv,
                        scoring="accuracy",
                        refit=True,
                        verbose=1,
                        n_jobs=-1)
    
    grid.fit(x_train, y_train, groups=groups[train_index])
    prediction = grid.predict(x_test)
    
    _acc = accuracy_score(y_test, prediction)
    _f1 = f1_score(y_test, prediction, average="macro")
    _precision = precision_score(y_test, prediction, average="macro")
    _recall = recall_score(y_test, prediction, average="macro")
    #_auc = roc_auc_score(y_test, prediction, multi_class="ovr", average="macro")
    
    acc.append(_acc)
    f1.append(_f1)
    precision.append(_precision)
    recall.append(_recall)
    #auc_score.append(_auc)

print("Average accuracy on test set: ", np.mean(acc))
print("Average F1-Score on test set: ", np.mean(f1))
print("Average precision on test set: ", np.mean(precision))
print("Average recall on test set: ", np.mean(recall))
#print("Average auc on test set: ", np.mean(auc_score))

Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 15 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 15 candidates, totalling 60 fits
Average accuracy on test set:  0.842295358649789
Average F1-Score on test set:  0.8261132154525281
Average precision on test set:  0.8550501443001443
Average recall on test set:  0.8329662004662005


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    1.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.2s finished


In [9]:
# Decision Tree
# configure the cross-validation procedure
inner_cv = GroupKFold(n_splits=inner_fold)
outer_cv = GroupKFold(n_splits=outer_fold)

# define the model
model = DecisionTreeClassifier(random_state=1)

params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [2,4,6,8,10,12],
    'min_samples_leaf' : [6, 8]
}

for train_index, test_index in outer_cv.split(X, y, groups=groups):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    grid = GridSearchCV(estimator=model,
                        param_grid=params,
                        cv=inner_cv,
                        scoring="accuracy",
                        refit=True,
                        verbose=1,
                        n_jobs=-1)
    
    grid.fit(x_train, y_train, groups=groups[train_index])
    prediction = grid.predict(x_test)
    
    _acc = accuracy_score(y_test, prediction)
    _f1 = f1_score(y_test, prediction, average="macro")
    _precision = precision_score(y_test, prediction, average="macro")
    _recall = recall_score(y_test, prediction, average="macro")
    #_auc = roc_auc_score(y_test, prediction, multi_class="ovr", average="macro")
    
    acc.append(_acc)
    f1.append(_f1)
    precision.append(_precision)
    recall.append(_recall)
    #auc_score.append(_auc)

print("Average accuracy on test set: ", np.mean(acc))
print("Average F1-Score on test set: ", np.mean(f1))
print("Average precision on test set: ", np.mean(precision))
print("Average recall on test set: ", np.mean(recall))
#print("Average auc on test set: ", np.mean(auc_score))

Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 24 candidates, totalling 96 fits
Average accuracy on test set:  0.7409742999616419
Average F1-Score on test set:  0.7223441249719699
Average precision on test set:  0.7537542634133542
Average recall on test set:  0.7323269911906276


[Parallel(n_jobs=-1)]: Done  89 out of  96 | elapsed:    0.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.7s finished
  _warn_prf(average, modifier, msg_start, len(result))
