In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate, KFold
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt
import time
import numpy as np

### Dataset D

In assignment, label "Caff" has the highest overall accuracy among the 4 models (0.9743). I will use this label to construct dataset D.

In [2]:
columns = [
           "ID", "Age", "Gender", "Education", "Country", 
           "Ethnicity", "Nscore", "Escore", "Oscore", 
           "Ascore", "Cscore", "Impulsive", "SS", 
           "Alcohol", "Amphet", "Amyl", "Benzos", 
           "Caff", "Cannabis", "Choc", "Coke", 
           "Crack", "Ecstasy", "Heroin", "Ketamine", 
           "Legalh", "LSD", "Meth", "Mushrooms", 
           "Nicotine", "Semer", "VSA"
           ]

dataset = pd.read_csv("./drug_data/drug_consumption.data", header = None, names=columns)

In [3]:
dataset_d = dataset[["Age", "Gender", "Education", "Country", 
           "Ethnicity", "Nscore", "Escore", "Oscore", 
           "Ascore", "Cscore", "Impulsive", "SS", "Caff"]]

dataset_d.head()

Unnamed: 0,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS,Caff
0,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,CL6
1,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,CL6
2,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148,CL6
3,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,CL5
4,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,CL6


In [4]:
features = dataset_d.iloc[:,:12]

In [5]:
# Ordinal feature: Age, Education, Nscore, Escore, Oscore, Ascore, Cscore, Impulsive, SS
# Nominal feature: Gender, Country, Ethnicity

# gender is binary so we don't need to encode it here
nominal_f = ['Country', 'Ethnicity']
new_columns = ['USA', 'New Zealand', 'Country_Other', 
               'Australia', 'Ireland', 'Canada', 'UK',
               'Black', 'Asian', 'White', 'Mixed-White/Black',
               'Ethnicity_Other', 'Mixed-White/Asian', 'Mixed-Black/Asian'
              ]

# one hot encode the nominal features
one_hot_features = pd.concat([pd.get_dummies(features[col], prefix=col) for col in nominal_f], axis=1)
# rename the columns after one-hot
one_hot_features = one_hot_features.rename(dict(zip(one_hot_features.columns, new_columns)), axis="columns")
one_hot_features.head()

Unnamed: 0,USA,New Zealand,Country_Other,Australia,Ireland,Canada,UK,Black,Asian,White,Mixed-White/Black,Ethnicity_Other,Mixed-White/Asian,Mixed-Black/Asian
0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,0,0,1,0,0,0,0


In [6]:
scaler = MinMaxScaler()
ordinal_f = features.drop(nominal_f, axis=1)
norm_ordinal_f = scaler.fit(ordinal_f).transform(ordinal_f)
norm_ordinal_f = pd.DataFrame(norm_ordinal_f, columns=ordinal_f.columns)
norm_ordinal_f.head()

Unnamed: 0,Age,Gender,Education,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS
0,0.409137,1.0,0.537681,0.560562,0.412116,0.43569,0.367654,0.49904,0.428474,0.224398
1,0.246475,0.0,1.0,0.413474,0.796106,0.762567,0.609827,0.479394,0.33792,0.465658
2,0.409137,0.0,0.537681,0.444788,0.622976,0.392939,0.266061,0.35358,0.215401,0.619957
3,0.0,1.0,0.814329,0.492045,0.376883,0.527023,0.585213,0.584415,0.215401,0.224398
4,0.409137,1.0,1.0,0.623275,0.250544,0.456995,0.456454,0.688508,0.428474,0.465658


In [7]:
# concat norminal and ordinal features together
processed_features = pd.concat([norm_ordinal_f, one_hot_features], axis=1)
processed_features.head()

Unnamed: 0,Age,Gender,Education,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS,...,Ireland,Canada,UK,Black,Asian,White,Mixed-White/Black,Ethnicity_Other,Mixed-White/Asian,Mixed-Black/Asian
0,0.409137,1.0,0.537681,0.560562,0.412116,0.43569,0.367654,0.49904,0.428474,0.224398,...,0,0,1,0,0,0,0,0,1,0
1,0.246475,0.0,1.0,0.413474,0.796106,0.762567,0.609827,0.479394,0.33792,0.465658,...,0,0,1,0,0,1,0,0,0,0
2,0.409137,0.0,0.537681,0.444788,0.622976,0.392939,0.266061,0.35358,0.215401,0.619957,...,0,0,1,0,0,1,0,0,0,0
3,0.0,1.0,0.814329,0.492045,0.376883,0.527023,0.585213,0.584415,0.215401,0.224398,...,0,0,1,0,0,1,0,0,0,0
4,0.409137,1.0,1.0,0.623275,0.250544,0.456995,0.456454,0.688508,0.428474,0.465658,...,0,0,1,0,0,1,0,0,0,0


In [8]:
dataset_d.iloc[:,12:]

Unnamed: 0,Caff
0,CL6
1,CL6
2,CL6
3,CL5
4,CL6
...,...
1880,CL4
1881,CL5
1882,CL6
1883,CL6


In [9]:
# encode the labels
# 0: CL0 and CL1 (Non user)
# 1: other classes (User)

labels = dataset_d.iloc[:,12:]
labels.loc[(labels.Caff == 'CL0') | (labels.Caff == 'CL1'), "Caff"] = 0
labels.loc[labels.Caff != 0, "Caff"] = 1
labels = labels.astype('int')
labels.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,Caff
0,1
1,1
2,1
3,1
4,1


In [10]:
dataset_d = pd.concat([processed_features, labels], axis=1)
dataset_d.head()

Unnamed: 0,Age,Gender,Education,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS,...,Canada,UK,Black,Asian,White,Mixed-White/Black,Ethnicity_Other,Mixed-White/Asian,Mixed-Black/Asian,Caff
0,0.409137,1.0,0.537681,0.560562,0.412116,0.43569,0.367654,0.49904,0.428474,0.224398,...,0,1,0,0,0,0,0,1,0,1
1,0.246475,0.0,1.0,0.413474,0.796106,0.762567,0.609827,0.479394,0.33792,0.465658,...,0,1,0,0,1,0,0,0,0,1
2,0.409137,0.0,0.537681,0.444788,0.622976,0.392939,0.266061,0.35358,0.215401,0.619957,...,0,1,0,0,1,0,0,0,0,1
3,0.0,1.0,0.814329,0.492045,0.376883,0.527023,0.585213,0.584415,0.215401,0.224398,...,0,1,0,0,1,0,0,0,0,1
4,0.409137,1.0,1.0,0.623275,0.250544,0.456995,0.456454,0.688508,0.428474,0.465658,...,0,1,0,0,1,0,0,0,0,1


In [11]:
print("Number of class 0 in train:", len(dataset_d[dataset_d['Caff']==0]))
print("Number of class 1 in train:", len(dataset_d[dataset_d['Caff']==1]))

Number of class 0 in train: 37
Number of class 1 in train: 1848


### Retraining the dataset D with 10-fold CV for the 4 models

In [12]:
# a pipeline and parameters for decision tree
pipe_df = Pipeline([
    ("feature_selection", SelectKBest(chi2)),
    ("Decision_tree", DecisionTreeClassifier(random_state=42))])


param_distributions_df = {
    "feature_selection__k": range(10, 24), # best-k features to select
    "Decision_tree__max_depth": range(20, 100, 2),
    "Decision_tree__min_samples_split": range(2, 10),
}

# a pipeline and parameters for random forest
pipe_rf = Pipeline([
    ("feature_selection", SelectKBest(chi2)),
    ("Random_forest", RandomForestClassifier())])

param_distributions_rf = {
    "feature_selection__k": range(10, 24), # best-k features to select
    "Random_forest__n_estimators": [75, 100, 150],
    "Random_forest__max_depth": range(20, 100, 2),
    "Random_forest__min_samples_split": range(2, 10),
}

# a pipeline and parameters for SVM
pipe_svm = Pipeline([
    ("feature_selection", SelectKBest(chi2)),
    ("SVM", SVC(probability=True))])

param_distributions_svm = {
    "feature_selection__k": range(10, 24), # best-k features to select
    "SVM__C": [1, 0.1, 0.01],
    "SVM__kernel": ["poly", "rbf", "sigmoid"],
    "SVM__degree": range(3, 10) # for poly kernel only
}

# a pipeline and parameters for KNN
pipe_knn = Pipeline([
    ("feature_selection", SelectKBest(chi2)),
    ("KNN", KNeighborsClassifier())])

param_distributions_knn = {
    "feature_selection__k": range(10, 24), # best-k features to select
    "KNN__n_neighbors": range(3, 30),
}

model_names = ["Decision_tree", "Random_forest", "SVM", "KNN"]
model_pipes = [pipe_df, pipe_rf, pipe_svm, pipe_knn]
model_params = [param_distributions_df, param_distributions_rf, param_distributions_svm, param_distributions_knn]

In [13]:
def train(X, y, model_names, model_pipes, model_params, cv=10, acc_list = []):
    
    for i, name in enumerate(model_names):
        start = time.time()
        print(name + ":")
        
        # random search to find the best hyper-parameters
        clf = RandomizedSearchCV(model_pipes[i], model_params[i], n_iter=60, 
                                 random_state=42, cv=cv, scoring="accuracy")
        search = clf.fit(X, y)
        print('Best parameters found:\n', search.best_params_)
        
        # set the best parameters
        model = model_pipes[i].set_params(**search.best_params_)
        # 10-fold CV
        cv_results = cross_validate(model, X, y, cv=cv, scoring=('accuracy'))
        print("Test Accuracy in 10-fold:", cv_results['test_score'])
        print("Average Test Accuracy:", round(np.mean(cv_results['test_score']), 4))
        acc_list.append(round(np.mean(cv_results['test_score']), 4))
        print()
        
        print("Time:", time.time()-start)
    return acc_list

In [14]:
acc_dict = {"dataset_d":[], "DB1": [], "DB2": [], "labor-relations": [], "heart-disease": []} 

In [15]:
# train four models on dataset D with 10-fold CV

acc_dict["dataset_d"] = train(processed_features, labels.to_numpy().reshape(-1), 
      model_names, model_pipes, model_params, cv=10, acc_list=acc_dict["dataset_d"])

Decision_tree:
Best parameters found:
 {'feature_selection__k': 10, 'Decision_tree__min_samples_split': 5, 'Decision_tree__max_depth': 30}
Test Accuracy in 10-fold: [0.96825397 0.96296296 0.96825397 0.97354497 0.97354497 0.9787234
 0.98404255 0.97340426 0.9787234  0.97340426]
Average Test Accuracy: 0.9735

Time: 5.758501291275024
Random_forest:
Best parameters found:
 {'feature_selection__k': 14, 'Random_forest__n_estimators': 75, 'Random_forest__min_samples_split': 7, 'Random_forest__max_depth': 62}
Test Accuracy in 10-fold: [0.97883598 0.97883598 0.97883598 0.97883598 0.97883598 0.98404255
 0.98404255 0.98404255 0.9787234  0.9787234 ]
Average Test Accuracy: 0.9804

Time: 93.53547596931458
SVM:
Best parameters found:
 {'feature_selection__k': 12, 'SVM__kernel': 'poly', 'SVM__degree': 4, 'SVM__C': 1}
Test Accuracy in 10-fold: [0.97883598 0.97883598 0.97883598 0.97883598 0.97883598 0.98404255
 0.98404255 0.98404255 0.9787234  0.9787234 ]
Average Test Accuracy: 0.9804

Time: 47.643024921

### Question 1 and 2: Oversampling (SMOTE) to construct dataset DB1 and retrain

In [16]:
# training function for oversampling with 10-fold CV
def train2(X, y, model_names, model_pipes, model_params, cv=10, acc_list2 = []):
    
    for i, name in enumerate(model_names):
        start = time.time()
        print(name + ":")
        
        # 10-fold CV
        kf = KFold(n_splits=cv)
        acc_list = [] # save the acc for 10 fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # oversampling the train data
            X_train, y_train = SMOTE().fit_resample(X_train, y_train) 
            
            # random search to find the best hyper-parameters
            clf = RandomizedSearchCV(model_pipes[i], model_params[i], n_iter=60, 
                                     random_state=42, cv=cv, scoring="accuracy")
            
            search = clf.fit(X_train, y_train)
            # set the best parameters
            # because the random search will do k-fold for the training data
            # we train a new model with all train data again with the best parameters
            model = model_pipes[i].set_params(**search.best_params_)
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            acc_list.append(acc)
        print("Average 10-fold Test Accuracy:", round(np.mean(acc_list), 4))
        acc_list2.append(round(np.mean(acc_list), 4))
        print()
        
        print("Time:", time.time()-start)
    return acc_list2

In [17]:
acc_dict["DB1"] = train2(processed_features.to_numpy(), labels.to_numpy().reshape(-1), 
       model_names, model_pipes, model_params, cv=10, acc_list2=acc_dict["DB1"])

Decision_tree:
Average 10-fold Test Accuracy: 0.9549

Time: 34.55508518218994
Random_forest:
Average 10-fold Test Accuracy: 0.973

Time: 772.4431438446045
SVM:
Average 10-fold Test Accuracy: 0.86

Time: 4161.022247076035
KNN:
Average 10-fold Test Accuracy: 0.8886

Time: 67.27720904350281


### Question 3 and 4: Undersampling to construct dataset DB2 and retrain

In [75]:
# training function for undersampling with 10-fold CV
def train3(X, y, model_names, model_pipes, model_params, cv=10, acc_list2 = []):
    
    for i, name in enumerate(model_names):
        start = time.time()
        print(name + ":")
        
        # 10-fold CV
        kf = KFold(n_splits=cv)
        acc_list = [] # save the acc for 10 fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # undersampling the train data
            X_train, y_train = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)
            
            # random search to find the best hyper-parameters
            clf = RandomizedSearchCV(model_pipes[i], model_params[i], n_iter=60, 
                                     random_state=42, cv=cv, scoring="accuracy")
            search = clf.fit(X_train, y_train)

            # set the best parameters
            # because the random search will do k-fold for the training data
            # we train a new model with all train data again with the best parameters
            model = model_pipes[i].set_params(**search.best_params_)
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            acc_list.append(acc)
        print("Average 10-fold Test Accuracy:", round(np.mean(acc_list), 4))
        acc_list2.append(round(np.mean(acc_list), 4))
        print()
        
        print("Time:", time.time()-start)
    return acc_list2

In [56]:
# train four models 10-fold CV with undersampling
acc_dict["DB2"] = train3(processed_features.to_numpy(), labels.to_numpy().reshape(-1),
       model_names, model_pipes, model_params, cv=10, acc_list2 = acc_dict["DB2"])

Decision_tree:
Average 10-fold Test Accuracy: 0.5348

Time: 23.295156002044678
Random_forest:
Average 10-fold Test Accuracy: 0.6542

Time: 1112.0733408927917
SVM:
Average 10-fold Test Accuracy: 0.6694

Time: 32.32007384300232
KNN:
Average 10-fold Test Accuracy: 0.6417

Time: 26.659832000732422


### Question 5

In [20]:
pipe_mlp = Pipeline([
    ("feature_selection", SelectKBest(chi2)),
    ("MLP", MLPClassifier(max_iter=750, early_stopping=True, n_iter_no_change=20))])

param_distributions_mlp = {
    "feature_selection__k": range(10, 24), # best-k features to select
    "MLP__hidden_layer_sizes": [(16, 16), (16, 32), (32, 64), (75, 100, 75), (50, 100, 150), (100, 100)],
    "MLP__alpha": [0.00001, 0.0001, 0.001],
    "MLP__learning_rate": ['constant','adaptive'],
}

pipe_gb = Pipeline([
    ("feature_selection", SelectKBest(chi2)),
    ("GB", GradientBoostingClassifier())])

param_distributions_gb = {
    "feature_selection__k": range(10, 24), # best-k features to select
    "GB__n_estimators": [75, 100, 150],
    "GB__max_depth": range(1, 10, 2),
    "GB__min_samples_split": range(2, 10),
}

model_names2 = ["MLP", "Gradient Boosting"]
model_pipes2 = [pipe_mlp, pipe_gb]
model_params2 = [param_distributions_mlp, param_distributions_gb]

In [21]:
# train dataset D with MLP and GB
print("dataset D")
acc_dict["dataset_d"] = train(processed_features.to_numpy(), labels.to_numpy().reshape(-1), 
      model_names2, model_pipes2, model_params2, cv=10, acc_list=acc_dict["dataset_d"])

dataset D
MLP:
Best parameters found:
 {'feature_selection__k': 10, 'MLP__learning_rate': 'adaptive', 'MLP__hidden_layer_sizes': (16, 32), 'MLP__alpha': 0.0001}
Test Accuracy in 10-fold: [0.97883598 0.97883598 0.97883598 0.97883598 0.97883598 0.98404255
 0.98404255 0.98404255 0.9787234  0.9787234 ]
Average Test Accuracy: 0.9804

Time: 125.26375341415405
Gradient Boosting:
Best parameters found:
 {'feature_selection__k': 17, 'GB__n_estimators': 150, 'GB__min_samples_split': 6, 'GB__max_depth': 1}
Test Accuracy in 10-fold: [0.97883598 0.97883598 0.97883598 0.97883598 0.97883598 0.98404255
 0.98404255 0.98404255 0.9787234  0.9787234 ]
Average Test Accuracy: 0.9804

Time: 130.89758205413818


In [22]:
# train dataset DB1 (oversampling) with MLP and GB
print("DB1")
acc_dict["DB1"] = train2(processed_features.to_numpy(), labels.to_numpy().reshape(-1), 
      model_names2, model_pipes2, model_params2, cv=10, acc_list2=acc_dict["DB1"])

DB1
MLP:
Average 10-fold Test Accuracy: 0.9428

Time: 5025.268232822418
Gradient Boosting:
Average 10-fold Test Accuracy: 0.9735

Time: 1324.8253610134125


In [59]:
# train dataset DB2 (undersampling) with MLP and GB
print("DB2")
acc_dict["DB2"] = train3(processed_features.to_numpy(), labels.to_numpy().reshape(-1), 
      model_names2, model_pipes2, model_params2, cv=10, acc_list2=acc_dict["DB2"])

DB2
MLP:
Average 10-fold Test Accuracy: 0.6512

Time: 354.50790667533875
Gradient Boosting:
Average 10-fold Test Accuracy: 0.6022

Time: 526.4371571540833


### Question 6 (labor negotiations dataset)

In [24]:
columns = [
           "dur", "wage1.wage", "wage2.wage", "wage3.wage", "cola", 
           "hours.hrs", "pension", "Escore", "stby_pay", 
           "shift_diff", "educ_allw.boolean", "holidays", "vacation", 
           "lngtrm_disabil.boolean", "dntl_ins", "bereavement.boolean", "empl_hplan"
           ]

ln_data1 = pd.read_csv("./labor negotiations/labor-neg.data", header=None, names=columns)
ln_data2 = pd.read_csv("./labor negotiations/labor-neg.test", header=None, names=columns)
ln_data = pd.concat([ln_data1, ln_data2]).reset_index()

In [25]:
ln_data.replace("?", np.nan, inplace=True) # replace "?' to Nan

In [26]:
numeric_features = ln_data[['wage1.wage', 'wage2.wage', 'wage3.wage']].astype(float) # features with continuous values
categorical_features = ln_data[['dur', 'cola', 'hours.hrs', 'pension', 
                                'Escore', 'stby_pay', 'shift_diff', 'educ_allw.boolean',
                                'holidays', 'vacation', 'lngtrm_disabil.boolean', 
                                'dntl_ins', 'bereavement.boolean']].astype("category") #discontinous values

In [27]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
fill_numeric_features = imp.fit_transform(numeric_features)

In [28]:
imp = SimpleImputer(strategy="most_frequent")
fill_categorical_features = imp.fit_transform(categorical_features)

In [29]:
fill_numeric_features = pd.DataFrame(fill_numeric_features, columns=['wage1.wage', 'wage2.wage', 'wage3.wage'])
fill_categorical_features = pd.DataFrame(fill_categorical_features, 
                                         columns=['dur', 'cola', 'hours.hrs', 'pension', 
                                        'Escore', 'stby_pay', 'shift_diff', 'educ_allw.boolean',
                                        'holidays', 'vacation', 'lngtrm_disabil.boolean', 
                                        'dntl_ins', 'bereavement.boolean'])

In [30]:
processed_ln_data = pd.concat([fill_numeric_features, fill_categorical_features, ln_data['empl_hplan']], axis=1)
processed_ln_data.head()

Unnamed: 0,wage1.wage,wage2.wage,wage3.wage,dur,cola,hours.hrs,pension,Escore,stby_pay,shift_diff,educ_allw.boolean,holidays,vacation,lngtrm_disabil.boolean,dntl_ins,bereavement.boolean,empl_hplan
0,5.0,3.971739,3.913333,1,none,40,empl_contr,2,2,no,11,average,yes,half,yes,full,good
1,4.5,5.8,3.913333,2,none,35,ret_allw,2,3,yes,11,below average,yes,full,yes,full,good
2,3.803571,3.971739,3.913333,2,none,38,empl_contr,2,5,no,11,generous,yes,half,yes,half,good
3,3.7,4.0,5.0,3,tc,40,empl_contr,2,3,yes,11,below average,yes,half,yes,full,good
4,4.5,4.5,5.0,3,none,40,empl_contr,2,3,no,12,average,yes,half,yes,half,good


In [31]:
nominal_features = fill_categorical_features[['cola', 'pension', 'shift_diff', 'holidays', 'vacation', 'lngtrm_disabil.boolean', 'dntl_ins', 'bereavement.boolean']]
ordinal_features = fill_categorical_features[['dur', 'hours.hrs', 'Escore', 'stby_pay', 'educ_allw.boolean']]

In [32]:
# turn the nominal" features to one hot vectors
one_hot_nominal_features = pd.get_dummies(nominal_features)
display(one_hot_nominal_features.head())
scaler = MinMaxScaler()
ordinal_numeric = pd.concat([fill_numeric_features, ordinal_features], axis=1)
norm_f = scaler.fit_transform(ordinal_numeric)
norm_f = pd.DataFrame(norm_f, columns=ordinal_numeric.columns)
norm_f.head()

Unnamed: 0,cola_none,cola_tc,cola_tcf,pension_empl_contr,pension_none,pension_ret_allw,shift_diff_no,shift_diff_yes,holidays_average,holidays_below average,...,vacation_no,vacation_yes,lngtrm_disabil.boolean_full,lngtrm_disabil.boolean_half,lngtrm_disabil.boolean_none,dntl_ins_no,dntl_ins_yes,bereavement.boolean_full,bereavement.boolean_half,bereavement.boolean_none
0,1,0,0,1,0,0,1,0,1,0,...,0,1,0,1,0,0,1,1,0,0
1,1,0,0,0,0,1,0,1,0,1,...,0,1,1,0,0,0,1,1,0,0
2,1,0,0,1,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,1,0
3,0,1,0,1,0,0,0,1,0,1,...,0,1,0,1,0,0,1,1,0,0
4,1,0,0,1,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0


Unnamed: 0,wage1.wage,wage2.wage,wage3.wage,dur,hours.hrs,Escore,stby_pay,educ_allw.boolean
0,0.6,0.394348,0.617204,0.0,1.0,0.0,0.08,0.333333
1,0.5,0.76,0.617204,0.5,0.615385,0.0,0.12,0.333333
2,0.360714,0.394348,0.617204,0.5,0.846154,0.0,0.2,0.333333
3,0.34,0.4,0.967742,1.0,1.0,0.0,0.12,0.333333
4,0.5,0.5,0.967742,1.0,1.0,0.0,0.12,0.5


In [33]:
processed_ln_data = pd.concat([one_hot_nominal_features, norm_f, ln_data['empl_hplan']], axis=1)
# turn the label to good = 1, bad = 0
processed_ln_data.loc[processed_ln_data['empl_hplan']=='good', "empl_hplan"] = 1
processed_ln_data.loc[processed_ln_data['empl_hplan']=='bad', "empl_hplan"] = 0

processed_ln_data.head()

Unnamed: 0,cola_none,cola_tc,cola_tcf,pension_empl_contr,pension_none,pension_ret_allw,shift_diff_no,shift_diff_yes,holidays_average,holidays_below average,...,bereavement.boolean_none,wage1.wage,wage2.wage,wage3.wage,dur,hours.hrs,Escore,stby_pay,educ_allw.boolean,empl_hplan
0,1,0,0,1,0,0,1,0,1,0,...,0,0.6,0.394348,0.617204,0.0,1.0,0.0,0.08,0.333333,1
1,1,0,0,0,0,1,0,1,0,1,...,0,0.5,0.76,0.617204,0.5,0.615385,0.0,0.12,0.333333,1
2,1,0,0,1,0,0,1,0,0,0,...,0,0.360714,0.394348,0.617204,0.5,0.846154,0.0,0.2,0.333333,1
3,0,1,0,1,0,0,0,1,0,1,...,0,0.34,0.4,0.967742,1.0,1.0,0.0,0.12,0.333333,1
4,1,0,0,1,0,0,1,0,1,0,...,0,0.5,0.5,0.967742,1.0,1.0,0.0,0.12,0.5,1


In [34]:
X, y = processed_ln_data.iloc[:,:-1], processed_ln_data.iloc[:,-1].astype('int')

In [35]:
print("Number of class 0 in train:", len(y[y==0]))
print("Number of class 1 in train:", len(y[y==1]))

Number of class 0 in train: 20
Number of class 1 in train: 37


In [36]:
model_names3 = model_names + model_names2
model_pipes3 = model_pipes + model_pipes2
model_params3 = model_params + model_params2

In [37]:
acc_dict["labor-relations"] = train(X.to_numpy(), y.to_numpy(), model_names3, 
                                    model_pipes3, model_params3, cv=10, acc_list=acc_dict["labor-relations"])

Decision_tree:
Best parameters found:
 {'feature_selection__k': 13, 'Decision_tree__min_samples_split': 5, 'Decision_tree__max_depth': 22}
Test Accuracy in 10-fold: [1.         0.83333333 1.         0.66666667 1.         1.
 0.83333333 1.         0.8        1.        ]
Average Test Accuracy: 0.9133

Time: 1.47560715675354
Random_forest:
Best parameters found:
 {'feature_selection__k': 21, 'Random_forest__n_estimators': 75, 'Random_forest__min_samples_split': 2, 'Random_forest__max_depth': 50}
Test Accuracy in 10-fold: [1.         0.83333333 1.         1.         1.         1.
 0.83333333 0.8        0.8        1.        ]
Average Test Accuracy: 0.9267

Time: 66.29797601699829
SVM:
Best parameters found:
 {'feature_selection__k': 20, 'SVM__kernel': 'rbf', 'SVM__degree': 4, 'SVM__C': 1}
Test Accuracy in 10-fold: [1.         1.         1.         0.83333333 1.         1.
 0.83333333 0.8        0.8        1.        ]
Average Test Accuracy: 0.9267

Time: 1.7428090572357178
KNN:
Best paramete

### Question 7 (heart disease dataset)

In [38]:
df_hd = pd.read_csv('./Heart Disease/heart_cleveland_upload.csv')

In [39]:
df_hd.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [40]:
numeric_f = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak'] # ordinal or numeric features
categorical_f = ['cp', 'restecg', 'slope', 'ca', 'thal'] # features need to be one-hot
binary_f = ['sex' , 'fbs', 'exang'] # do nothing for binary features

# normalize the numeric and ordinal features first
scaler = MinMaxScaler()
norm_f = scaler.fit_transform(df_hd[numeric_f])
norm_f = pd.DataFrame(norm_f, columns=numeric_f)

# turn nominal features to one-hot 
one_hot_f = pd.get_dummies(df_hd[categorical_f].astype("category"))

X = pd.concat([norm_f, one_hot_f, df_hd[binary_f]], axis=1)
y = df_hd['condition'].astype('int')

In [41]:
print("Number of class 0 in train:", len(y[y==0]))
print("Number of class 1 in train:", len(y[y==1]))

Number of class 0 in train: 160
Number of class 1 in train: 137


In [42]:
acc_dict["heart-disease"] = train(X.to_numpy(), y.to_numpy(), model_names3, 
                                  model_pipes3, model_params3, cv=10, acc_list=acc_dict["heart-disease"] )

Decision_tree:
Best parameters found:
 {'feature_selection__k': 10, 'Decision_tree__min_samples_split': 5, 'Decision_tree__max_depth': 30}
Test Accuracy in 10-fold: [0.56666667 0.73333333 0.86666667 0.9        0.83333333 0.9
 0.96666667 0.82758621 0.75862069 0.68965517]
Average Test Accuracy: 0.8043

Time: 1.6614909172058105
Random_forest:
Best parameters found:
 {'feature_selection__k': 10, 'Random_forest__n_estimators': 150, 'Random_forest__min_samples_split': 6, 'Random_forest__max_depth': 86}
Test Accuracy in 10-fold: [0.63333333 0.7        0.86666667 0.93333333 0.86666667 0.86666667
 0.96666667 0.82758621 0.79310345 0.79310345]
Average Test Accuracy: 0.8247

Time: 72.4586009979248
SVM:
Best parameters found:
 {'feature_selection__k': 19, 'SVM__kernel': 'sigmoid', 'SVM__degree': 9, 'SVM__C': 0.1}
Test Accuracy in 10-fold: [0.53333333 0.86666667 0.9        0.9        0.76666667 0.9
 1.         0.86206897 0.79310345 0.82758621]
Average Test Accuracy: 0.8349

Time: 8.304618120193481
K

### Question 7 Table (accuracies of the six (6) algorithms against the five (5) datasets)

In [60]:
acc_matrix = pd.DataFrame.from_dict(acc_dict, orient='index', columns=["DT", "RF", "SVM", "KNN", "MLP", "GB"])
print("Accuracy table (algorithms vs datasets):")
acc_matrix

Accuracy table (algorithms vs datasets):


Unnamed: 0,DT,RF,SVM,KNN,MLP,GB
dataset_d,0.9735,0.9804,0.9804,0.9804,0.9804,0.9804
DB1,0.9549,0.973,0.86,0.8886,0.9428,0.9735
DB2,0.5348,0.6542,0.6694,0.6417,0.6512,0.6022
labor-relations,0.9133,0.9267,0.9267,0.9067,0.7367,0.9467
heart-disease,0.8043,0.8247,0.8349,0.8484,0.8182,0.8451


In [61]:
rank_matrix = acc_matrix.rank(1, ascending=False, method='min')
rank_matrix

Unnamed: 0,DT,RF,SVM,KNN,MLP,GB
dataset_d,6.0,1.0,1.0,1.0,1.0,1.0
DB1,3.0,2.0,6.0,5.0,4.0,1.0
DB2,6.0,2.0,1.0,4.0,3.0,5.0
labor-relations,4.0,2.0,2.0,5.0,6.0,1.0
heart-disease,6.0,4.0,3.0,1.0,5.0,2.0


In [62]:
# average rank
avg_rank = rank_matrix.mean(axis=0).to_frame(name="avg_rank").T
k, n = 6, 5
R_hat = (k+1)/2
avg_rank

Unnamed: 0,DT,RF,SVM,KNN,MLP,GB
avg_rank,5.0,2.2,2.6,3.2,3.8,2.0


In [63]:
row_sum_dff = ((rank_matrix - R_hat)**2).sum(axis=1).to_frame(name="row_sum_dff")
rank_matrix = pd.concat([rank_matrix, row_sum_dff], axis=1)
rank_matrix

Unnamed: 0,DT,RF,SVM,KNN,MLP,GB,row_sum_dff
dataset_d,6.0,1.0,1.0,1.0,1.0,1.0,37.5
DB1,3.0,2.0,6.0,5.0,4.0,1.0,17.5
DB2,6.0,2.0,1.0,4.0,3.0,5.0,17.5
labor-relations,4.0,2.0,2.0,5.0,6.0,1.0,19.5
heart-disease,6.0,4.0,3.0,1.0,5.0,2.0,17.5


##### $\frac{1}{n(k-1)}\sum_{ij}(R_{ij}-\bar R)^2$

In [64]:
row_sum_dff = row_sum_dff.sum().values[0]/(n*(k-1))
row_sum_dff

4.38

##### $n\sum_j(R_j-\bar R)^2$

In [65]:
# equation 2
column_sum_dff = n*((avg_rank - 3.5)**2).sum(axis=1).values[0]
column_sum_dff

35.89999999999999

In [77]:
fredman_stat = column_sum_dff/row_sum_dff
print("Friedman statistic:", fredman_stat)

Friedman statistic: 8.196347031963468


The above table and results is not the highest accuracy one (only 30 iterations in random search)

### Steps for Friedman test

$k = 6$ and $n = 5$

$\bar R = \frac{6+1}{2}=3.5$

The sum of squared differences $n\sum_j(R_j-\bar R)^2 = 5\times(1.5^2 + 1.3^2 + 0.9^2 + 0.3^2 + 0.3^2 + 1.5^2)=35.9$ 

The sum of squared differences $\frac{1}{n(k-1)}\sum_{ij}(R_{ij}-\bar R)^2 = \frac{1}{25}\times 109.5=4.38$

Friedman statistic = $\frac{35.9}{4.38}=8.20$

The critical value is 10.49 if $\alpha=0.05$

Since abs(Friedman statistic) < 10.49, there is not significant difference between the 6 algorithms.