# Machine Learning Exercise 1 - Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import GridSearchCV

import time

In [2]:
def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    """Fit and evaluate one model per parameter set.

    For every entry of ``params`` a model is fitted on the training split
    (the fit is timed), scored on the holdout split, and additionally 5-fold
    cross-validated on the concatenation of the training and holdout data.

    Parameters
    ----------
    fit_fun : str or callable
        Fitter invoked as ``fit_fun(p, X_train, y_train)``; it must return a
        fitted estimator. A string is resolved as the name of a module-level
        function (the calling convention used by the ``build_*`` helpers).
    params : iterable of dict
        Parameter sets, each handed to ``fit_fun`` unchanged.
    X_train, X_test : pandas.DataFrame
        Feature tables of the training / holdout split.
    y_train, y_test : pandas.Series
        Targets of the training / holdout split.

    Returns
    -------
    tuple
        ``(timings, scores, params, best_model, ho_accs)`` where ``timings``
        holds the fit durations in seconds, ``scores`` the ``cross_validate``
        result dicts, ``params`` the input unchanged, ``ho_accs`` the holdout
        accuracies (all in ``params`` order) and ``best_model`` the fitted
        model with the highest holdout accuracy (first one wins ties), or the
        string ``'none'`` when ``params`` is empty.

    Raises
    ------
    NameError
        If ``fit_fun`` is a string that does not name a module-level callable.
    """
    # Resolve the fitter by lookup instead of eval(): eval on an interpolated
    # string would execute arbitrary code embedded in `fit_fun`.
    if callable(fit_fun):
        fit = fit_fun
    else:
        fit = globals().get(fit_fun)
        if not callable(fit):
            raise NameError(f"name '{fit_fun}' is not defined")

    scoring = {'accuracy' : make_scorer(accuracy_score),
               'precision' : make_scorer(precision_score, average = 'macro', zero_division = 0),
               'recall' : make_scorer(recall_score, average = 'macro', zero_division = 0),
               # zero_division = 0 keeps the value f1_score would report anyway
               # but silences the per-fold warning, matching precision/recall.
               'f1_score' : make_scorer(f1_score, average = 'macro', zero_division = 0),
              }

    # Cross-validation runs on all available rows. DataFrame.append /
    # Series.append were removed in pandas 2.0, so use pd.concat; the result
    # does not depend on the parameter set, so it is built once up front.
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])

    timings = []
    scores = []
    ho_accs = []
    best_model = 'none'  # sentinel returned when `params` is empty
    best_ho_acc = float('-inf')

    for p in params:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()
        model = fit(p, X_train, y_train)
        timings.append(time.perf_counter() - start)

        # `model` is passed as the estimator template; the holdout score below
        # is computed from the fit on X_train done above.
        scores.append(cross_validate(model, X, y, cv = 5, scoring = scoring))

        # holdout accuracy
        ho_acc = accuracy_score(y_test, model.predict(X_test))
        ho_accs.append(ho_acc)

        # Keep the model that actually performed best instead of whichever
        # parameter set happened to come last.
        if ho_acc > best_ho_acc:
            best_ho_acc = ho_acc
            best_model = model

    return timings, scores, params, best_model, ho_accs

In [3]:
# KNN model builder
def fit_knn(params, X_train, y_train):
    """Create a k-nearest-neighbours classifier from ``params`` and fit it.

    ``params`` is a dict of keyword arguments for ``KNeighborsClassifier``;
    the fitted estimator is returned.
    """
    # fit() hands back the estimator itself, so build and train in one chain.
    return KNeighborsClassifier(**params).fit(X_train, y_train)
    
def build_knn(X_train, X_test, y_train, y_test):
    """Evaluate KNN classifiers for 1, 5 and 10 neighbours.

    Returns whatever ``build_generic`` returns for these parameter sets.
    """
    neighbour_counts = (1, 5, 10)
    params = [{'n_neighbors': k} for k in neighbour_counts]

    return build_generic('fit_knn', params, X_train, X_test, y_train, y_test)

In [4]:
# Tree model builder
def fit_tree(params, X_train, y_train):
    """Create a decision tree classifier from ``params`` and fit it.

    ``params`` is a dict of keyword arguments for ``DecisionTreeClassifier``;
    the fitted estimator is returned.
    """
    # fit() hands back the estimator itself, so build and train in one chain.
    return DecisionTreeClassifier(**params).fit(X_train, y_train)
    
def build_tree(X_train, X_test, y_train, y_test):
    """Evaluate three decision tree configurations.

    One shallow tree (depth 5) and two deep trees (depth 20) that differ only
    in the split strategy. Returns whatever ``build_generic`` returns for
    these parameter sets.
    """
    # Settings shared by both deep variants; key order is kept so the params
    # shown in the result table read the same as before.
    deep_tree = {'max_depth': 20, 'min_samples_leaf': 4}
    params = [
        {'max_depth': 5, 'min_samples_leaf': 4},
        {**deep_tree, 'splitter': 'best'},
        {**deep_tree, 'splitter': 'random'},
    ]

    return build_generic('fit_tree', params, X_train, X_test, y_train, y_test)

In [5]:
# MLP model builder
def fit_mlp(params, X_train, y_train):
    """Create a multi-layer perceptron classifier from ``params`` and fit it.

    ``params`` is a dict of keyword arguments for ``MLPClassifier``; the
    fitted estimator is returned.
    """
    # fit() hands back the estimator itself, so build and train in one chain.
    return MLPClassifier(**params).fit(X_train, y_train)
    
def build_mlp(X_train, X_test, y_train, y_test):
    """Evaluate MLP classifiers trained with the adam, sgd and lbfgs solvers.

    Every configuration sets ``early_stopping`` to True. Returns whatever
    ``build_generic`` returns for these parameter sets.
    """
    solver_options = (
        {'solver': 'adam'},
        {'solver': 'sgd', 'learning_rate': 'adaptive'},
        {'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300},
    )
    # 'early_stopping' goes first so each dict keeps its original key order.
    params = [{'early_stopping': True, **options} for options in solver_options]

    return build_generic('fit_mlp', params, X_train, X_test, y_train, y_test)

In [6]:
def _summarise_runs(timings, scores, params, ho_accs):
    """Turn one model family's results into a table with one row per parameter set."""
    cv_means = {
        metric: [run[f'test_{metric}'].mean() for run in scores]
        for metric in ('accuracy', 'precision', 'recall', 'f1_score')
    }
    return pd.DataFrame({
        'params': params,
        'time': timings,
        'holdout accuracy': ho_accs,
        **cv_means,
    })


def build_models(X_train, X_test, y_train, y_test):
    """Run the KNN, tree and MLP experiments and collect them in one table.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Training / holdout features and targets, forwarded unchanged to
        ``build_knn``, ``build_tree`` and ``build_mlp``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter-set position. Columns are a two-level index of
        model family ('KNN', 'TREE', 'MLP') and, per family, 'params', 'time',
        'holdout accuracy' and the mean cross-validated 'accuracy',
        'precision', 'recall' and 'f1_score'.
    """
    builders = {'KNN': build_knn, 'TREE': build_tree, 'MLP': build_mlp}

    tables = []
    for build in builders.values():
        # The fitted model returned by the builder is not part of the summary.
        timings, scores, params, _best_model, ho_accs = build(X_train, X_test, y_train, y_test)
        tables.append(_summarise_runs(timings, scores, params, ho_accs))

    # Side-by-side concatenation aligns on the row position, so the number of
    # rows follows the builders instead of a hard-coded 3; a family with fewer
    # parameter sets than the others is padded with NaN rather than raising.
    results = pd.concat(tables, axis=1, keys=list(builders))
    return results

In [7]:
# Render result tables without truncating cell contents (the params dicts are
# long) and without hiding columns.
for option_name, option_value in (('display.max_colwidth', None),
                                  ('display.max_columns', 999)):
    pd.set_option(option_name, option_value)

# Wall-clock start; the last cell subtracts it to report the total runtime.
notebook_time = time.time()

## Mushroom Edibility

In [8]:
# Read the table and integer-encode it column by column
# (fit_transform is re-fitted on every column that apply() hands over).
mushrooms = pd.read_csv('./mushrooms/mushrooms.csv').apply(LabelEncoder().fit_transform)

# 'edibility' is the target column.
mushrooms_y = mushrooms['edibility']

# these features are enough to classify the whole dataset, see agaricus-lepiota.names
mushrooms_X = mushrooms.drop(columns='edibility')[[
    'odor',
    'spore-print-color',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'habitat',
    'cap-color',
]]

In [9]:
# A fixed seed makes the train/holdout split identical on every run, so the
# holdout figures are comparable between kernel restarts.
mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test = train_test_split(
    mushrooms_X, mushrooms_y, random_state=42
)

mushrooms_results = build_models(mushrooms_X_train, mushrooms_X_test, mushrooms_y_train, mushrooms_y_test)
mushrooms_results

Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.020045,1.0,1.0,1.0,1.0,1.0,"{'max_depth': 5, 'min_samples_leaf': 4}",0.004741,0.997148,0.996911,0.996982,0.996817,0.996897,"{'early_stopping': True, 'solver': 'adam'}",7.574387,1.0,0.985029,0.985613,0.984474,0.984939
1,{'n_neighbors': 5},0.01255,1.0,1.0,1.0,1.0,1.0,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.004382,1.0,0.999406,0.999368,0.999443,0.999404,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",13.978887,0.84173,0.84411,0.843759,0.843918,0.843462
2,{'n_neighbors': 10},0.011623,0.998574,0.998218,0.998254,0.99817,0.99821,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.003732,1.0,1.0,1.0,1.0,1.0,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",4.677562,1.0,1.0,1.0,1.0,1.0


## Soybeans

In [10]:
soybeans = pd.read_csv('./soybeans/soybean_cleaned.csv')

# handle missing values by dropping, see pdf for more info
# (reassigned rather than inplace=True, so the step is an ordinary transformation)
soybeans = soybeans.dropna()

# encode labels
soybeans = soybeans.apply(LabelEncoder().fit_transform)

soybeans_X = soybeans.drop(columns='class')
soybeans_y = soybeans['class']

# A fixed seed makes the train/holdout split identical on every run.
soybeans_X_train, soybeans_X_test, soybeans_y_train, soybeans_y_test = train_test_split(
    soybeans_X, soybeans_y, random_state=42
)

In [11]:
# Fit and score every KNN / tree / MLP configuration on the soybean split.
soybeans_results = build_models(
    X_train=soybeans_X_train,
    X_test=soybeans_X_test,
    y_train=soybeans_y_train,
    y_test=soybeans_y_test,
)
soybeans_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.011839,0.758865,0.814776,0.870428,0.80306,0.817262,"{'max_depth': 5, 'min_samples_leaf': 4}",0.003137,0.64539,0.734655,0.651887,0.667943,0.63094,"{'early_stopping': True, 'solver': 'adam'}",0.373879,0.787234,0.677845,0.641534,0.596803,0.591462
1,{'n_neighbors': 5},0.002803,0.751773,0.772124,0.82596,0.734766,0.744275,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.003204,0.822695,0.848641,0.886835,0.856238,0.852437,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",1.106609,0.404255,0.40019,0.142914,0.188187,0.146014
2,{'n_neighbors': 10},0.002672,0.695035,0.741814,0.761978,0.674172,0.68502,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.003291,0.801418,0.853872,0.904478,0.873197,0.875416,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",1.156057,0.843972,0.882506,0.928479,0.910351,0.913101


## Breast Cancer Data

In [12]:
# The 'ID' column is dropped from both files so it is not used as a feature.
breastcancer_train = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv').drop(columns='ID')
breastcancer_sol_input = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.tes.csv').drop(columns='ID')

breastcancer_X = breastcancer_train.drop(columns='class')
breastcancer_y = breastcancer_train['class']

# A fixed seed makes the train/holdout split identical on every run.
breastcancer_X_train, breastcancer_X_test, breastcancer_y_train, breastcancer_y_test = train_test_split(
    breastcancer_X, breastcancer_y, random_state=42
)

In [13]:
# Fit and score every KNN / tree / MLP configuration on the breast cancer split.
breastcancer_results = build_models(
    X_train=breastcancer_X_train,
    X_test=breastcancer_X_test,
    y_train=breastcancer_y_train,
    y_test=breastcancer_y_test,
)
breastcancer_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.005161,0.875,0.919298,0.921253,0.90266,0.909185,"{'max_depth': 5, 'min_samples_leaf': 4}",0.00362,0.875,0.926316,0.917507,0.921728,0.918994,"{'early_stopping': True, 'solver': 'adam'}",0.059603,0.652778,0.712281,0.512441,0.607568,0.543458
1,{'n_neighbors': 5},0.00208,0.861111,0.926316,0.932993,0.908257,0.916622,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.003726,0.875,0.926316,0.918895,0.919431,0.918831,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",0.256346,0.805556,0.761404,0.566553,0.654595,0.598575
2,{'n_neighbors': 10},0.002134,0.888889,0.926316,0.936471,0.9034,0.915295,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.002185,0.902778,0.919298,0.91701,0.905505,0.908977,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",0.453078,0.875,0.764912,0.769962,0.751984,0.756232


## Purchase Data

In [14]:
# labels do not need to be encoded, inputs are numeric
# The 'ID' column is dropped from both files so it is not used as a feature.
purchase_train = pd.read_csv('./purchase/purchase600-100cls-15k.lrn.csv').drop(columns='ID')
purchase_sol_input = pd.read_csv('./purchase/purchase600-100cls-15k.tes.csv').drop(columns='ID')

purchase_X = purchase_train.drop(columns='class')
purchase_y = purchase_train['class']

# Half of the rows are held out; a fixed seed makes the split identical on every run.
purchase_X_train, purchase_X_test, purchase_y_train, purchase_y_test = train_test_split(
    purchase_X, purchase_y, test_size = 0.5, random_state=42
)

In [15]:
# Fit and score every KNN / tree / MLP configuration on the purchase split.
purchase_results = build_models(
    X_train=purchase_X_train,
    X_test=purchase_X_test,
    y_train=purchase_y_train,
    y_test=purchase_y_test,
)
purchase_results



Unnamed: 0_level_0,KNN,KNN,KNN,KNN,KNN,KNN,KNN,TREE,TREE,TREE,TREE,TREE,TREE,TREE,MLP,MLP,MLP,MLP,MLP,MLP,MLP
Unnamed: 0_level_1,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score,params,time,holdout accuracy,accuracy,precision,recall,f1_score
0,{'n_neighbors': 1},0.339004,0.197,0.2043,0.211772,0.201109,0.185392,"{'max_depth': 5, 'min_samples_leaf': 4}",0.188534,0.082,0.0913,0.038687,0.069879,0.040982,"{'early_stopping': True, 'solver': 'adam'}",7.31982,0.6182,0.6933,0.700457,0.670042,0.67542
1,{'n_neighbors': 5},0.347923,0.236,0.2538,0.265003,0.235108,0.21214,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'}",0.727344,0.0968,0.1011,0.09203,0.090956,0.087484,"{'early_stopping': True, 'solver': 'sgd', 'learning_rate': 'adaptive'}",33.792735,0.5602,0.6864,0.675322,0.638891,0.6455
2,{'n_neighbors': 10},0.6083,0.2596,0.2821,0.305693,0.2612,0.239023,"{'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'}",0.782897,0.0944,0.1001,0.09364,0.090402,0.087218,"{'early_stopping': True, 'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300}",15.700455,0.6394,0.7325,0.72595,0.711709,0.711849


In [16]:
# Total wall-clock time since `notebook_time` was recorded.
print('notebook took this long in seconds:', time.time() - notebook_time)

notebook took this long in seconds: 1246.6559772491455
