# Machine Learning Exercise 2 - Regression and AutoML

In [14]:
import time
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

In [15]:
# Legacy imports from exercise 1 (classification); still used by the model builders below
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


In [None]:
def build_generic(fit_fun, params, X_train, X_test, y_train, y_test):
    """Fit one model per parameter set and collect timings and quality metrics.

    Parameters
    ----------
    fit_fun : str or callable
        Either a callable ``fit_fun(params, X_train, y_train)`` that returns a
        fitted model, or the name of such a function defined at module level.
    params : list of dict
        Parameter sets; one model is fitted and evaluated per entry.
    X_train, X_test : pandas.DataFrame
        Feature matrices of the training and holdout split.
    y_train, y_test : pandas.Series
        Targets of the training and holdout split.

    Returns
    -------
    tuple
        ``(timings, scores, params, best_model, ho_accs)`` where ``timings``
        holds the fit duration in seconds per parameter set, ``scores`` the
        ``cross_validate`` result per parameter set, ``params`` the input list,
        ``best_model`` the fitted model with the highest holdout accuracy
        (the string ``'none'`` when ``params`` is empty) and ``ho_accs`` the
        holdout accuracy per parameter set.

    Raises
    ------
    NameError
        If ``fit_fun`` is a name that is not defined at module level.
    """
    if callable(fit_fun):
        fit = fit_fun
    else:
        # Plain namespace lookup instead of eval()-ing a generated code string.
        try:
            fit = globals()[fit_fun]
        except KeyError:
            raise NameError(f"name '{fit_fun}' is not defined") from None

    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro', zero_division=0),
        'recall': make_scorer(recall_score, average='macro', zero_division=0),
        # zero_division=0 matches precision/recall and avoids warning spam.
        'f1_score': make_scorer(f1_score, average='macro', zero_division=0),
    }

    # Cross-validation runs on train + holdout combined. The combined data is
    # loop-invariant, and pd.concat replaces DataFrame.append / Series.append,
    # which no longer exist in pandas >= 2.0.
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])

    timings = []
    scores = []
    ho_accs = []
    best_model = 'none'
    best_ho_acc = None

    for p in params:
        start = time.perf_counter()
        model = fit(p, X_train, y_train)
        timings.append(time.perf_counter() - start)

        # cross_validate works on clones of the estimator, so the model fitted
        # above stays untouched for the holdout evaluation below.
        scores.append(cross_validate(model, X, y, cv=5, scoring=scoring))

        # holdout accuracy
        ho_acc = accuracy_score(y_test, model.predict(X_test))
        ho_accs.append(ho_acc)

        # Keep the model that actually performs best on the holdout set
        # (first one wins on ties) rather than whichever was fitted last.
        if best_ho_acc is None or ho_acc > best_ho_acc:
            best_model = model
            best_ho_acc = ho_acc

    return timings, scores, params, best_model, ho_accs

In [None]:
# KNN model builder
def fit_knn(params, X_train, y_train):
    """Create a ``KNeighborsClassifier`` from ``params`` and fit it.

    ``params`` is a dict of keyword arguments for the classifier constructor;
    the fitted classifier is returned.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return KNeighborsClassifier(**params).fit(X_train, y_train)
    
def build_knn(X_train, X_test, y_train, y_test):
    """Evaluate KNN classifiers with 1, 5 and 10 neighbours via ``build_generic``."""
    neighbour_grid = [{'n_neighbors': k} for k in (1, 5, 10)]
    return build_generic('fit_knn', neighbour_grid, X_train, X_test, y_train, y_test)

In [None]:
# Tree model builder
def fit_tree(params, X_train, y_train):
    """Create a ``DecisionTreeClassifier`` from ``params`` and fit it.

    ``params`` is a dict of keyword arguments for the classifier constructor;
    the fitted classifier is returned.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return DecisionTreeClassifier(**params).fit(X_train, y_train)
    
def build_tree(X_train, X_test, y_train, y_test):
    """Evaluate three decision-tree configurations via ``build_generic``.

    One shallow tree (depth 5) and two deep trees (depth 20) that differ only
    in the split strategy; all use a minimum leaf size of 4.
    """
    tree_grid = [
        {'max_depth': 5, 'min_samples_leaf': 4},
        {'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'best'},
        {'max_depth': 20, 'min_samples_leaf': 4, 'splitter': 'random'},
    ]
    return build_generic('fit_tree', tree_grid, X_train, X_test, y_train, y_test)

In [None]:
# MLP model builder
def fit_mlp(params, X_train, y_train):
    """Create an ``MLPClassifier`` from ``params`` and fit it.

    ``params`` is a dict of keyword arguments for the classifier constructor;
    the fitted classifier is returned.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return MLPClassifier(**params).fit(X_train, y_train)
    
def build_mlp(X_train, X_test, y_train, y_test):
    """Evaluate MLP classifiers with three different solvers via ``build_generic``.

    Every configuration enables early stopping; they differ in the solver
    (adam, sgd with adaptive learning rate, lbfgs with raised limits).
    """
    solver_variants = (
        {'solver': 'adam'},
        {'solver': 'sgd', 'learning_rate': 'adaptive'},
        {'solver': 'lbfgs', 'max_fun': 15000, 'max_iter': 300},
    )
    mlp_grid = [dict(early_stopping=True, **variant) for variant in solver_variants]
    return build_generic('fit_mlp', mlp_grid, X_train, X_test, y_train, y_test)

In [None]:
def build_models(X_train, X_test, y_train, y_test):
    """Run the KNN, tree and MLP builders and tabulate their results.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/holdout split, passed unchanged to ``build_knn``, ``build_tree``
        and ``build_mlp``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter-set position. Columns are a two-level index of
        model family (``KNN``, ``TREE``, ``MLP``) by measure (``params``,
        ``time``, ``holdout accuracy`` and the mean cross-validation
        ``accuracy``, ``precision``, ``recall`` and ``f1_score``).

    Raises
    ------
    ValueError
        If the builders do not all return the same number of parameter sets,
        since the rows could then not be aligned.
    """
    measures = ['params', 'time', 'holdout accuracy', 'accuracy', 'precision', 'recall', 'f1_score']
    cv_keys = ('test_accuracy', 'test_precision', 'test_recall', 'test_f1_score')

    runs = []
    for label, builder in (('KNN', build_knn), ('TREE', build_tree), ('MLP', build_mlp)):
        timings, scores, params, _best_model, ho_accs = builder(X_train, X_test, y_train, y_test)
        runs.append((label, params, timings, ho_accs, scores))

    # Derive the row count from the results instead of hard-coding 3.
    row_counts = {len(params) for _label, params, _timings, _ho_accs, _scores in runs}
    if len(row_counts) != 1:
        raise ValueError(
            'all model builders must evaluate the same number of parameter sets, got '
            + ', '.join(f'{label}={len(params)}' for label, params, *_rest in runs)
        )
    n_rows = row_counts.pop()

    data = []
    for i in range(n_rows):
        row = []
        for _label, params, timings, ho_accs, scores in runs:
            row.extend([params[i], timings[i], ho_accs[i]])
            # Average each cross-validation metric over the folds.
            row.extend(scores[i][key].mean() for key in cv_keys)
        data.append(row)

    idx = pd.MultiIndex.from_product([[label for label, *_rest in runs], measures])
    results = pd.DataFrame(data, columns=idx, index=list(range(n_rows)))
    return results

In [None]:
# Display settings: never truncate cell contents and show up to 999 columns,
# so the parameter dicts and wide result tables are fully visible.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = 999

# Wall-clock reference point for the total-runtime report at the end.
notebook_time = time.time()

## Breast Cancer Data

In [None]:
# Breast cancer diagnostic dataset.
# Source: https://www.kaggle.com/c/184702-tu-ml-ws-21-breast-cancer/data#
breastcancer = pd.read_csv(
    './breastcancer/breast-cancer-diagnostic.shuf.lrn.csv'
)
breastcancer

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,symmetryMean,fractalDimensionMean,radiusStdErr,textureStdErr,perimeterStdErr,areaStdErr,smoothnessStdErr,compactnessStdErr,concavityStdErr,concavePointsStdErr,symmetryStdErr,fractalDimensionStdErr,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,913102,False,14.640,16.85,94.21,666.0,0.08641,0.06698,0.05192,0.02791,0.1409,0.05355,0.2204,1.0060,1.471,19.98,0.003535,0.01393,0.01800,0.006144,0.01254,0.001219,16.460,25.44,106.00,831.0,0.11420,0.20700,0.2437,0.07828,0.2455,0.06596
1,89511501,False,12.200,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,0.1638,0.06129,0.2575,0.8073,1.959,19.01,0.005403,0.01418,0.01051,0.005142,0.01333,0.002065,13.750,21.38,91.11,583.1,0.12560,0.19280,0.1167,0.05556,0.2661,0.07961
2,87163,True,13.430,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,0.4697,1.1470,3.142,43.40,0.006003,0.01063,0.02151,0.009443,0.01520,0.001868,17.980,29.87,116.60,993.6,0.14010,0.15460,0.2644,0.11600,0.2884,0.07371
3,894047,False,8.597,18.60,54.09,221.2,0.10740,0.05847,0.00000,0.00000,0.2163,0.07359,0.3368,2.7770,2.222,17.81,0.020750,0.01403,0.00000,0.000000,0.06146,0.006820,8.952,22.44,56.65,240.1,0.13470,0.07767,0.0000,0.00000,0.3142,0.08116
4,86409,False,14.260,19.65,97.83,629.9,0.07837,0.22330,0.30030,0.07798,0.1704,0.07769,0.3628,1.4900,3.399,29.25,0.005298,0.07446,0.14350,0.022920,0.02566,0.012980,15.300,23.73,107.00,709.0,0.08949,0.41930,0.6783,0.15050,0.2398,0.10820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,873592,True,27.220,21.87,182.10,2250.0,0.10940,0.19140,0.28710,0.18780,0.1800,0.05770,0.8361,1.4810,5.820,128.70,0.004631,0.02537,0.03109,0.012410,0.01575,0.002747,33.120,32.85,220.80,3216.0,0.14720,0.40340,0.5340,0.26880,0.2856,0.08082
281,857793,True,14.710,21.59,95.55,656.9,0.11370,0.13650,0.12930,0.08123,0.2027,0.06758,0.4226,1.1500,2.735,40.09,0.003659,0.02855,0.02572,0.012720,0.01817,0.004108,17.870,30.70,115.70,985.5,0.13680,0.42900,0.3587,0.18340,0.3698,0.10940
282,857373,False,13.640,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,0.1872,0.9234,1.449,14.55,0.004477,0.01177,0.01079,0.007956,0.01325,0.002551,14.670,23.19,96.08,656.7,0.10890,0.15820,0.1050,0.08586,0.2346,0.08025
283,866203,True,19.000,18.91,123.40,1138.0,0.08217,0.08028,0.09271,0.05627,0.1946,0.05044,0.6896,1.3420,5.216,81.23,0.004428,0.02731,0.04040,0.013610,0.02030,0.002686,22.320,25.73,148.20,1538.0,0.10210,0.22640,0.3207,0.12180,0.2841,0.06541


In [None]:
# Seoul bike sharing demand dataset.
# Source: https://archive-beta.ics.uci.edu/ml/datasets/seoul+bike+sharing+demand
seoulbike = pd.read_csv(
    'seoulbike/SeoulBikeData.csv'
)
seoulbike

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(degC),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(degC),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,30/11/2018,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,30/11/2018,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,30/11/2018,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,30/11/2018,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes


In [None]:
# Concrete compressive strength dataset.
# Source: https://www.kaggle.com/prathamtripathi/regression-with-neural-networking
concrete = pd.read_csv(
    'concrete/concrete_data.csv'
)
concrete

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [None]:
# Report the wall-clock seconds elapsed since `notebook_time` was recorded.
print(f'notebook took this long in seconds: {time.time()-notebook_time}')

notebook took this long in seconds: 0.15799927711486816
