## Import modules

In [1]:
import pandas as pd # for data handling
from sklearn.model_selection import cross_val_score # for cross-validation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # evaluation metrics
import matplotlib.pyplot as plt # for plotting

# scikit-learn classifiers evaluated (change as desired)
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Get data

### Extract *CSV* files from *zip* file


In [2]:
! unzip '/content/drive/MyDrive/Colab Notebooks/courses/sklearn_classifiers/homework/hw2.q2.data.zip'

unzip:  cannot find or open /content/drive/MyDrive/Colab Notebooks/courses/sklearn_classifiers/homework/hw2.q2.data.zip, /content/drive/MyDrive/Colab Notebooks/courses/sklearn_classifiers/homework/hw2.q2.data.zip.zip or /content/drive/MyDrive/Colab Notebooks/courses/sklearn_classifiers/homework/hw2.q2.data.zip.ZIP.


### Read data into *pandas* dataframes

In [3]:
# Read data from CSV files into pandas dataframes
train = pd.read_csv('./final.q3.train.csv') # training data
test = pd.read_csv('./final.q3.test.csv') # test data
new = pd.read_csv('./final.q3.new.csv') # unlabeled data
# Show number of rows and columns in each dataframe
print('train contains %d rows and %d columns' %train.shape)
print('test contains %d rows and %d columns' %test.shape)
print('new contains %d rows and %d columns' %new.shape)
print('First 3 rows in train:')
train.head(3) # display first 3 training samples

train contains 8000 rows and 11 columns
test contains 2000 rows and 11 columns
new contains 30 rows and 11 columns
First 3 rows in train:


Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,3.0,-1.135,-3.003,-0.041,2.683,-1.122,0.936,-0.388,1.61,-1.385,-1.909
1,2.0,-1.243,-1.825,-1.36,0.02,1.61,0.581,3.659,0.693,-0.838,0.009
2,1.0,2.885,2.161,3.516,-0.23,-0.729,1.82,0.235,0.532,-2.33,1.174


### Specify inputs and outputs







In [4]:
features = list(train)[1:] # all but the first column header are feature names
print("features:", features)
X_train, X_test, X_new = train[features], test[features], new[features]
y_train, y_test = train.y, test.y
print('Shapes:')
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, X_new: {X_new.shape}')
print(f'y_train: {y_train.shape}, y_test: {y_test.shape}')

features: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
Shapes:
X_train: (8000, 10), X_test: (2000, 10), X_new: (30, 10)
y_train: (8000,), y_test: (2000,)


## Evaluate models using *k*-fold cross-validation

### GaussianNB

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [5]:
%%time
model = GaussianNB() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.8717
CPU times: user 17.7 ms, sys: 2.19 ms, total: 19.9 ms
Wall time: 20.6 ms


### DecisionTreeClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [6]:
%%time
model = DecisionTreeClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.8189
CPU times: user 369 ms, sys: 2.64 ms, total: 372 ms
Wall time: 383 ms


### RandomForestClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [7]:
%%time
model = RandomForestClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9175
CPU times: user 6.19 s, sys: 16.4 ms, total: 6.21 s
Wall time: 6.41 s


### ExtraTreesClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

In [8]:
%%time
model = ExtraTreesClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9239
CPU times: user 1.86 s, sys: 15.8 ms, total: 1.87 s
Wall time: 1.93 s


### KNeighborsClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [9]:
%%time
model = KNeighborsClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9303
CPU times: user 330 ms, sys: 2.18 ms, total: 332 ms
Wall time: 343 ms


### LogisticRegression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [10]:
%%time
model = LogisticRegression() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.8924
CPU times: user 2.36 s, sys: 679 ms, total: 3.04 s
Wall time: 275 ms


### SVC

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [11]:
%%time
model = SVC() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9326
CPU times: user 5.4 s, sys: 530 ms, total: 5.93 s
Wall time: 4.07 s


### MLPClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [12]:
%%time
model = MLPClassifier(max_iter=1000) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=2).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9249
CPU times: user 1min 57s, sys: 35.9 s, total: 2min 32s
Wall time: 14.2 s


In [13]:
%%time
model = MLPClassifier(hidden_layer_sizes=(1000, 100), max_iter=1000, verbose=True)
for k,v in model.get_params().items():
    print(f'{k}: {v}')
model.fit(X_train, y_train)
predicted = model.predict(X_test) # predicted classes for test examples
acc = accuracy_score(y_test, predicted) # accuracy on test samples
print(f'Accuracy on test samples = {acc:0.4f}') # show test accuracy
print("Classification report on test samples:") # for precision, recall, F1-score
print(classification_report(y_test, predicted, digits=4)) # rounded to 4 decimal places

activation: relu
alpha: 0.0001
batch_size: auto
beta_1: 0.9
beta_2: 0.999
early_stopping: False
epsilon: 1e-08
hidden_layer_sizes: (1000, 100)
learning_rate: constant
learning_rate_init: 0.001
max_fun: 15000
max_iter: 1000
momentum: 0.9
n_iter_no_change: 10
nesterovs_momentum: True
power_t: 0.5
random_state: None
shuffle: True
solver: adam
tol: 0.0001
validation_fraction: 0.1
verbose: True
warm_start: False
Iteration 1, loss = 0.48453601
Iteration 2, loss = 0.34276719
Iteration 3, loss = 0.31927481
Iteration 4, loss = 0.30366783
Iteration 5, loss = 0.29626204
Iteration 6, loss = 0.28809773
Iteration 7, loss = 0.28574598
Iteration 8, loss = 0.27604432
Iteration 9, loss = 0.27221838
Iteration 10, loss = 0.26967006
Iteration 11, loss = 0.26549105
Iteration 12, loss = 0.26241710
Iteration 13, loss = 0.25695664
Iteration 14, loss = 0.25372934
Iteration 15, loss = 0.25082583
Iteration 16, loss = 0.24344232
Iteration 17, loss = 0.24522990
Iteration 18, loss = 0.24123008
Iteration 19, loss = 0

## Select a good model
Since both *Support Vector Classifier* and *K Nearest Neighbor Classifier* produced high cmean cross-validation accuracy with default hyper-parameter values, we shall search for good hyper-parameter values for these models using cross-validation and choose a model with a good set of hyper-parameter values.

### SVC
Search for a good value of the penalty *C*.

In [14]:
%%time
for penalty in [1, 10, 20]: # values to try
    model = SVC(C=penalty)
    score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
    print(f'Mean cross-validation accuracy = {score:0.4f} for SVC with C = {penalty:0.1f}')

Mean cross-validation accuracy = 0.9326 for SVC with C = 1.0
Mean cross-validation accuracy = 0.9355 for SVC with C = 10.0
Mean cross-validation accuracy = 0.9330 for SVC with C = 20.0
CPU times: user 14.9 s, sys: 472 ms, total: 15.4 s
Wall time: 14 s


### KNeighborsClassifier
Search for a good value of the number of nearest neighbors

In [15]:
for k in range(1,10): # values to try
    model = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
    print(f'Mean cross-validation accuracy = {score:0.4f} for KNeighborsClassifier with {k} neighbors')

Mean cross-validation accuracy = 0.8804 for KNeighborsClassifier with 1 neighbors
Mean cross-validation accuracy = 0.8732 for KNeighborsClassifier with 2 neighbors
Mean cross-validation accuracy = 0.9243 for KNeighborsClassifier with 3 neighbors
Mean cross-validation accuracy = 0.9261 for KNeighborsClassifier with 4 neighbors
Mean cross-validation accuracy = 0.9303 for KNeighborsClassifier with 5 neighbors
Mean cross-validation accuracy = 0.9303 for KNeighborsClassifier with 6 neighbors
Mean cross-validation accuracy = 0.9297 for KNeighborsClassifier with 7 neighbors
Mean cross-validation accuracy = 0.9299 for KNeighborsClassifier with 8 neighbors
Mean cross-validation accuracy = 0.9307 for KNeighborsClassifier with 9 neighbors


### Decide on a model
We shall choose *SVC* with *C*=10.

In [16]:
chosen_model = SVC(C=10)
print('Selected model: ', chosen_model)
print('Parameters')
for param, val in chosen_model.get_params().items():
    print(f'\t{param}: {val}')

Selected model:  SVC(C=10)
Parameters
	C: 10
	break_ties: False
	cache_size: 200
	class_weight: None
	coef0: 0.0
	decision_function_shape: ovr
	degree: 3
	gamma: scale
	kernel: rbf
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False


## Train and test selected model

In [17]:
%%time
chosen_model = SVC(C=10)
chosen_model.fit(X_train, y_train) # train selected model on ALL training examples
predicted = chosen_model.predict(X_test) # predicted classes for test examples
acc = accuracy_score(y_test, predicted) # accuracy on test samples
print(f'Accuracy on test samples = {acc:0.4f}') # show test accuracy
print("Classification report on test samples:") # for precision, recall, F1-score
print(classification_report(y_test, predicted, digits=4)) # rounded to 4 decimal places

Accuracy on test samples = 0.9435
Classification report on test samples:
              precision    recall  f1-score   support

         0.0     0.9556    0.9498    0.9527       498
         1.0     0.9184    0.9546    0.9362       507
         2.0     0.9420    0.9364    0.9392       503
         3.0     0.9603    0.9329    0.9464       492

    accuracy                         0.9435      2000
   macro avg     0.9441    0.9434    0.9436      2000
weighted avg     0.9439    0.9435    0.9436      2000

CPU times: user 1.4 s, sys: 989 µs, total: 1.4 s
Wall time: 1.44 s


In [18]:
cm = pd.DataFrame(confusion_matrix(y_test, predicted)) # confusion matrix
cm.to_csv('cm.hw2.q2.csv')
cm

Unnamed: 0,0,1,2,3
0,473,10,9,6
1,6,484,10,7
2,9,17,471,6
3,7,16,10,459


## Predict class for unlabeled samples
We shall use our trained model to predict the output class for the unlabeled samples.

In [19]:
predicted_new = chosen_model.predict(X_new) # predicted classes for unlabeled samples
hw2q2_prediction = pd.DataFrame() # dataframe with predicted classes
hw2q2_prediction['ID'] = new.ID # identifiers for unlabeled samples
hw2q2_prediction['y'] = predicted_new # # predicted classes for unlabeled samples
hw2q2_prediction.to_csv('hw2.q2.prediction.csv', index=False) # save as CSV file
hw2q2_prediction # display results

Unnamed: 0,ID,y
0,ID_001,0.0
1,ID_002,0.0
2,ID_003,0.0
3,ID_004,2.0
4,ID_005,0.0
5,ID_006,0.0
6,ID_007,0.0
7,ID_008,0.0
8,ID_009,3.0
9,ID_010,0.0
