## Homework 2 - Question 2

In [21]:
import pandas as pd # for data handling
from sklearn.model_selection import cross_val_score # for cross-validation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # evaluation metrics
from sklearn import set_config # enable configure
set_config(print_changed_only=False) # configure print to show all not just changed values
import matplotlib.pyplot as plt # for plotting

# scikit-learn classifiers evaluated (change as desired)
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [22]:
# Load the three CSV files into pandas dataframes
train = pd.read_csv('MMIS671 Final Exam/final.q3.train.csv')  # labelled samples used to fit models
test = pd.read_csv('MMIS671 Final Exam/final.q3.test.csv')  # labelled samples used for the final evaluation
new = pd.read_csv('MMIS671 Final Exam/final.q3.new.csv')  # unlabeled samples to classify
# Report the dimensions of every dataframe
for label, frame in (('train', train), ('test', test), ('new', new)):
    n_rows, n_cols = frame.shape
    print(f'{label} contains {n_rows} rows and {n_cols} columns')
print('First 3 rows in train:')
train.head(3)  # last expression of the cell, so the notebook renders these rows

train contains 10000 rows and 11 columns
test contains 2000 rows and 11 columns
new contains 20 rows and 11 columns
First 3 rows in train:


Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,2.0,0.502,-1.444,-1.72,-1.363,1.391,0.587,-0.977,0.703,-1.672,-0.159
1,1.0,-4.329,-5.717,-7.596,-0.472,0.783,0.192,-0.145,5.362,-1.155,2.312
2,0.0,-1.983,1.5,-3.604,-3.164,-0.86,-0.483,1.937,-1.639,0.467,-1.198


### Specify inputs and outputs
- **features**: List of the 10 input feature names
- **X_train**: $10000 \times 10$ array containing input values for training samples.
- **y_train**: Array containing labels for the 10000 training samples.
- **X_test**: $2000 \times 10$ array containing input values for test samples.
- **y_test**: Array containing labels for the 2000 test samples.
- **X_new**: $20 \times 10$ array containing input values for unlabeled samples.






In [23]:
# Column 0 holds the label/identifier; every remaining column header is an input feature
features = train.columns[1:].tolist()
print("features:", features)

# Input matrices restricted to the feature columns
X_train = train[features]
X_test = test[features]
X_new = new[features]

# Target labels (only the training and test sets are labelled)
y_train = train['y']
y_test = test['y']

print('Shapes:')
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, X_new: {X_new.shape}')
print(f'y_train: {y_train.shape}, y_test: {y_test.shape}')

features: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
Shapes:
X_train: (10000, 10), X_test: (2000, 10), X_new: (20, 10)
y_train: (10000,), y_test: (2000,)


## Evaluate models using *k*-fold cross-validation
We shall use **5**-fold cross-validation so that 8000 of the 10000 training samples are used for training and the remaining 2000 samples are used for validation in each fold. The mean cross-validation accuracy for each model with chosen hyper-parameters on the 5 runs will be computed using the command:
- **score = cross_val_score(model, X_train, y_train, cv=5).mean()**
> - *model*: classifier object with specified hyperparameters
> - *X_train*, *y_train*: Inputs and output labels for training
> - *cv*: number of folds in cross-validation
> - *mean*(): computes mean accuracy from the *cv* runs 

You can look up the documentation for each classifier, change hyper-parameter values, and observe the results. We shall also observe the time it takes to train and evaluate each model 5 times in this *5*-fold cross-validation process.


### GaussianNB

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [24]:
%%time
model = GaussianNB() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.7916
Wall time: 24 ms


In [25]:
# Display the classifier object together with its hyperparameter values
model

GaussianNB(priors=None, var_smoothing=1e-09)

### DecisionTreeClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [26]:
%%time
model = DecisionTreeClassifier(max_leaf_nodes=10) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.7245
Wall time: 139 ms


### RandomForestClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [27]:
%%time
model = RandomForestClassifier(n_estimators=100) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9256
Wall time: 6.41 s


### ExtraTreesClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

In [28]:
%%time
model = ExtraTreesClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9384
Wall time: 2.39 s


### KNeighborsClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [29]:
%%time
model = KNeighborsClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9493
Wall time: 762 ms


In [47]:
%%time
# KNeighborsClassifier Loop
for x in range(2,15):
    
    model = KNeighborsClassifier(n_neighbors=x, algorithm='brute') # change hyperparameters as desired
    score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
    print(f'Mean cross-validation accuracy with {x} neighbors = {score:0.4f}')

Mean cross-validation accuracy with 2 neighbors = 0.9222
Mean cross-validation accuracy with 3 neighbors = 0.9468
Mean cross-validation accuracy with 4 neighbors = 0.9466
Mean cross-validation accuracy with 5 neighbors = 0.9493
Mean cross-validation accuracy with 6 neighbors = 0.9499
Mean cross-validation accuracy with 7 neighbors = 0.9508
Mean cross-validation accuracy with 8 neighbors = 0.9502
Mean cross-validation accuracy with 9 neighbors = 0.9501
Mean cross-validation accuracy with 10 neighbors = 0.9493
Mean cross-validation accuracy with 11 neighbors = 0.9485
Mean cross-validation accuracy with 12 neighbors = 0.9470
Mean cross-validation accuracy with 13 neighbors = 0.9472
Mean cross-validation accuracy with 14 neighbors = 0.9463
Wall time: 16.2 s


### LogisticRegression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [31]:
%%time
model = LogisticRegression(max_iter=1000) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.7766
Wall time: 210 ms


### SVC

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [43]:
%%time
model = SVC() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=5).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9472
Wall time: 7.42 s


In [33]:
# Display the classifier object together with its hyperparameter values
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### MLPClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [34]:
%%time
model = MLPClassifier(max_iter=1000) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9486
Wall time: 38.3 s


In [52]:
%%time
for c in [2]:
    model = SVC(C=c, gamma='auto')
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    print(f'Mean cross-validation accuracy in {c} = {score:0.4f}')

Mean cross-validation accuracy in 2 = 0.9573
Wall time: 16.3 s


In [50]:
# Display the classifier object together with its hyperparameter values
model

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [55]:
# Fit the selected classifier on ALL training examples, then score it on the test set
chosen_model = SVC(C=2, gamma='auto')
chosen_model.fit(X_train, y_train) # train selected model on ALL training examples
predicted = chosen_model.predict(X_test) # predicted class labels for test examples
acc = accuracy_score(y_test, predicted) # accuracy on test samples
print(f'Accuracy on test samples = {acc:0.4f}') # show test accuracy
print("Classification report on test samples:") # for precision, recall, F1-score
print(classification_report(y_test, predicted, digits=4)) # rounded to 4 decimal places

# Same report as a dataframe so it can be written to CSV
cr = pd.DataFrame(classification_report(y_test, predicted, digits=4, output_dict=True))
cr.to_csv('Final_Q3_Task_3.csv')
# Only the last expression of a cell is rendered automatically, so a bare `cr`
# here would be discarded; display() (provided by the IPython kernel) shows it.
display(cr)

# Confusion matrix: rows are true classes, columns are predicted classes
cm = pd.DataFrame(confusion_matrix(y_test, predicted))
cm.to_csv('cm.final.q3.csv')
cm

Accuracy on test samples = 0.9685
Classification report on test samples:
              precision    recall  f1-score   support

         0.0     0.9636    0.9740    0.9688       653
         1.0     0.9788    0.9744    0.9766       664
         2.0     0.9632    0.9575    0.9604       683

    accuracy                         0.9685      2000
   macro avg     0.9685    0.9686    0.9686      2000
weighted avg     0.9685    0.9685    0.9685      2000



Unnamed: 0,0,1,2
0,636,2,15
1,7,647,10
2,17,12,654


In [39]:
# Display the true labels of the test samples
y_test

0       0.0
1       1.0
2       1.0
3       1.0
4       2.0
       ... 
1995    0.0
1996    2.0
1997    2.0
1998    1.0
1999    2.0
Name: y, Length: 2000, dtype: float64

In [58]:
# Classify the unlabeled samples with the selected model
predicted_new = chosen_model.predict(X_new)
# Pair each sample identifier with its predicted class
new_prediction = pd.DataFrame({'ID': new['ID'], 'y': predicted_new})
new_prediction  # display results

Unnamed: 0,ID,y
0,ID_001,0.0
1,ID_002,0.0
2,ID_003,0.0
3,ID_004,1.0
4,ID_005,1.0
5,ID_006,1.0
6,ID_007,1.0
7,ID_008,1.0
8,ID_009,1.0
9,ID_010,1.0
