## Homework 2 - Question 2

In [1]:
import pandas as pd # for data handling
from sklearn.model_selection import cross_val_score # for cross-validation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # evaluation metrics
from sklearn import set_config # enable configure
set_config(print_changed_only=False) # configure print to show all not just changed values
import matplotlib.pyplot as plt # for plotting

# scikit-learn classifiers evaluated (change as desired)
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the three homework CSV files into pandas dataframes.
train = pd.read_csv('hw2.q2.train.csv')  # labeled samples used for fitting and cross-validation
test = pd.read_csv('hw2.q2.test.csv')  # labeled samples used for the final evaluation
new = pd.read_csv('hw2.q2.new.csv')  # unlabeled samples to be classified
# Report the (rows, columns) shape of each dataframe.
train_shape, test_shape, new_shape = train.shape, test.shape, new.shape
print('train contains %d rows and %d columns' % train_shape)
print('test contains %d rows and %d columns' % test_shape)
print('new contains %d rows and %d columns' % new_shape)
print('First 3 rows in train:')
train.head(3)  # last expression of the cell, so it is rendered as the cell output

train contains 8000 rows and 11 columns
test contains 2000 rows and 11 columns
new contains 30 rows and 11 columns
First 3 rows in train:


Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,2.0,-0.613,-1.465,-1.105,0.436,-1.658,-1.357,-2.375,-0.997,-0.653,1.186
1,1.0,-1.154,0.473,3.159,-4.77,0.402,-0.16,-1.925,-0.105,-2.304,0.032
2,2.0,0.147,-0.814,-0.792,-1.403,2.124,-2.263,-2.133,-2.461,-0.781,0.932


### Specify inputs and outputs
- **features**: List of the 10 input feature names
- **X_train**: $8000 \times 10$ array containing input values for training samples.
- **y_train**: Array containing labels for the 8000 training samples.
- **X_test**: $2000 \times 10$ array containing input values for test samples.
- **y_test**: Array containing labels for the 2000 test samples.
- **X_new**: $30 \times 10$ array containing input values for unlabeled samples.






In [3]:
# Every column header after the first one is an input feature name.
features = train.columns[1:].tolist()
print("features:", features)
# Input matrices: the same feature columns taken from each dataframe.
X_train = train[features]
X_test = test[features]
X_new = new[features]
# Output labels are stored in the 'y' column of the labeled dataframes.
y_train = train['y']
y_test = test['y']
print('Shapes:')
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, X_new: {X_new.shape}')
print(f'y_train: {y_train.shape}, y_test: {y_test.shape}')

features: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
Shapes:
X_train: (8000, 10), X_test: (2000, 10), X_new: (30, 10)
y_train: (8000,), y_test: (2000,)


## Evaluate models using *k*-fold cross-validation
We shall use **4**-fold cross-validation so that 6000 of the 8000 training samples are used for training and the remaining 2000 samples are used for validation in each fold. The mean cross-validation accuracy for each model with chosen hyper-parameters on the 4 runs will be computed using the command:
- **score = cross_val_score(model, X_train, y_train, cv=4).mean()**
> - *model*: classifier object with specified hyperparameters
> - *X_train*, *y_train*: Inputs and output labels for training
> - *cv*: number of folds in cross-validation
> - *mean*(): computes mean accuracy from the *cv* runs 

You can look up the documentation for each classifier, change hyper-parameter values, and observe the results. We shall also observe the time it takes to train and evaluate each model 4 times in this *4*-fold cross-validation process.


### GaussianNB

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [4]:
%%time
model = GaussianNB() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.8490
Wall time: 19 ms


In [5]:
# Show the estimator repr; set_config(print_changed_only=False) makes it list every hyperparameter.
model

GaussianNB(priors=None, var_smoothing=1e-09)

### DecisionTreeClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [6]:
%%time
model = DecisionTreeClassifier(max_leaf_nodes=10) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.7064
Wall time: 90 ms


### RandomForestClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [7]:
%%time
model = RandomForestClassifier(n_estimators=100) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9406
Wall time: 3.86 s


### ExtraTreesClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

In [8]:
%%time
model = ExtraTreesClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9520
Wall time: 1.53 s


### KNeighborsClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [9]:
%%time
model = KNeighborsClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9640
Wall time: 513 ms


In [10]:
%%time
# KNeighborsClassifier Loop
for x in range(2,15):
    
    model = KNeighborsClassifier(n_neighbors=11, algorithm='brute') # change hyperparameters as desired
    score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
    print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Mean cross-validation accuracy = 0.9626
Wall time: 10.3 s


### LogisticRegression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [11]:
%%time
model = LogisticRegression(max_iter=1000) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.8812
Wall time: 222 ms


### SVC

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [12]:
%%time
model = SVC() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9669
Wall time: 3.7 s


In [13]:
# Show the estimator repr; set_config(print_changed_only=False) makes it list every hyperparameter.
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### MLPClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [14]:
%%time
model = MLPClassifier(max_iter=1000) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=4).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9546
Wall time: 50 s


In [18]:
%%time
model = MLPClassifier(max_iter=100,
                      hidden_layer_sizes=(200,25), 
                      verbose=True) # change hyperparameters as desired
model.fit(X_train, y_train)

Iteration 1, loss = 0.77886345
Iteration 2, loss = 0.31621823
Iteration 3, loss = 0.24933180
Iteration 4, loss = 0.22173870
Iteration 5, loss = 0.20545505
Iteration 6, loss = 0.19349323
Iteration 7, loss = 0.18429350
Iteration 8, loss = 0.17682996
Iteration 9, loss = 0.17156879
Iteration 10, loss = 0.16552729
Wall time: 911 ms




MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200, 25), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=10,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [79]:
%%time
for c in [2]:
    model = SVC(C=c, gamma='auto')
    score = cross_val_score(model, X_train, y_train, cv=4).mean()
    print(f'Mean cross-validation accuracy in {c} = {score:0.4f}')

Mean cross-validation accuracy in 2 = 0.9715
Wall time: 9.29 s


In [71]:
# Show the estimator repr; set_config(print_changed_only=False) makes it list every hyperparameter.
model

SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [81]:
# Final evaluation of the selected model on the held-out test set.
chosen_model = SVC(C=2, gamma='auto') # hyperparameters selected by cross-validation
chosen_model.fit(X_train, y_train) # train selected model on ALL training examples
predicted = chosen_model.predict(X_test) # predicted classes for test examples
acc = accuracy_score(y_test, predicted) # accuracy on test samples
print(f'Accuracy on test samples = {acc:0.4f}') # show test accuracy
print("Classification report on test samples:") # for precision, recall, F1-score
print(classification_report(y_test, predicted, digits=4)) # rounded to 4 decimal places
# Same report as a dataframe so that it can be saved to CSV.
cr = pd.DataFrame(classification_report(y_test, predicted, digits=4, output_dict=True))
cr.to_csv('Homework_2_Q2_Task_2.csv')
# A bare `cr` in the middle of a cell is not rendered (only the last expression
# is), so the report dataframe is displayed explicitly.
display(cr)
# Confusion matrix: rows are true classes, columns are predicted classes.
cm = pd.DataFrame(confusion_matrix(y_test, predicted))
cm.to_csv('cm.hw2.q2.csv')
cm

Accuracy on test samples = 0.9745
Classification report on test samples:
              precision    recall  f1-score   support

         0.0     0.9589    0.9800    0.9693       500
         1.0     0.9698    0.9659    0.9678       498
         2.0     0.9837    0.9797    0.9817       493
         3.0     0.9861    0.9725    0.9792       509

    accuracy                         0.9745      2000
   macro avg     0.9746    0.9745    0.9745      2000
weighted avg     0.9746    0.9745    0.9745      2000



Unnamed: 0,0,1,2,3
0,490,5,1,4
1,10,481,5,2
2,6,3,483,1
3,5,7,2,495


In [62]:
# Inspect the test labels (the 'y' column of the test dataframe).
y_test

0       3.0
1       0.0
2       2.0
3       0.0
4       0.0
       ... 
1995    3.0
1996    2.0
1997    1.0
1998    3.0
1999    3.0
Name: y, Length: 2000, dtype: float64

In [65]:
# Classify the unlabeled samples with the fitted model.
predicted_new = chosen_model.predict(X_new)
# Pair each sample identifier with its predicted class in a single dataframe.
new_prediction = pd.DataFrame({
    'ID': new['ID'],      # identifiers for unlabeled samples
    'y': predicted_new,   # predicted classes for unlabeled samples
})
new_prediction  # display results

Unnamed: 0,ID,y
0,ID_001,0.0
1,ID_002,0.0
2,ID_003,0.0
3,ID_004,0.0
4,ID_005,0.0
5,ID_006,0.0
6,ID_007,0.0
7,ID_008,0.0
8,ID_009,0.0
9,ID_010,0.0
