# Q3 Using Scikit-Learn

In [1]:
import numpy as np
import pandas as pd
import time
import gc

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
# Import statements run before running other code cells

  from numpy.core.umath_tests import inner1d


# Classifier Setup

In [2]:
# XXX
# TODO: Read in all the data. Replace the 'xxx' with the path to the data set.
# XXX

# data = pd.read_csv('pulsar_stars.csv')

# Separate out the x_data and y_data.
# x_data = data.loc[:, data.columns != "y"]
# y_data = data.loc[:, "y"]

# -------------------------------
# ADD CODE HERE
# Load the whole data set from the CSV file in the working directory.
data = pd.read_csv('pulsar_stars.csv')

# The "y" column is the target; every other column is a feature.
x_data = data.drop(columns="y")
y_data = data["y"]
# -------------------------------

In [3]:
# The random state to use while splitting the data.
# random_state = 100

# XXX
# TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.
# Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 100.
# XXX

# -------------------------------
# ADD CODE HERE
# Seed reused by the split below and by the random forest later in the notebook.
random_state = 100

# Shuffle, then hold out 30% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    shuffle=True,
    random_state=random_state,
)

# -------------------------------

# Linear Regression 

In [4]:
# XXX
# TODO: Create a LinearRegression classifier and train it.
# XXX

# -------------------------------
# ADD CODE HERE
# Least-squares regressor fitted on the training split; its continuous
# predictions are thresholded at 0.5 in the accuracy cells that follow.
LR = LinearRegression()
LR.fit(X=x_train, y=y_train)
# -------------------------------

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [5]:
# XXX
# TODO: Test its accuracy (on the training set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requirements.
# XXX

# -------------------------------
# ADD CODE HERE
# Continuous regression outputs for the training rows.
LR_y_pred_train = LR.predict(x_train)
# Threshold at 0.5: values below become 0, everything else becomes 1.
LR_y_pred_train_round = np.where(LR_y_pred_train < 0.5, 0, 1).tolist()
# accuracy_score returns a fraction; scale it to a percentage for printing.
LR_train_accuracy = 100 * accuracy_score(y_train, LR_y_pred_train_round)
print("Linear regression accuracy on the training set: %.2f%%" % LR_train_accuracy)
# -------------------------------

Linear regression accuracy on the training set: 97.07%


In [6]:
# XXX
# TODO: Test its accuracy (on the testing set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requirements.
# XXX

# -------------------------------
# ADD CODE HERE
# Continuous regression outputs for the held-out rows.
LR_y_pred_test = LR.predict(x_test)
# Threshold at 0.5: values below become 0, everything else becomes 1.
LR_y_pred_test_round = np.where(LR_y_pred_test < 0.5, 0, 1).tolist()
# accuracy_score returns a fraction; scale it to a percentage for printing.
LR_test_accuracy = 100 * accuracy_score(y_test, LR_y_pred_test_round)
print("Linear regression accuracy on the test set: %.2f%%" % LR_test_accuracy)
# -------------------------------

Linear regression accuracy on the test set: 97.21%


# Random Forest Classifier

In [7]:
# XXX
# TODO: Create a RandomForestClassifier and train it.
# WARNING: Ignore "FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22."
# XXX

# -------------------------------
# ADD CODE HERE
# Random forest with library-default hyper-parameters, seeded for repeatability.
RF = RandomForestClassifier(random_state=random_state)
RF.fit(X=x_train, y=y_train)
# -------------------------------

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=100, verbose=0, warm_start=False)

In [8]:
# XXX
# TODO: Test its accuracy on the training set using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Class predictions of the fitted forest on the training rows.
RF_y_pred_train = RF.predict(x_train)
# Same 0.5 threshold as the regression cells, kept so the *_round variable exists.
RF_y_pred_train_round = np.where(RF_y_pred_train < 0.5, 0, 1).tolist()
# accuracy_score returns a fraction; scale it to a percentage for printing.
RF_train_accuracy = 100 * accuracy_score(y_train, RF_y_pred_train_round)
print("Random forest accuracy on the training set: %.2f%%" % RF_train_accuracy)
# -------------------------------

Random forest accuracy on the training set: 99.61%


In [9]:
# XXX
# TODO: Test its accuracy on the test set using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Class predictions of the fitted forest on the held-out rows.
RF_y_pred_test = RF.predict(x_test)
# Same 0.5 threshold as the regression cells, kept so the *_round variable exists.
RF_y_pred_test_round = np.where(RF_y_pred_test < 0.5, 0, 1).tolist()
# normalize=True asks for the fraction of correct predictions (not the raw count).
RF_test_accuracy = 100 * accuracy_score(y_test, RF_y_pred_test_round, normalize=True)
print("Random forest accuracy on the test set: %.2f%%" % RF_test_accuracy)
# -------------------------------

Random forest accuracy on the test set: 98.10%


## Feature Importance

In [10]:
# XXX
# TODO: Determine and print the feature importance as evaluated by the Random Forest Classifier.
# XXX

# -------------------------------
# ADD CODE HERE
# Per-feature importance scores of the fitted forest, one entry per column of
# x_train in column order (the recorded values sum to 1).
feature_importance = RF.feature_importances_
print(feature_importance)
# -------------------------------

[0.1257031  0.03131994 0.45697513 0.16258565 0.05165992 0.08903044
 0.05408017 0.02864564]


In [11]:
# XXX
# TODO: Sort them in the descending order and print the feature numbers[0 to ...].
#       Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.
# XXX

# -------------------------------
# ADD CODE HERE
# argsort orders ascending, so reverse it to put the most important feature first.
indexes = np.argsort(feature_importance)[::-1]
# One line per feature: its column position and its importance score.
for feature_idx in indexes:
    print("X%d  %f" % (feature_idx, feature_importance[feature_idx]))
# -------------------------------

X2  0.456975
X3  0.162586
X0  0.125703
X5  0.089030
X6  0.054080
X4  0.051660
X1  0.031320
X7  0.028646


## Hyper-parameter Tuning

In [12]:
# XXX
# TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.
# XXX

# -------------------------------
# ADD CODE HERE
# Candidate values for the two hyper-parameters being tuned.
parameters = {'n_estimators': [4, 8, 12, 16], 'max_depth': [2, 5, 10, 25]}
# 10-fold cross-validated grid search scored on accuracy. refit=True retrains the
# best combination on the whole training split so predict() can be used below.
RF_tuning = GridSearchCV(RF, parameters, cv=10, refit=True, scoring='accuracy')
RF_tuning.fit(x_train, y_train)
# accuracy_score takes (y_true, y_pred) in that order.
RF_max_test_accuracy = accuracy_score(y_test, RF_tuning.predict(x_test))
# %.2f formatting matches the other accuracy cells and avoids float artefacts
# that str(100 * round(value, 4)) can produce.
print("Testing accuracy for random forest after tuning: %.2f%%" % (100 * RF_max_test_accuracy))
# -------------------------------

Testing accuracy for random forest after tuning: 98.1%


In [13]:
# XXX
# TODO: Print the best params, using .best_params_
# XXX

# -------------------------------
# ADD CODE HERE
# Parameter combination that achieved the best mean cross-validated score.
print(RF_tuning.best_params_)
# -------------------------------

{'max_depth': 10, 'n_estimators': 12}


In [14]:
# XXX
# TODO: Print the best score, using .best_score_.
# XXX

# -------------------------------
# ADD CODE HERE
# Mean cross-validated accuracy of the best combination; it is measured on folds
# of the training split, not on x_test.
print(RF_tuning.best_score_)
# -------------------------------

0.9787675606641124


# Support Vector Machine

## Pre-process

In [16]:
# XXX
# TODO: Pre-process the data to standardize or normalize it, otherwise the grid search will take much longer.
# Warning: Ignore "FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning"
# XXX

# -------------------------------
# ADD CODE HERE
# normalize() with its defaults rescales each sample (row) to unit L2 norm. It is
# stateless, so the training and test splits are transformed independently.
# NOTE(review): this is row-wise scaling, not per-feature standardization; confirm
# it is intended rather than a StandardScaler fitted on x_train.
x_train_norm, x_test_norm = normalize(x_train), normalize(x_test)
# -------------------------------

In [17]:
# XXX
# TODO: Create a SVC classifier and train it.
# XXX

# -------------------------------
# ADD CODE HERE
# Baseline support-vector classifier with gamma pinned to 'auto', trained on the
# normalized training features.
SVM = SVC(gamma='auto')
SVM.fit(X=x_train_norm, y=y_train)
# -------------------------------

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# XXX
# TODO: Test its accuracy on the training set using the accuracy_score method.
# XXX
# WARNING: Ignore "FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning."

# -------------------------------
# ADD CODE HERE
# Class predictions of the baseline SVM on the normalized training rows.
SVM_y_pred_train = SVM.predict(x_train_norm)
# Same 0.5 threshold as the regression cells, kept so the *_round variable exists.
SVM_y_pred_train_round = [0 if i < 0.5 else 1 for i in SVM_y_pred_train]
# normalize=True asks for the fraction of correct predictions (not the raw count).
SVM_train_accuracy = 100 * accuracy_score(y_train, SVM_y_pred_train_round, normalize=True)
# This value is computed on the training split, so the label must say so.
print("SVM accuracy on the training set: %.2f%%" % SVM_train_accuracy)
# -------------------------------

SVM accuracy on the test set: 96.68%


In [19]:
# XXX
# TODO: Test its accuracy on the test set using the accuracy_score method.
# XXX
# WARNING: Ignore "FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning."

# -------------------------------
# ADD CODE HERE
# Class predictions of the baseline SVM on the normalized held-out rows.
SVM_y_pred_test = SVM.predict(x_test_norm)
# Same 0.5 threshold as the regression cells, kept so the *_round variable exists.
SVM_y_pred_test_round = np.where(SVM_y_pred_test < 0.5, 0, 1).tolist()
# normalize=True asks for the fraction of correct predictions (not the raw count).
SVM_test_accuracy = 100 * accuracy_score(y_test, SVM_y_pred_test_round, normalize=True)
print("SVM accuracy on the test set: %.2f%%" % SVM_test_accuracy)
# -------------------------------

SVM accuracy on the test set: 97.04%


## Hyper-parameter Tuning

In [20]:
# XXX
# TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
# XXX

# -------------------------------
# ADD CODE HERE
# Grid over kernel type and regularisation strength C. The order of the values is
# left unchanged so the per-candidate result arrays keep their positions.
parameters_svm = {'kernel': ['rbf', 'linear'], 'C': [0.001, 0.0001, 0.01, 0.00001]}
# gamma is pinned to 'auto' so the search matches the baseline SVC above and does
# not depend on the library's version-specific default (nor emit a FutureWarning).
SVM_tuning = GridSearchCV(SVC(gamma='auto'), parameters_svm, cv=10, return_train_score=True)
SVM_tuning.fit(x_train_norm, y_train)
# -------------------------------

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kernel': ['rbf', 'linear'], 'C': [0.001, 0.0001, 0.01, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [21]:
# XXX
# TODO: Print the best score, using .best_score_.
# XXX

# -------------------------------
# ADD CODE HERE
# Best parameter combination found by the SVM grid search, followed by its mean
# cross-validated score (measured on folds of the training split, not on x_test).
print(SVM_tuning.best_params_)
print(SVM_tuning.best_score_)
# -------------------------------

{'C': 0.01, 'kernel': 'linear'}
0.9545019157088123


# Support Vector Machine

In [22]:
# XXX
# TODO: Calculate the training and test set accuracy values after hyperparameter tuning and normalization. 
# XXX

# -------------------------------
# ADD CODE HERE
# Accuracy of the refitted best SVM on the normalized training and test splits.
# accuracy_score takes (y_true, y_pred) in that order; the result is a fraction.
SVM_tuning_train_accuracy = accuracy_score(y_train, SVM_tuning.predict(x_train_norm))
SVM_tuning_test_accuracy = accuracy_score(y_test, SVM_tuning.predict(x_test_norm))
# -------------------------------

In [23]:
# XXX
# TODO: Test its accuracy (on the training set) using the accuracy_score method. Print the result
# XXX

# -------------------------------
# ADD CODE HERE
# %.2f formatting matches the other accuracy cells and avoids float artefacts
# that str(100 * round(value, 4)) can produce.
print("Training accuracy for SVM after tuning: %.2f%%" % (100 * SVM_tuning_train_accuracy))
# -------------------------------

Training accuracy for SVM after tuning: 95.46%


In [24]:
# XXX
# TODO: Test its accuracy (on the testing set) using the accuracy_score method. Print the result
# XXX

# -------------------------------
# ADD CODE HERE
# %.2f formatting matches the other accuracy cells and avoids float artefacts
# that str(100 * round(value, 4)) can produce.
print("Testing accuracy for SVM after tuning: %.2f%%" % (100 * SVM_tuning_test_accuracy))
# -------------------------------

Testing accuracy for SVM after tuning: 95.59%


In [25]:
# XXX
# TODO: Calculate the rank test score, mean testing score and mean fit time for
# all the hyperparameter values that you obtained in Q3.2. The GridSearchCV
# class holds a ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
# ADD CODE HERE
# cv_results_ maps metric names to arrays with one entry per grid candidate.
result = SVM_tuning.cv_results_
# Pull out the three metrics that the following cells report.
rank_test_score, mean_test_score, mean_fit_time = (
    result[key] for key in ('rank_test_score', 'mean_test_score', 'mean_fit_time')
)
# -------------------------------

In [26]:
# XXX
# TODO: Print the rank test score for all hyperparameter values that you obtained in Q3.2. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
# ADD CODE HERE
# One rank per grid candidate; rank 1 marks the best mean test score.
print('The Rank Test Score for all hyperparameter values are ', rank_test_score)
# -------------------------------

The Rank Test Score for all hyperparameter values are  [2 2 2 2 2 1 2 2]


In [27]:
# XXX
# TODO: Print mean testing score for all of hyperparameter values that you obtained in Q3.2. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
# ADD CODE HERE
# One value per grid candidate: the validation score averaged over the CV folds.
print('The mean testing score for all hyperparameter values are ', mean_test_score)
# -------------------------------

The mean testing score for all hyperparameter values are  [0.90876437 0.90876437 0.90876437 0.90876437 0.90876437 0.95450192
 0.90876437 0.90876437]


In [28]:
# XXX
# TODO: Print mean fit time for all of hyperparameter values that you obtained in Q3.2. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
# ADD CODE HERE
# One value per grid candidate: the fit time averaged over the CV folds.
print('The mean fit time for all hyperparameter values are ', mean_fit_time)
# -------------------------------

The mean fit time for all hyperparameter values are  [1.0107832  0.58414142 1.01308992 0.53511817 0.98111222 0.55904825
 0.96959028 0.52344933]


# PCA

In [29]:
# XXX
# TODO: Perform dimensionality reduction of the data using PCA.
#       Set the parameter n_components to 8 and svd_solver to 'full'. Keep the other parameters at their default values.
# XXX

# -------------------------------
# ADD CODE HERE
PCA = PCA(n_components=8, svd_solver='full')
PCA.fit(x_train, y_train)  
# -------------------------------

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [30]:
# XXX
# TODO: Print Percentage of variance explained by each of the selected components
# XXX

# -------------------------------
# ADD CODE HERE
print(PCA.explained_variance_ratio_) 
# -------------------------------

[6.94399204e-01 2.53912066e-01 2.85011856e-02 1.02529970e-02
 6.89204024e-03 5.93958263e-03 9.07076820e-05 1.22170354e-05]


In [31]:
# XXX
# TODO: The singular values corresponding to each of the selected components.
# XXX

# -------------------------------
# ADD CODE HERE
print(PCA.singular_values_)  
# -------------------------------

[40.02811972 24.20484202  8.10946206  4.86391523  3.98781025  3.70201818
  0.45749107  0.16789722]
