# Q3 Using Scikit-Learn

In [1]:
import numpy as np
import pandas as pd
import time
import gc

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
# Import statements run before running other code cells

# Classifier Setup

In [2]:
# XXX
# TODO: Read in all the data. Replace the 'xxx' with the path to the data set.
# XXX

# -------------------------------
data = pd.read_csv('pulsar_stars.csv')

# Separate out the x_data and y_data.
x_data = data.loc[:, data.columns != "y"]
y_data = data.loc[:, "y"]
# -------------------------------

In [3]:
# The random state to use while splitting the data.
# random_state = 100

# XXX
# TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.
# Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 100.
# XXX

# -------------------------------
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.30, random_state=100)
# -------------------------------

# Linear Regression 

In [4]:
# XXX
# TODO: Create a LinearRegression classifier and train it.
# XXX

# -------------------------------
reg= LinearRegression().fit(x_train, y_train)
# -------------------------------

In [5]:
# XXX
# TODO: Test its accuracy (on the training set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
# XXX

# -------------------------------
train_pred= reg.predict(x_train)
train_pred[train_pred >=0.5]=1
train_pred[train_pred <0.5]=0
accuracy=accuracy_score(train_pred,y_train)
print(accuracy)
# -------------------------------

0.970705619412516


In [6]:
# XXX
# TODO: Test its accuracy (on the testing set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
# XXX

# -------------------------------
test_pred= reg.predict(x_test)
test_pred[test_pred >=0.5]=1
test_pred[test_pred <0.5]=0
accuracy=accuracy_score(test_pred,y_test)
print(accuracy)
# -------------------------------

0.9720670391061452


# Random Forest Classifier

In [7]:
# XXX
# TODO: Create a RandomForestClassifier and train it.
# WARNING: Ignore "FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22."10 in version 0.20 to 100 in 0.22."
# XXX

# -------------------------------
clf = RandomForestClassifier()
clf.fit(x_train, y_train) 
# -------------------------------



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
# XXX
# TODO: Test its accuracy on the training set using the accuracy_score method.
# XXX

# -------------------------------
train_pred= clf.predict(x_train)
accuracy= accuracy_score(train_pred,y_train)
print(accuracy)
# -------------------------------

0.9960089399744572


In [9]:
# XXX
# TODO: Test its accuracy on the test set using the accuracy_score method.
# XXX

# -------------------------------
test_pred= clf.predict(x_test)
accuracy= accuracy_score(test_pred,y_test)
print(accuracy)
# -------------------------------

0.9817504655493482


## Feature Importance

In [10]:
# XXX
# TODO: Determine and print the feature importance as evaluated by the Random Forest Classifier.
# XXX

# -------------------------------
importances=clf.feature_importances_
print(importances)
# -------------------------------

[0.11075096 0.08272287 0.17523351 0.37996186 0.06546062 0.08687811
 0.06604751 0.03294456]


In [11]:
# XXX
# TODO: Sort them in the descending order and print the feature numbers[0 to ...].
#       Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.
# XXX

# -------------------------------
imp_args=np.argsort(importances)[::-1] 
print(imp_args)
# -------------------------------

[3 2 0 5 1 6 4 7]


## Hyper-parameter Tuning

In [12]:
# XXX
# TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.
# XXX
# -------------------------------
md=np.arange(17, 26, 1)
ne = np.arange(40,200, 40)
parameters = {'n_estimators':ne, 'max_depth':md}
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters, cv=10)
clf.fit(x_train, y_train)
# -------------------------------

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([ 40,  80, 120, 160]), 'max_depth': array([17, 18, 19, 20, 21, 22, 23, 24, 25])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# XXX
# TODO: Print the best params, using .best_params_
# XXX

# -------------------------------
best_p= clf.best_params_
print(best_p)
# -------------------------------

{'max_depth': 17, 'n_estimators': 160}


In [14]:
# XXX
# TODO: Print the best score, using .best_score_.
# XXX

# -------------------------------
best_s= clf.best_score_
print(best_s)
# -------------------------------

0.979007024265645


# Support Vector Machine

## Pre-process

In [15]:
# XXX
# TODO: Pre-process the data to standardize or normalize it, otherwise the grid search will take much longer.
# Warning: Ignore "FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning"
# XXX

# -------------------------------
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)
# -------------------------------

In [16]:
# XXX
# TODO: Create a SVC classifier and train it.
# XXX

# -------------------------------
clf = SVC()
clf.fit(x_train_norm, y_train)
# -------------------------------



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [17]:
# XXX
# TODO: Test its accuracy on the training set using the accuracy_score method.
# XXX
# WARNING: Ignore "FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.""

# -------------------------------
train_pred= clf.predict(x_train_norm)
accuracy= accuracy_score(train_pred,y_train)
print(accuracy)
# -------------------------------

0.966794380587484


In [18]:
# XXX
# TODO: Test its accuracy on the test set using the accuracy_score method.
# XXX
# WARNING: Ignore "FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.""

# -------------------------------
test_pred= clf.predict(x_test_norm)
accuracy= accuracy_score(test_pred,y_test)
print(accuracy)
# -------------------------------

0.970391061452514


## Hyper-parameter Tuning

In [19]:
# XXX
# TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
# XXX

# -------------------------------
parameters = {'kernel':('rbf', 'linear'), 'C':[0.01,0.1,1,10,100]}
svm_clf = SVC()
clf = GridSearchCV(svm_clf, parameters, cv=10)
clf.fit(x_train_norm, y_train)
# -------------------------------





GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('rbf', 'linear'), 'C': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
# XXX
# TODO: Print the best score, using .best_score_.
# XXX

# -------------------------------
best_s= clf.best_score_
print(best_s)
# -------------------------------

0.9740581098339719


In [21]:
# XXX
# TODO: Get the training and test set accuracy values after hyperparameter tuning.
# XXX

# -------------------------------
train_pred=clf.predict(x_train_norm)
test_pred=clf.predict(x_test_norm)
# -------------------------------

In [22]:
# XXX
# TODO: Test its accuracy (on the training set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
# XXX

# -------------------------------
train_pred[train_pred >=0.5]=1
train_pred[train_pred <0.5]=0
accuracy=accuracy_score(train_pred,y_train)
print(accuracy)
# -------------------------------

0.9742177522349936


In [23]:
# XXX
# TODO: Test its accuracy (on the testing set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
# XXX

# -------------------------------
test_pred[test_pred >=0.5]=1
test_pred[test_pred <0.5]=0
accuracy=accuracy_score(test_pred,y_test)
print(accuracy)
# -------------------------------

0.9774674115456239


In [24]:
# XXX
# TODO: Calculate the rank test score, mean testing score and mean fit time for the 
# best combination of hyperparameter values that you obtained in Q3.2. The GridSearchCV 
# class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
index=clf.cv_results_['params'].index(clf.best_params_)

rts= clf.cv_results_['rank_test_score'][index]
mts= clf.cv_results_['mean_test_score'][index]
mft= clf.cv_results_['mean_fit_time'][index]
# -------------------------------

In [25]:
# XXX
# TODO: Print the rank test score for the best combination of hyperparameter values that you obtained in Q3.2. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
print(rts)
# -------------------------------

1


In [26]:
# XXX
# TODO: Print mean testing score for the best combination of hyperparameter values that you obtained in Q3.2. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
print(mts)
# -------------------------------

0.9740581098339719


In [27]:
# XXX
# TODO: Print mean fit time for the best combination of hyperparameter values that you obtained in Q3.2. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
print(mft)
# -------------------------------

0.21341090202331542


# PCA

In [28]:
# XXX
# TODO: Perform dimensionality reduction of the data using PCA.
#       Set parameters n_component to 8 and svd_solver to 'full'. Keep other parameters at their default value.
# XXX

# -------------------------------
pca = PCA(n_components=8, svd_solver='full')
pca.fit(x_train)
# -------------------------------

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [29]:
# XXX
# TODO: Print Percentage of variance explained by each of the selected components
# XXX

# -------------------------------
per_var=pca.explained_variance_ratio_
print(per_var)
# -------------------------------

[8.71054855e-01 7.80404050e-02 4.12747422e-02 6.15520409e-03
 2.45425694e-03 9.78533643e-04 3.91429289e-05 2.85984066e-06]


In [30]:
# XXX
# TODO: The singular values corresponding to each of the selected components.
# XXX

# -------------------------------
sv=pca.singular_values_
print(sv)
# -------------------------------

[11992.97512747  3589.74779466  2610.63625426  1008.15059028
   636.59642313   401.9685368     80.39533321    21.73076915]
