In [1]:
import numpy as np
import pandas as pd 
import os 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve
from sklearn.linear_model import Lasso

In [2]:
# Set the max number of columns to be displayed in head
pd.set_option('display.max_columns', None)

# or specify the number of columns
# pd.options.display.max_columns = 5

# Get the current working directory
cwd = os.getcwd()

# Print the current working directory
print("Current working directory: {0}".format(cwd))

# Get the current working directory
cwd = os.getcwd()

# Print the current working directory
print("Current working directory: {0}".format(cwd))
#%%

Current working directory: /Users/hesurina/Desktop
Current working directory: /Users/hesurina/Desktop


In [3]:
# Read the data in
hsls = pd.read_csv('hsls_final3.csv',
                   low_memory=False,
                   encoding='utf-8',
                   index_col=0)

print(hsls)

       X4PSENRSTLV  X3TGPATOT   X1SES  S1FRNDCLG
1                2        3.5  1.5644          1
2                0        4.0 -0.3699          2
3                2        3.0  1.2741          1
5                0        2.5 -0.4300          1
6                2        3.5  1.5144          1
...            ...        ...     ...        ...
10183            2        4.0  1.2556          1
10184            2        4.0  1.0526          1
10185            2        4.0  1.2033          1
10187            1        2.5 -0.0649          1
10188            1        2.0  0.8512          1

[8367 rows x 4 columns]


In [4]:
y = hsls.loc[:,"X4PSENRSTLV"]
print(y)

1        2
2        0
3        2
5        0
6        2
        ..
10183    2
10184    2
10185    2
10187    1
10188    1
Name: X4PSENRSTLV, Length: 8367, dtype: int64


In [5]:
# Splitting the dataset
X = hsls.loc[:,hsls.columns !='X4PSENRSTLV']
y = hsls.loc[:,"X4PSENRSTLV"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=y)

In [6]:
# Initializing Classifiers
clf1 = LogisticRegression(multi_class='multinomial',
                          solver='newton-cg',
                          random_state=1)
clf2 = SVC(random_state=1)

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('clf1', clf1)])
pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2', clf2)])

In [7]:
# Setting up the parameter grids
param_grid1 = [{'clf1__penalty': ['l2'],
                'clf1__C': np.power(10., np.arange(-4, 4))}]
param_grid2 = [{'clf2__kernel': ['rbf'],
                'clf2__C': np.power(10., np.arange(-4, 4)),
                'clf2__gamma': np.power(10., np.arange(-5, 0))},
               {'clf2__kernel': ['linear'],
                'clf2__C': np.power(10., np.arange(-4, 4))}]

In [8]:
# Setting up inner loop
# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) 

for pgrid, est, name in zip((param_grid1, param_grid2),
                            (pipe1, pipe2),
                            ('Logistic_regression','SVM')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=-1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv
    gcv.fit(X_train, y_train)
    best_parameters = gcv.best_params_
    print(best_parameters)
    best_result = gcv.best_score_
    print(best_result)
    # plot = validation_curve(est, X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3),cv=5)

{'clf1__C': 1.0, 'clf1__penalty': 'l2'}
0.6850426048040225
{'clf2__C': 1000.0, 'clf2__gamma': 0.1, 'clf2__kernel': 'rbf'}
0.6845936161448374


In [9]:
## We can skip the next step because we set refit=True
## so scikit-learn has already fit the model to the
## whole training set

# best_model.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=gcv.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=gcv.predict(X_test))

print('Accuracy %.2f%% (average over k-fold CV test folds)' %
      (100 * gcv.best_score_))
print('Best Parameters: %s' % gcv.best_params_)

print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Accuracy 68.46% (average over k-fold CV test folds)
Best Parameters: {'clf2__C': 1000.0, 'clf2__gamma': 0.1, 'clf2__kernel': 'rbf'}
Training Accuracy: 68.71%
Test Accuracy: 67.92%
