# Classification - Part 2 - Problem 1

In [1]:
# Number of concurrent jobs
n_jobs = 4

# Load the data

In [2]:
import numpy as np

X_train_ini = np.load('Xtrain_Classification_Part1.npy')
Y_train_ini = np.load('Ytrain_Classification_Part1.npy')
X_test_out = np.load('Xtest_Classification_Part1.npy')


# Standardize features by removing the mean and scaling to unit variance.

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train_ini)
scaler.fit(X_test_out)

x_train_scaled = scaler.transform(X_train_ini)
x_test_out_scaled = scaler.transform(X_test_out)

# Run several models using lazypredict library

In [4]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import pickle as pik

X_train, X_test, y_train, y_test = train_test_split(
    x_train_scaled, Y_train_ini, test_size=.2, random_state=42)

# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

with open(r"lazyModels.pik", "wb") as output_file:
    pik.dump(models, output_file)


100%|██████████| 29/29 [11:55<00:00, 24.68s/it]


In [5]:
# Printing all model performances
models


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.86,0.85,0.85,0.86,67.05
NuSVC,0.85,0.85,0.85,0.85,84.02
XGBClassifier,0.85,0.84,0.84,0.85,85.56
LogisticRegression,0.84,0.84,0.84,0.84,2.9
LGBMClassifier,0.84,0.83,0.83,0.84,43.9
Perceptron,0.83,0.82,0.82,0.83,1.34
RidgeClassifierCV,0.83,0.82,0.82,0.83,25.58
CalibratedClassifierCV,0.82,0.81,0.81,0.82,67.77
SGDClassifier,0.82,0.81,0.81,0.82,3.2
LinearSVC,0.81,0.8,0.8,0.81,17.83


# Define cross-validation method

In [6]:
from sklearn.model_selection import RepeatedKFold

random_state = 1

# define model evaluation method (repeats k-folds n times, with k-folds=n_splits and n=n_repeats)
#cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=random_state)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=random_state)


# SVM classification model

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pickle as pik

svc_model = SVC(random_state=1)

# Tuning: 
# parameters = {'kernel': ['rbf', 'poly'],
#               'C': [0.5, 0.9, 1.0, 10, 100],
#               'degree': [1, 2, 3],
#               'gamma': [ 0.01, 0.001, 0.0001]}
# Result:
# best_params_: {'C': 10, 'degree': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
# best_score_: 0.8687789799072643

# After tuning with scoring='accuracy'
# best_params_: {'C': 30, 'degree': 1, 'gamma': 9.2e-05, 'kernel': 'rbf'}
# best_score_: 0.8726429675425038

parameters = {'kernel': ['rbf'],
              'C': [30],
              'degree': [1],
              'gamma': [0.000092]}

# scoring='balanced_accuracy'
svc_grid = GridSearchCV(svc_model, parameters,
                        scoring='balanced_accuracy', 
                        cv=cv, 
                        verbose=1, 
                        n_jobs=n_jobs)
svc_grid.fit(x_train_scaled, Y_train_ini)

with open(r"svc_grid.pik", "wb") as output_file:
    pik.dump(svc_grid, output_file)


Fitting 30 folds for each of 1 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 10.6min finished


In [8]:
print('best_params_:', svc_grid.best_params_)
print('best_estimator_:', svc_grid.best_estimator_)
print('best_score_:', svc_grid.best_score_)


best_params_: {'C': 30, 'degree': 1, 'gamma': 9.2e-05, 'kernel': 'rbf'}
best_estimator_: SVC(C=30, degree=1, gamma=9.2e-05, random_state=1)
best_score_: 0.8693478959787003


# Run SVM NuSVC model

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import NuSVC
import pickle as pik

nusvc_model = NuSVC(random_state=1)

# Parameters after tuning
parameters = {'kernel': ['rbf'],
              'nu': [0.3],
              'degree': [1],
              'gamma': [0.0001]}

# scoring='balanced_accuracy'
nusvc_grid = GridSearchCV(nusvc_model, parameters,
                          scoring='balanced_accuracy', 
                          cv=cv, 
                          verbose=1, 
                          n_jobs=n_jobs)
nusvc_grid.fit(x_train_scaled, Y_train_ini)

with open(r"nusvc_grid.pik", "wb") as output_file: 
    pik.dump(nusvc_grid, output_file)

Fitting 30 folds for each of 1 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 13.5min finished


In [10]:
print('best_params_:', nusvc_grid.best_params_)
print('best_estimator_:', nusvc_grid.best_estimator_)
print('best_score_:', nusvc_grid.best_score_)

best_params_: {'degree': 1, 'gamma': 0.0001, 'kernel': 'rbf', 'nu': 0.3}
best_estimator_: NuSVC(degree=1, gamma=0.0001, nu=0.3, random_state=1)
best_score_: 0.8690068537450027


# Run XGBoost model

In [11]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import pickle as pik

xgb_model = XGBClassifier(random_state=1)

parameters = {'objective':['binary:logistic']}

# scoring='balanced_accuracy'
xgb_grid = GridSearchCV(xgb_model, parameters, scoring='balanced_accuracy', cv=cv, verbose=1, n_jobs=n_jobs)
xgb_grid.fit(x_train_scaled, Y_train_ini)

with open(r"xgb_grid.pik", "wb") as output_file:
    pik.dump(xgb_grid, output_file)

Fitting 30 folds for each of 1 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 42.1min finished


In [12]:
print('best_params_:', xgb_grid.best_params_)
print('best_estimator_:', xgb_grid.best_estimator_)
print('best_score_:', xgb_grid.best_score_)

best_params_: {'objective': 'binary:logistic'}
best_estimator_: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
best_score_: 0.8432984173769889


# Voting method with 3 best models using predicted probability

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, NuSVC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

clf1 = SVC(probability=True, random_state=1)
clf2 = NuSVC(probability=True, random_state=1)

# binary:logistic – logistic regression for binary classification, returns predicted probability (not class)
clf3 = XGBClassifier(objective='binary:logistic', random_state=1)

eclf2 = VotingClassifier(estimators=[('svc', clf1), ('nusvc', clf2), ('xgb', clf3)], 
                         n_jobs=n_jobs, voting='soft')

# best_params_: {'C': 30, 'degree': 1, 'gamma': 9.2e-05, 'kernel': 'rbf'}
# best_estimator_: SVC(C=30, degree=1, gamma=9.2e-05, random_state=1)
# best_score_: 0.8726429675425038

# best_params_: {'degree': 1, 'gamma': 0.00015, 'kernel': 'rbf', 'nu': 0.3}
# best_estimator_: NuSVC(degree=1, gamma=0.00015, nu=0.3, random_state=1)
# best_score_: 0.8700154559505411

params = {'svc__C': [30], 'svc__degree':[1], 'svc__gamma': [0.000092], 'svc__kernel':['rbf'],
          'nusvc__nu': [0.3], 'nusvc__degree': [1], 'nusvc__gamma': [0.0001]} #'nusvc__gamma': [0.00015]

# scoring='balanced_accuracy'
grid2 = GridSearchCV(estimator=eclf2, param_grid=params,
                     scoring='balanced_accuracy', 
                     cv=cv, n_jobs=n_jobs)

grid2.fit(x_train_scaled, Y_train_ini)

with open(r"grid_voting2.pik", "wb") as output_file:
    pik.dump(grid2, output_file)


In [14]:
print('best_params_:', grid2.best_params_)
print('best_estimator_:', grid2.best_estimator_)
print('best_score_:', grid2.best_score_)

best_params_: {'nusvc__degree': 1, 'nusvc__gamma': 0.0001, 'nusvc__nu': 0.3, 'svc__C': 30, 'svc__degree': 1, 'svc__gamma': 9.2e-05, 'svc__kernel': 'rbf'}
best_estimator_: VotingClassifier(estimators=[('svc',
                              SVC(C=30, degree=1, gamma=9.2e-05,
                                  probability=True, random_state=1)),
                             ('nusvc',
                              NuSVC(degree=1, gamma=0.0001, nu=0.3,
                                    probability=True, random_state=1)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interac..._constraints=None

# Output: Best result was achieved with voting for 3 best models 

In [15]:
from sklearn.model_selection import GridSearchCV
# import pickle as pik
# with open(r"svc_grid.pik", "rb") as f:
#     svc_grid = pik.load(f)

y_test = grid2.predict(x_test_out_scaled)
np.save('Ytest_Classification_Part1.npy', y_test)