In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('layer_9_train.csv')
valid = pd.read_csv('layer_9_valid.csv')
test = pd.read_csv('layer_9_test.csv')

In [3]:
train.shape

(28520, 772)

In [4]:
train[train['label_1'].notna()].shape

(28520, 772)

So, there are NO MISSING VALUES in the train data set for label 1

In [6]:
train_X = train.copy()
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train['label_1']

valid_X = valid.copy()
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid['label_1']

test_X = test.copy()
#test_X.drop(['ID'], axis=1, inplace=True)

Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

Training the model using SVM

In [10]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')
svc.fit(train_X, train_y)

pred_y = svc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9613333333333334

Training the model using KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(train_X, train_y)

pred_y = knn.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.8333333333333334

Training the model using Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(train_X, train_y)

pred_y = rfc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.876

Training the model using XGB

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

In [14]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X, train_y_le)

pred_y = xgb_model.predict(valid_X)
accuracy_score(valid_y_le, pred_y)

0.9

Accuracy scores before feature engineering,

SVM => 0.9613, 
KNN => 0.8333, 
Random Forest => 0.8866, 
XGB => 0.9000

therefore before feature engineering,
SVM > XGB > Random Forest > KNN

Feature Engineering and Model training

(i) Correlation-Based Feature Selection (CFS)

In [15]:
train_X.shape

(28520, 768)

In [16]:
test_X.shape

(750, 772)

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

num_features_to_select = 500
feature_selector = SelectKBest(score_func=f_classif, k=num_features_to_select)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

svc_new = SVC(kernel='linear')
svc_new.fit(train_X_new, train_y)

pred_y = svc_new.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9493333333333334

(ii) Variance-based Feature Selection

In [35]:
from sklearn.feature_selection import VarianceThreshold

feature_selector = VarianceThreshold(1.0)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [36]:
train_X_new.shape

(28520, 239)

In [30]:
from sklearn.svm import SVC

svc_new = SVC(kernel='linear')
svc_new.fit(train_X_new, train_y)

pred_y = svc_new.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9053333333333333

(iii) Principal Component Analysis (PCA)

In [15]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

pca = PCA(0.97)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

svm_new = SVC(kernel='linear')
svm_new.fit(train_X_new, train_y)

pred_y = svm_new.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9466666666666667

In [17]:
train_X_new.shape

(28520, 401)

Highest accuracies of each feature engineering methods

(i) Correlation-Based Feature Selection (CFS) => accuracy = 0.9493 => new features = 500, 
(ii) Variance-based Feature Selection => accuracy = 0.9053 => new features = 239, 
(iii) PCA => accuracy = 0.9466 => new features = 401, 

therefore Correlation-Based Feature Selection (CFS) looks like the better method for feature engineering for label 1

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

num_features_to_select = 500
feature_selector = SelectKBest(score_func=f_classif, k=num_features_to_select)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [35]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

pca = PCA(0.97)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

In [36]:
train_X_new.shape

(28520, 401)

Hyper-parameter Optimization using Randomized Search

In [39]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

svm = SVC()
param_distr = {'C':[0.1,1,10], 'kernel':['poly', 'linear']}

randomized_search = RandomizedSearchCV(estimator=svm, param_distributions=param_distr, n_iter=5, cv=2)
randomized_search.fit(train_X_new, train_y)

In [40]:
randomized_search.best_params_

{'kernel': 'linear', 'C': 10}

In [41]:
best_svm_model = randomized_search.best_estimator_

Cross Validation

In [45]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_svm_model, train_X_new, train_y, cv=kf, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

Cross-Validation Scores: [0.94284712 0.94232118 0.94547686 0.93723703 0.93863955]
Mean Accuracy = 0.941304347826087
Standard Deviation = 0.0029822203611451535


Predicting label 1 values for test data set

In [42]:
pred_test_y = best_svm_model.predict(test_X_new)
pred_test_y.shape

(744,)

In [43]:
test_X_new.shape

(744, 401)

In [44]:
rows, columns = test_X_new.shape

new_columns = {
    'label_1' : pred_test_y
}

data_frame = pd.DataFrame(new_columns)

csv_file_path = 'test_label_1_new.csv'
data_frame.to_csv(csv_file_path, index=False)