In [11]:
import pandas as pd
import numpy as np

In [12]:
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')
test = pd.read_csv('test.csv')

In [3]:
print(train.shape)
print(train[train['label_4'].notna()].shape)

train = pd.DataFrame(train)
columns_with_missing_values = train.columns[train.isnull().any()]
print(columns_with_missing_values)

(28520, 772)
(28520, 772)
Index(['label_2'], dtype='object')


In [4]:
print(valid.shape)
print(valid[valid['label_4'].notna()].shape)

valid = pd.DataFrame(valid)

columns_with_missing_values = valid.columns[valid.isnull().any()]
print(columns_with_missing_values)

(750, 772)
(750, 772)
Index(['label_2'], dtype='object')


So, there are NO MISSING VALUES in both train and valid data set for label 4

In [13]:
train_X = train.copy()
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train['label_4']

valid_X = valid.copy()
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid['label_4']

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

Model training before feature engineering

Training the model using SVM

In [9]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(train_X, train_y)

pred_y = svc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.96

Training the model using XGB

In [10]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()

train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X, train_y_le)

pred_y = xgb_model.predict(valid_X)
accuracy_score(valid_y_le, pred_y)

0.9106666666666666

Accuracy scores before feature engineering,

SVM => 0.9600, 
XGB => 0.9106

therefore before feature engineering,
SVM > XGB

Feature Engineering and Model training

(i) Correlation-Based Feature Selection (CFS)

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

selected_features = [500]
accuracies = []

for selected_feature in selected_features:
    feature_selector = SelectKBest(score_func=f_classif, k=selected_feature)

    feature_selector.fit(train_X, train_y)

    train_X_new = feature_selector.transform(train_X)
    valid_X_new = feature_selector.transform(valid_X)
    test_X_new = feature_selector.transform(test_X)

    svm_new = SVC(kernel='linear')
    svm_new.fit(train_X_new, train_y)

    pred_y = svm_new.predict(valid_X_new)

    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)

results = pd.DataFrame({'selected_features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,selected_features,accuracy
0,500,0.950667


(ii) Principal Component Analysis (PCA)

In [17]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

components = [0.975]
selected_features = []
accuracies = []

for component in components:
    pca = PCA(component)
    pca.fit(train_X)

    train_X_new = pca.transform(train_X)
    valid_X_new = pca.transform(valid_X)
    selected_features.append(train_X_new.shape[1])

    svm_new = SVC(kernel='linear')
    svm_new.fit(train_X_new, train_y)

    pred_y = svm_new.predict(valid_X_new)
    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'components': components, 'selected features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,components,selected features,accuracy
0,0.975,390,0.946667


Highest accuracies of each feature engineering methods

(i) Correlation-Based Feature Selection (CFS) => accuracy = 0.9506 => new features = 500, 
(ii) PCA => accuracy = 0.9466 => new features = 390, 

therefore Correlation-Based Feature Selection (CFS) looks like the better method for feature engineering for label 4

In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

feature_selector = SelectKBest(score_func=f_classif, k=500)
feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [17]:
from sklearn.decomposition import PCA

pca = PCA(0.975)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

In [18]:
train_X_new.shape

(28520, 390)

Hyper-parameter Optimization using Randomized Search

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

svm = SVC()
param_distr = {'C':[0.1,1,10], 'kernel':['poly', 'linear']}

randomized_search = RandomizedSearchCV(estimator=svm, param_distributions=param_distr, n_iter=6, cv=3)
randomized_search.fit(train_X_new, train_y)

In [20]:
randomized_search.best_params_

{'kernel': 'poly', 'C': 10}

In [21]:
best_svm_model = randomized_search.best_estimator_

Cross Validation

In [22]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_svm_model, train_X_new, train_y, cv=kf, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

Cross-Validation Scores: [0.96406031 0.96528752 0.96809257 0.96423562 0.9673913 ]
Mean Accuracy = 0.9658134642356242
Standard Deviation = 0.0016446058063896989


Predicting label 4 values for test data set

In [23]:
pred_test_y = best_svm_model.predict(test_X_new)
pred_test_y.shape

(744,)

In [24]:
test_X_new.shape

(744, 390)

In [25]:
rows, columns = test_X_new.shape

new_columns = {
    'label_4' : pred_test_y
}

data_frame = pd.DataFrame(new_columns)

csv_file_path = 'test_label_4_new.csv'
data_frame.to_csv(csv_file_path, index=False)

Combining all the labels into one csv file

In [26]:
label_1 = pd.read_csv('test_label_1_new.csv')
label_2 = pd.read_csv('test_label_2_new.csv')
label_3 = pd.read_csv('test_label_3_new.csv')
label_4 = pd.read_csv('test_label_4_new.csv')

rows = label_1.shape[0]
columns = 5

table = {
    'ID': (str(i) for i in range(1, rows+1)),
    'label_1': label_1['label_1'],
    'label_2': label_2['label_2'],
    'label_3': label_3['label_3'],
    'label_4': label_4['label_4']
}

labels = pd.DataFrame(table)
csv_file_path = 'layer_10_190297X_new.csv'
labels.to_csv(csv_file_path, index=False)

In [18]:
layer_10 = pd.read_csv(csv_file_path)
layer_10.shape

(744, 5)