In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')
test = pd.read_csv('test.csv')

In [3]:
print(train.shape)
print(train[train['label_2'].notna()].shape)

train = pd.DataFrame(train)
columns_with_missing_values = train.columns[train.isnull().any()]
print(columns_with_missing_values)

(28520, 772)
(28040, 772)
Index(['label_2'], dtype='object')


In [4]:
print(valid.shape)
print(valid[valid['label_2'].notna()].shape)

valid = pd.DataFrame(valid)

columns_with_missing_values = valid.columns[valid.isnull().any()]
print(columns_with_missing_values)

(750, 772)
(736, 772)
Index(['label_2'], dtype='object')


So, there ARE MISSING VALUES in both train and valid data set for label 2

Handling missing values by deletion

In [3]:
train_X = train[train['label_2'].notna()]                                              # NaN values are dropped
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train[train['label_2'].notna()]['label_2']

valid_X = valid[valid['label_2'].notna()]                                             # NaN values are dropped
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid[valid['label_2'].notna()]['label_2']

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)


Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

Model training before feature engineering

Training the model using SVM

In [9]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(train_X, train_y)

pred_y = svc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9375

Training the model using XGB

In [10]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()

train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X, train_y_le)

pred_y = xgb_model.predict(valid_X)
accuracy_score(valid_y_le, pred_y)

0.8777173913043478

Accuracy scores before feature engineering,

SVM => 0.9375, 
XGB => 0.8777

therefore before feature engineering,
SVM > XGB

Feature Engineering and Model training

(i) Correlation-Based Feature Selection (CFS)

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

selected_features = [500]
accuracies = []

for selected_feature in selected_features:
    feature_selector = SelectKBest(score_func=f_classif, k=selected_feature)

    feature_selector.fit(train_X, train_y)

    train_X_new = feature_selector.transform(train_X)
    valid_X_new = feature_selector.transform(valid_X)
    test_X_new = feature_selector.transform(test_X)

    svm_new = SVC(kernel='linear')
    svm_new.fit(train_X_new, train_y)

    pred_y = svm_new.predict(valid_X_new)

    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)

results = pd.DataFrame({'selected_features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,selected_features,accuracy
0,500,0.828804


(ii) Principal Component Analysis (PCA)

In [19]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

components = [0.99]
selected_features = []
accuracies = []

for component in components:
    pca = PCA(component)
    pca.fit(train_X)

    train_X_new = pca.transform(train_X)
    valid_X_new = pca.transform(valid_X)
    test_X_new = pca.transform(test_X)
    selected_features.append(train_X_new.shape[1])

    svm_new = SVC(kernel='linear')
    svm_new.fit(train_X_new, train_y)

    pred_y = svm_new.predict(valid_X_new)
    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'components': components, 'selected features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,components,selected features,accuracy
0,0.99,528,0.830163


Highest accuracies of each feature engineering methods

(i) Correlation-Based Feature Selection (CFS) => accuracy = 0.8288 => new features = 500, 
(ii) PCA => accuracy = 0.8302 => new features = 528

therefore Correlation-Based Feature Selection (CFS) looks like the better method for feature engineering for label 3

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selected_features = 500

feature_selector = SelectKBest(score_func=f_classif, k=selected_features)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)


In [19]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

pca = PCA(0.99)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

In [20]:
train_X_new.shape

(28040, 528)

Hyper-parameter Optimization using Randomized Search

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

svm = SVC()
param_distr = {'C':[0.1,1,10], 'kernel':['poly', 'linear']}

randomized_search = RandomizedSearchCV(estimator=svm, param_distributions=param_distr, n_iter=6, cv=3)
randomized_search.fit(train_X_new, train_y)

In [22]:
randomized_search.best_params_

{'kernel': 'poly', 'C': 10}

In [23]:
best_svm_model = randomized_search.best_estimator_

Cross Validation

In [27]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_svm_model, train_X_new, train_y, cv=kf, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

Cross-Validation Scores: [0.94062054 0.94222539 0.94240371 0.93794579 0.93919401]
Mean Accuracy = 0.9404778887303852
Standard Deviation = 0.0017229489770586508


Predicting label 4 values for test data set

In [24]:
pred_test_y = best_svm_model.predict(test_X_new)
pred_test_y.shape

(744,)

In [25]:
test_X_new.shape

(744, 528)

In [26]:
rows, columns = test_X_new.shape

new_columns = {
    'label_2' : pred_test_y
}

data_frame = pd.DataFrame(new_columns)

csv_file_path = 'test_label_2_new.csv'
data_frame.to_csv(csv_file_path, index=False)