In [21]:
import pandas as pd
import numpy as np

In [22]:
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')
test = pd.read_csv('test.csv')

In [4]:
print(train.shape)
print(train[train['label_3'].notna()].shape)

train = pd.DataFrame(train)
columns_with_missing_values = train.columns[train.isnull().any()]
print(columns_with_missing_values)

(28520, 772)
(28520, 772)
Index(['label_2'], dtype='object')


In [5]:
print(valid.shape)
print(valid[valid['label_3'].notna()].shape)

valid = pd.DataFrame(valid)

columns_with_missing_values = valid.columns[valid.isnull().any()]
print(columns_with_missing_values)

(750, 772)
(750, 772)
Index(['label_2'], dtype='object')


So, there are NO MISSING VALUES in both train and valid data set for label 3

In [23]:
train_X = train.copy()
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train['label_3']

valid_X = valid.copy()
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid['label_3']

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

Training the model using SVM

In [7]:
from sklearn.svm import SVC

svc = SVC(kernel='poly')
svc.fit(train_X, train_y)

pred_y = svc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9946666666666667

Training the model using KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(train_X, train_y)

pred_y = knn.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9786666666666667

Training the model using Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(train_X, train_y)

pred_y = rfc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9666666666666667

Training the model using XGB

In [8]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()

train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X, train_y_le)

pred_y = xgb_model.predict(valid_X)
accuracy_score(valid_y_le, pred_y)

0.996

Accuracy scores before feature engineering,

SVM => 0.9946
KNN => 0.9786
Random Forest => 0.9666
XGB => 0.9960

therefore before feature engineering,
XGB > SVM > KNN > Random Forest

Feature Engineering and Model training

(i) Correlation-Based Feature Selection (CFS)

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

selected_features = 500

feature_selector = SelectKBest(score_func=f_classif, k=selected_features)
feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X_new, train_y_le)

pred_y = xgb_model.predict(valid_X_new)
accuracy_score_valid = accuracy_score(valid_y_le, pred_y)
accuracy_score_valid

0.996

In [10]:
from sklearn.svm import SVC

svc = SVC(kernel='poly')
svc.fit(train_X_new, train_y)

pred_y = svc.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9946666666666667

(ii) Variance-based Feature Selection

In [98]:
from sklearn.feature_selection import VarianceThreshold

feature_selector = VarianceThreshold(1.0)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [99]:
train_X_new.shape

(28520, 239)

In [100]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X_new, train_y_le)

pred_y = xgb_model.predict(valid_X_new)
accuracy_score_valid = accuracy_score(valid_y_le, pred_y)
accuracy_score_valid

0.9866666666666667

(iii) Principal Component Analysis (PCA)

In [12]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

pca = PCA(0.97)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
    
le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X_new, train_y_le)

pred_y = xgb_model.predict(valid_X_new)
accuracy_score_valid = accuracy_score(valid_y_le, pred_y)
accuracy_score_valid

0.9946666666666667

In [13]:
from sklearn.svm import SVC

svc = SVC(kernel='poly')
svc.fit(train_X_new, train_y)

pred_y = svc.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.9946666666666667

In [14]:
train_X_new.shape

(28520, 401)

Highest accuracies of each feature engineering methods

(i) Correlation-Based Feature Selection (CFS) => accuracy = 0.9960 => new features = 500, 
(ii) Variance-based Feature Selection => accuracy = 0.9866 => new features = 239, 
(iii) PCA => accuracy = 0.9946 => new features = 401, 

therefore Correlation-Based Feature Selection (CFS) looks like the better method for feature engineering for label 3

In [15]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

num_features_to_select = 500
feature_selector = SelectKBest(score_func=f_classif, k=num_features_to_select)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [26]:
from sklearn.decomposition import PCA

pca = PCA(0.97)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

In [27]:
train_X_new.shape

(28520, 401)

Hyper-parameter Optimization using Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()

param_distr = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3]
}

random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_distr, n_iter=5)
random_search.fit(train_X_new, train_y_le)

In [12]:
random_search.best_params_

{'min_child_weight': 2, 'max_depth': 5, 'learning_rate': 0.2}

In [13]:
best_xgb_model = random_search.best_estimator_

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

svm = SVC()
param_distr = {'C':[0.1,1,10], 'kernel':['poly', 'linear']}

randomized_search = RandomizedSearchCV(estimator=svm, param_distributions=param_distr, n_iter=5, cv=2)
randomized_search.fit(train_X_new, train_y)

In [29]:
randomized_search.best_params_

{'kernel': 'linear', 'C': 0.1}

In [30]:
best_svm_model = randomized_search.best_estimator_

Cross Validation for xgb model

In [19]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_xgb_model, train_X_new, train_y_le, cv=kf, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)

Cross-Validation Scores: [0.98685133 0.9865007  0.98597475 0.98527349 0.98562412]


In [20]:
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

Mean Accuracy = 0.9860448807854139
Standard Deviation = 0.000571862076798742


Cross Validation for svm model

In [33]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_svm_model, train_X_new, train_y, cv=kf, scoring='accuracy')

print("Cross-Validation Scores :", cross_val_scores)
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

Cross-Validation Scores : [0.99456522 0.99193548 0.99246143 0.99316269 0.99509116]
Mean Accuracy = 0.9934431977559608
Standard Deviation = 0.0012075158034868484


Predicting label 3 values for test data set using xgb model

In [14]:
pred_test_y = best_xgb_model.predict(test_X_new)
pred_test_y.shape

(744,)

In [15]:
test_X_new.shape

(744, 401)

Predicting label 3 values for test data set using svm model

In [31]:
pred_test_y = best_svm_model.predict(test_X_new)
print(pred_test_y.shape)
print(test_X_new.shape)

(744,)
(744, 401)


In [32]:
rows, columns = test_X_new.shape

new_columns = {
    'label_3' : pred_test_y
}

data_frame = pd.DataFrame(new_columns)

csv_file_path = 'test_label_3_new.csv'
data_frame.to_csv(csv_file_path, index=False)