In [17]:
import pandas as pd
import numpy as np

In [18]:
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')
test = pd.read_csv('test.csv')

In [28]:
print(train.shape)
print(train[train['label_2'].notna()].shape)

train = pd.DataFrame(train)
columns_with_missing_values = train.columns[train.isnull().any()]
print(columns_with_missing_values)

(28520, 772)
(28040, 772)
Index(['label_2'], dtype='object')


In [29]:
print(valid.shape)
print(valid[valid['label_2'].notna()].shape)

valid = pd.DataFrame(valid)

columns_with_missing_values = valid.columns[valid.isnull().any()]
print(columns_with_missing_values)

(750, 772)
(736, 772)
Index(['label_2'], dtype='object')


So, there ARE MISSING VALUES for 480 data in the train data set for label 2, and 14 data in the valid data set

Handling Missing values using Mean Imputation

In [5]:
train_X = train.copy()
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train['label_2'].fillna(train['label_2'].mean()).round()


valid_X = valid.copy()
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid['label_2'].fillna(valid['label_2'].mean()).round()

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

Training the model using SVM

In [71]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')
svc.fit(train_X, train_y)

pred_y = svc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.8066666666666666

Training the model using KNN

In [50]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(train_X, train_y)

pred_y = knn.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.8106666666666666

Training the model using Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(train_X, train_y)

pred_y = rfc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.764

Training the model using XGB

In [52]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

In [53]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X, train_y_le)

pred_y = xgb_model.predict(valid_X)
accuracy_score(valid_y_le, pred_y)

0.856

Handling missing values with deletion

In [3]:
train_X = train[train['label_2'].notna()]                                              # NaN values are dropped
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train[train['label_2'].notna()]['label_2']

valid_X = valid[valid['label_2'].notna()]                                             # NaN values are dropped
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid[valid['label_2'].notna()]['label_2']

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)


Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

SVM model

In [7]:
from sklearn.svm import SVC

svm = SVC(kernel='poly')
svm.fit(train_X, train_y)

pred_y = svm.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.873641304347826

KNN model

In [76]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(train_X, train_y)

pred_y = knn.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.8097826086956522

Random Forest model

In [77]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(train_X, train_y)

pred_y = rfc.predict(valid_X)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.7391304347826086

XGB model

In [78]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()

train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X, train_y_le)

pred_y = xgb_model.predict(valid_X)
accuracy_score(valid_y_le, pred_y)

0.8573369565217391

Accuracy scores with mean imputation (for missing values) before feature engineering,

SVM => 0.8066
KNN => 0.8106
Random Forest => 0.7640
XGB => 0.856

therefore before feature engineering for mean imputation,
XGB > KNN > SVM > Random Forest


Accuracy scores with deletion (for missing values) before feature engineering,

SVM => 0.8736
KNN => 0.8097
Random Forest => 0.7391
XGB => 0.8573

therefore before feature engineering for missing values,
SVM > XGB > KNN > Random Forest

Feature Engineering and Model training with deletion for missing values

In [8]:
train_X = train[train['label_2'].notna()]                                              # NaN values are dropped
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train[train['label_2'].notna()]['label_2']

valid_X = valid[valid['label_2'].notna()]                                             # NaN values are dropped
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid[valid['label_2'].notna()]['label_2']

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)


(i) Correlation-Based Feature Selection (CFS)

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

selected_features = 500

feature_selector = SelectKBest(score_func=f_classif, k=selected_features)
feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)

svm = SVC(kernel='poly')
svm.fit(train_X_new, train_y)

pred_y = svm.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.8885869565217391

(ii) Variance-based Feature Selection

In [138]:
from sklearn.feature_selection import VarianceThreshold

feature_selector = VarianceThreshold(0.005)

feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [139]:
train_X_new.shape

(28040, 182)

In [140]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X_new, train_y_le)

pred_y = xgb_model.predict(valid_X_new)
accuracy_score_valid = accuracy_score(valid_y_le, pred_y)
accuracy_score_valid

0.8600543478260869

(iii) Principal Component Analysis (PCA)

In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

pca = PCA(0.97)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)

le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X_new, train_y_le)

pred_y = xgb_model.predict(valid_X_new)
accuracy_score_valid = accuracy_score(valid_y_le, pred_y)
accuracy_score_valid

0.845108695652174

In [11]:
train_X_new.shape

(28040, 307)

Highest accuracies of each feature engineering methods

(i) Correlation-Based Feature Selection (CFS) => accuracy = 0.8885 => new features = 500, 
(ii) Variance-based Feature Selection => accuracy = 0.8601 => new features = 182, 
(iii) PCA => accuracy = 0.8451 => new features = 307, 

therefore Correlation-Based Feature Selection (CFS) looks like the better method for feature engineering for label 2 for deletion of missing values

Feature Engineering and Model training with mean imputation for missing values

In [19]:
train_X = train.copy()
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train['label_2'].fillna(train['label_2'].mean()).round()


valid_X = valid.copy()
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid['label_2'].fillna(valid['label_2'].mean()).round()

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

(i) Correlation-Based Feature Selection (CFS)

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

selected_features = 500
feature_selector = SelectKBest(score_func=f_classif, k=selected_features)
feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)

le = LabelEncoder()
train_y_le = le.fit_transform(train_y)
valid_y_le = le.fit_transform(valid_y)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(train_X_new, train_y_le)

pred_y = xgb_model.predict(valid_X_new)
accuracy_score_valid = accuracy_score(valid_y_le, pred_y)
accuracy_score_valid

0.86

In [14]:
from sklearn.svm import SVC

svm = SVC(kernel='poly')
svm.fit(train_X_new, train_y)

pred_y = svm.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.8853333333333333

(ii) Variance-based Feature Selection

In [150]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC

thresholds = [0.007, 0.008, 0.009]
selected_features = []
accuracies = []

for threshold in thresholds:
    feature_selector = VarianceThreshold(threshold)

    feature_selector.fit(train_X, train_y)

    train_X_new = feature_selector.transform(train_X)
    valid_X_new = feature_selector.transform(valid_X)

    selected_features.append(train_X_new.shape[1])

    svc_new = SVC(kernel='linear')
    svc_new.fit(train_X_new, train_y)

    pred_y = svc_new.predict(valid_X_new)

    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'threshold': thresholds,'selected features': selected_features, 'accuracy': accuracies})
results


Unnamed: 0,selected features,accuracy
0,109,0.616
1,98,0.609333
2,87,0.6


In [152]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC

thresholds = [0.004, 0.005, 0.006]
selected_features = []
accuracies = []

for threshold in thresholds:
    feature_selector = VarianceThreshold(threshold)

    feature_selector.fit(train_X, train_y)

    train_X_new = feature_selector.transform(train_X)
    valid_X_new = feature_selector.transform(valid_X)

    selected_features.append(train_X_new.shape[1])

    svc_new = SVC(kernel='linear')
    svc_new.fit(train_X_new, train_y)

    pred_y = svc_new.predict(valid_X_new)

    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'threshold': thresholds,'selected features': selected_features, 'accuracy': accuracies})
results


Unnamed: 0,threshold,selected features,accuracy
0,0.004,301,0.758667
1,0.005,184,0.676
2,0.006,135,0.642667


In [154]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC

thresholds = [0.0035, 0.0034, 0.0033, 0.0032, 0.0031, 0.003]
selected_features = []
accuracies = []

for threshold in thresholds:
    feature_selector = VarianceThreshold(threshold)

    feature_selector.fit(train_X, train_y)

    train_X_new = feature_selector.transform(train_X)
    valid_X_new = feature_selector.transform(valid_X)

    selected_features.append(train_X_new.shape[1])

    svc_new = SVC(kernel='linear')
    svc_new.fit(train_X_new, train_y)

    pred_y = svc_new.predict(valid_X_new)

    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'threshold': thresholds,'selected features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,threshold,selected features,accuracy
0,0.0035,419,0.784
1,0.0034,444,0.781333
2,0.0033,471,0.801333
3,0.0032,503,0.793333
4,0.0031,529,0.801333
5,0.003,553,0.809333


(iii) Principal Component Analysis (PCA)

In [155]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

components = [0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.955, 0.96, 0.965, 0.97]
selected_features = []
accuracies = []

for component in components:
    pca = PCA(component)
    pca.fit(train_X)

    train_X_new = pca.transform(train_X)
    valid_X_new = pca.transform(valid_X)
    selected_features.append(train_X_new.shape[1])

    svc_new = SVC(kernel='linear')
    svc_new.fit(train_X_new, train_y)

    pred_y = svc_new.predict(valid_X_new)
    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'components': components, 'selected features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,components,selected features,accuracy
0,0.87,77,0.626667
1,0.88,86,0.624
2,0.89,96,0.633333
3,0.9,108,0.66
4,0.91,122,0.674667
5,0.92,139,0.72
6,0.93,160,0.741333
7,0.94,185,0.744
8,0.95,216,0.756
9,0.955,234,0.756


In [156]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

components = [0.975, 0.98, 0.985, 0.99, 0.995]
selected_features = []
accuracies = []

for component in components:
    pca = PCA(component)
    pca.fit(train_X)

    train_X_new = pca.transform(train_X)
    valid_X_new = pca.transform(valid_X)
    selected_features.append(train_X_new.shape[1])

    svc_new = SVC(kernel='linear')
    svc_new.fit(train_X_new, train_y)

    pred_y = svc_new.predict(valid_X_new)
    accuracy_score_valid = accuracy_score(valid_y, pred_y)
    accuracies.append(accuracy_score_valid)


results = pd.DataFrame({'components': components, 'selected features': selected_features, 'accuracy': accuracies})
results

Unnamed: 0,components,selected features,accuracy
0,0.975,341,0.808
1,0.98,381,0.816
2,0.985,431,0.82
3,0.99,496,0.834667
4,0.995,589,0.822667


Highest accuracies of each feature engineering methods

(i) Correlation-Based Feature Selection (CFS) => accuracy = 0.8853 => new features = 500, 
(ii) Variance-based Feature Selection => accuracy = 0.8013 => new features = 471, 
(iii) PCA => accuracy = 0.8346 => new features = 496, 

therefore Correlation-Based Feature Selection (CFS) looks like the better method for feature engineering for label 2 for mean imputation for missing values

Compared to the accuracy outputs of both "deletion" and "mean imputation", we can come to a conclusion that deletion is better for label 2 missing values.
Therefore we can select the features using Correlation-Based Feature Selection (CFS) according to the outputs.

In [20]:
train_X = train[train['label_2'].notna()]                                              # NaN values are dropped
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train[train['label_2'].notna()]['label_2']

valid_X = valid[valid['label_2'].notna()]                                             # NaN values are dropped
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid[valid['label_2'].notna()]['label_2']

test_X = test.copy()
test_X.drop(['ID'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)


In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selected_features = 500

feature_selector = SelectKBest(score_func=f_classif, k=selected_features)
feature_selector.fit(train_X, train_y)

train_X_new = feature_selector.transform(train_X)
valid_X_new = feature_selector.transform(valid_X)
test_X_new = feature_selector.transform(test_X)

In [21]:
from sklearn.decomposition import PCA

pca = PCA(0.99)
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

In [22]:
train_X_new.shape

(28040, 495)

Hyper-parameter Optimization using Randomized Search

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

svm = SVC()
param_distr = {'C':[0.1,1,10], 'kernel':['poly', 'linear']}

randomized_search = RandomizedSearchCV(estimator=svm, param_distributions=param_distr, n_iter=5, cv=2)
randomized_search.fit(train_X_new, train_y)

In [24]:
randomized_search.best_params_

{'kernel': 'poly', 'C': 1}

In [25]:
best_svm_model = randomized_search.best_estimator_

Cross Validation

In [29]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_svm_model, train_X_new, train_y, cv=kf, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

Cross-Validation Scores: [0.88480742 0.89176177 0.88516405 0.8792796  0.88908702]
Mean Accuracy = 0.8860199714693294
Standard Deviation = 0.0042428881811599145


Predicting label 2 values for test data set

In [26]:
pred_test_y = best_svm_model.predict(test_X_new)
pred_test_y.shape

(744,)

In [27]:
test_X_new.shape

(744, 495)

In [28]:
rows, columns = test_X_new.shape

new_columns = {
    'label_2' : pred_test_y
}

data_frame = pd.DataFrame(new_columns)

csv_file_path = 'test_label_2_new.csv'
data_frame.to_csv(csv_file_path, index=False)