In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('layer_10_train.csv')
valid = pd.read_csv('layer_10_valid.csv')
test = pd.read_csv('layer_10_test.csv')

In [4]:
train_X = train.copy()
train_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
train_y = train['label_3']

valid_X = valid.copy()
valid_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
valid_y = valid['label_3']

test_X = test.copy()
test_X.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
test_y = test['label_3']

In [4]:
train_X.shape

(28520, 768)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train_X = scaler.fit_transform(train_X)
valid_X = scaler.fit_transform(valid_X)
test_X = scaler.fit_transform(test_X)

Feature Engineering

In [6]:
from sklearn.decomposition import PCA

pca = PCA(
    n_components = 0.98,
    svd_solver='full',
    whiten=True
    )
pca.fit(train_X)

train_X_new = pca.transform(train_X)
valid_X_new = pca.transform(valid_X)
test_X_new = pca.transform(test_X)

In [7]:
train_X_new.shape

(28520, 426)

Ensemble Learning with Bagging

In [7]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

base_svm = SVC(
    kernel='linear',
    C=50,
    gamma='auto',
)

bagged_svm = BaggingClassifier(
    base_svm, 
    n_estimators=10, 
    random_state=10
)

bagged_svm.fit(train_X_new, train_y)

pred_y = bagged_svm.predict(valid_X_new)

accuracy_score_valid = accuracy_score(valid_y, pred_y)
accuracy_score_valid

0.992

Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# number of splits for k-fold cross-validation
num_splits = 5

kf = KFold(
    n_splits=num_splits, 
    shuffle=True, 
    random_state=42
)

cross_val_scores = cross_val_score(bagged_svm, train_X_new, train_y, cv=kf, scoring='accuracy')

print("Cross-Validation Scores :", cross_val_scores)
print("Mean Accuracy =",  np.mean(cross_val_scores))
print("Standard Deviation =",  np.std(cross_val_scores))

In [8]:
pred_test_y = bagged_svm.predict(test_X_new)

accuracy_score_valid = accuracy_score(test_y, pred_test_y)
accuracy_score_valid

0.9933333333333333

In [9]:
rows, columns = test_X_new.shape

new_columns = {
    'label_1' : pred_test_y
}

data_frame = pd.DataFrame(new_columns)

csv_file_path = 'test_label_3_final.csv'
data_frame.to_csv(csv_file_path, index=False)