In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [4]:
# Read the train, validation and test splits from CSV files in the working directory.
train_dataset, valid_dataset, test_dataset = map(
    pd.read_csv, ('train.csv', 'valid.csv', 'test.csv')
)

In [5]:
from catboost import CatBoostClassifier, Pool

# Label 02

In [6]:
# Remove the label_1, label_3 and label_4 columns from the train and validation
# frames; the same drop is applied to both so their columns stay aligned.
label_2_train_ori, label_2_valid_ori = (
    frame.drop(columns=['label_1', 'label_3', 'label_4'])
    for frame in (train_dataset, valid_dataset)
)

## Handle Missing values

In [None]:
# Print the number of missing values per column. The display limits are lifted
# inside the context so pandas does not truncate the printed Series.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(label_2_train_ori.isna().sum())

There are 480 missing values in label_2. Since there is still enough data to work with, I'll drop the rows that contain missing values.

In [7]:
# Discard every row that has a missing value in any column (features or label_2).
# New frames are created; the *_ori frames are left untouched.
label_2_train_cleaned = label_2_train_ori.dropna(axis=0, how='any')
label_2_valid_cleaned = label_2_valid_ori.dropna(axis=0, how='any')

## Without Feature Engineering or Hyperparameter tuning

In [8]:
# Target vectors: the label_2 column of each cleaned frame.
y_train_label_2_ini = label_2_train_cleaned.loc[:, 'label_2']
y_valid_label_2_ini = label_2_valid_cleaned.loc[:, 'label_2']

# Feature matrices: everything except the label_2 column.
X_train_label_2_ini = label_2_train_cleaned.drop(columns='label_2')
X_valid_label_2_ini = label_2_valid_cleaned.drop(columns='label_2')

In [9]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same standardisation to both splits so no validation
# statistics leak into the fit.
scaler = StandardScaler().fit(X_train_label_2_ini)
X_train_scaled = scaler.transform(X_train_label_2_ini)
X_valid_scaled = scaler.transform(X_valid_label_2_ini)

In [16]:
# Baseline: an SVC with default hyperparameters trained on all scaled features.
model = SVC().fit(X_train_scaled, y_train_label_2_ini)

# Evaluate on the held-out validation split.
y_pred_label_2 = model.predict(X_valid_scaled)
accuracy = accuracy_score(y_true=y_valid_label_2_ini, y_pred=y_pred_label_2)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 91.85%


In [None]:
# Existing submission file that is read, given a label_2 column, and written back.
final_submission = "190601D_submission1.csv"

# Test features: everything except the ID column, standardised with the scaler
# that was fitted on the training features.
label_2_data_to_predict = test_dataset.drop(columns='ID')
PCA_analysis_df_test_scaled_label_2 = scaler.transform(label_2_data_to_predict)

# Predict with the baseline SVC (no PCA is involved here despite the variable name).
final_label_2_predict = model.predict(PCA_analysis_df_test_scaled_label_2)

# Set/overwrite the label_2 column and save the file in place, without the index.
dataframe = pd.read_csv(final_submission).assign(label_2=final_label_2_predict)
dataframe.to_csv(final_submission, index=False)

## Feature selection using K-best

In [None]:
# Keep the 400 features with the highest ANOVA F-score against label_2.
k_best = SelectKBest(score_func=f_classif, k=400)

# Fit the selector on the *scaled* training matrix so that the train and
# validation outputs come from the same representation. Previously the selector
# was fitted on the raw X_train_label_2_ini but applied to X_valid_scaled, which
# produced an unscaled training matrix next to a scaled validation matrix.
X_train_selected = k_best.fit_transform(X_train_scaled, y_train_label_2_ini)
X_valid_selected = k_best.transform(X_valid_scaled)

## Feature extraction (dimensionality reduction) using PCA

In [10]:
# With a fractional n_components and svd_solver='full', scikit-learn's PCA keeps
# as many components as are needed to explain that share (here 97%) of the variance.
pca = PCA(n_components=0.97,svd_solver='full')
# Fit the projection on the scaled training features only ...
principal_components_label_2 = pca.fit_transform(X_train_scaled)
# ... and reuse the fitted projection for the scaled validation features.
valid_principal_components_label_2 = pca.transform(X_valid_scaled)

In [11]:
# Wrap the train and validation component arrays in DataFrames whose columns are
# named new_feature_0, new_feature_1, ... (one per principal component).
principal_df_label_2, valid_principal_df_label_2 = (
    pd.DataFrame(
        data=components,
        columns=[f'new_feature_{i}' for i in range(components.shape[1])],
    )
    for components in (principal_components_label_2, valid_principal_components_label_2)
)

In [None]:
# Default-parameter SVC trained on the PCA-projected training features.
model_2 = SVC().fit(principal_df_label_2, y_train_label_2_ini)

# Validation accuracy of the PCA-based model.
y_pred = model_2.predict(valid_principal_df_label_2)
accuracy = accuracy_score(y_true=y_valid_label_2_ini, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [17]:
# Push the test features through the scaler and PCA that were fitted on the
# training data.
label_2_data_to_predict = test_dataset.drop(columns='ID')
PCA_analysis_df_test_scaled_label_2 = scaler.transform(label_2_data_to_predict)
test_principal_components_label_2 = pca.transform(PCA_analysis_df_test_scaled_label_2)

# Use the same new_feature_<i> column names as the training frame.
test_principal_df = pd.DataFrame(
    data=test_principal_components_label_2,
    columns=[
        f'new_feature_{i}'
        for i in range(test_principal_components_label_2.shape[1])
    ],
)

# model_2 is created in the PCA + SVC cell; that cell must be run first.
final_label_2_predict = model_2.predict(test_principal_df)

NameError: name 'model_2' is not defined

In [None]:
# Read the existing submission file, set/overwrite its label_2 column with the
# latest predictions, and write it back to the same path without the index.
final_submission = "190601D_submission1.csv"
dataframe = pd.read_csv(final_submission).assign(label_2=final_label_2_predict)
dataframe.to_csv(final_submission, index=False)

In [12]:
# HalvingGridSearchCV is still experimental in scikit-learn: the enabling import
# has to run before the class itself can be imported.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Candidate SVC settings explored by the search (7 x 4 x 2 = 56 combinations).
param_grid = {
                'C': [1, 10, 20, 30, 40, 50, 100],
                'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                'gamma': ['scale', 'auto']
                }

# gamma/kernel given here are only starting values; every candidate drawn from
# param_grid overrides them.
base_estimator = SVC(gamma='scale', kernel='rbf', random_state=42)

# Successive halving scores all candidates on a small random subsample of the
# rows and promotes only the best ones to the next, larger round. Seeding the
# search makes that subsampling -- and therefore the selected parameters and
# scores -- repeatable across Restart & Run All.
search = HalvingGridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    verbose=1,
    n_jobs=7,
    random_state=42,
)
search.fit(principal_df_label_2, y_train_label_2_ini)

# NOTE(review): best_score_ is a cross-validated mean taken from the search
# itself, so it is not directly comparable to the hold-out validation accuracy.
print(search.best_params_)
print(search.best_score_)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 1038
max_resources_: 28040
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 56
n_resources: 1038
Fitting 5 folds for each of 56 candidates, totalling 280 fits
----------
iter: 1
n_candidates: 19
n_resources: 3114
Fitting 5 folds for each of 19 candidates, totalling 95 fits
----------
iter: 2
n_candidates: 7
n_resources: 9342
Fitting 5 folds for each of 7 candidates, totalling 35 fits
----------
iter: 3
n_candidates: 3
n_resources: 28026
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.6195182872435325


In [14]:
# Sanity check: (rows, columns) of the PCA-projected validation frame.
valid_principal_df_label_2.shape

(736, 401)

In [15]:
# Predict the validation labels with the best estimator found by the halving search.
y_pred_label_2 = search.best_estimator_.predict(valid_principal_df_label_2)

# accuracy_score takes (y_true, y_pred). The arguments were previously reversed;
# accuracy is symmetric so the number is unchanged, but the order now matches
# the metric's signature and the other evaluation cells in this notebook.
accuracy_tuned = accuracy_score(y_valid_label_2_ini, y_pred_label_2)
print(f"Accuracy on validation data: {accuracy_tuned * 100:.2f}%")

Accuracy on validation data: 95.11%


## Make predictions on the test dataset

In [18]:
# Test features are every column except ID.
label_2_data_to_predict = test_dataset.drop(columns='ID')

# Reuse the transformers fitted on the training data: standardise, then project
# onto the retained principal components.
PCA_analysis_df_test_scaled_label_2 = scaler.transform(label_2_data_to_predict)
test_principal_components_label_2 = pca.transform(PCA_analysis_df_test_scaled_label_2)

# Column names follow the new_feature_<i> scheme used for the training frame.
test_principal_df = pd.DataFrame(
    data=test_principal_components_label_2,
    columns=[
        f'new_feature_{i}'
        for i in range(test_principal_components_label_2.shape[1])
    ],
)

# Final label_2 predictions from the tuned SVC chosen by the halving search.
final_label_2_predict = search.best_estimator_.predict(test_principal_df)

In [19]:
# Load the existing submission file, set/overwrite the label_2 column with the
# tuned model's predictions, and save it back in place without the index.
final_submission = "190601D_submission1.csv"
dataframe = pd.read_csv(final_submission).assign(label_2=final_label_2_predict)
dataframe.to_csv(final_submission, index=False)