In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
# Side-effect import: it must run before HalvingGridSearchCV can be imported.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [3]:
# Helper to predict on a held-out split and report the accuracy

def predictAndGetAccuracyScore(model, x_valid, y_valid):
    """Score an already-fitted ``model`` on a validation split.

    Labels are predicted for ``x_valid`` with ``model.predict`` and compared
    against ``y_valid`` using ``accuracy_score``. The result is printed as a
    percentage with two decimals and the raw (unscaled) value is returned.
    """
    score = accuracy_score(y_valid, model.predict(x_valid))
    print("Accuracy: %.2f%%" % (score * 100.0))
    return score


In [4]:
# Function to execute a successive-halving grid search

def getHalvingGrid(model, param_grid, x_train, y_train):
    """Tune ``model`` over ``param_grid`` with ``HalvingGridSearchCV``.

    ``enable_halving_search_cv`` is only the experimental opt-in module; the
    search estimator itself is ``HalvingGridSearchCV``, so that is what gets
    instantiated here (calling the module raised ``TypeError``).

    Parameters
    ----------
    model : estimator to tune.
    param_grid : dict (or list of dicts) of candidate hyperparameter values.
    x_train, y_train : training features and targets passed to ``fit``.

    Returns
    -------
    The fitted search object; its ``best_params_`` are also printed.
    """
    # 5-fold CV, halving factor 2, verbose progress, all available cores.
    search = HalvingGridSearchCV(
        model, param_grid, cv=5, factor=2, verbose=2, n_jobs=-1)

    search.fit(x_train, y_train)
    print("Best Hyperparameters:", search.best_params_)
    return search


In [5]:
# Function to execute shuffled k-fold cross-validation
def crossVal(model, x_train, y_train, n_splits=2, random_state=42):
    """Cross-validate ``model`` and report the mean fold score.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    x_train, y_train : features and targets to split into folds.
    n_splits : number of folds for ``KFold`` (default 2, the value this
        notebook's recorded results were produced with).
    random_state : seed for the shuffled fold assignment (default 42).

    Returns
    -------
    Mean of the per-fold scores returned by ``cross_val_score``; the same
    value is printed as a percentage with two decimals.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation and calculate the mean accuracy
    scores = cross_val_score(model, x_train, y_train, cv=kf)
    mean_accuracy = scores.mean()
    print("Mean Accuracy: {:.2f}%".format(mean_accuracy * 100))
    return mean_accuracy

In [6]:
# Read the train / validation / test splits from the working directory.
train_data, valid_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "valid.csv", "test.csv")
)


In [7]:
# Raw (unscaled) feature matrices and label_1 targets for each split.
_label_cols = ["label_1", "label_2", "label_3", "label_4"]

X_train = train_data.drop(columns=_label_cols)
# label_1 targets of the TRAIN split (variable name kept for compatibility).
y_test_train = train_data["label_1"]

X_valid = valid_data.drop(columns=_label_cols)
Y_valid = valid_data["label_1"]

# The test features must come from test_data; they were previously copied
# from valid_data. Only the "ID" column is dropped, matching how the
# per-label x_test frames are built elsewhere in this notebook.
X_test = test_data.drop(columns=["ID"])
# NOTE(review): test_data appears to carry no label columns, so there are no
# test targets to expose here - confirm against test.csv.
Y_test = None


In [8]:
# Create separate, standardized dataframes for each label

x_train = {}
x_valid = {}
y_train = {}
y_valid = {}
x_test = {}
y_test = {}  # stays empty: no test targets are read in this cell
Labels = ["label_1", "label_2", "label_3", "label_4"]
Features = np.array(train_data.drop(columns=Labels).columns)

for label in Labels:
    # Drop rows whose target for *this* label is missing. This used to be
    # hard-coded to label_2; applying it to every label is a no-op for
    # labels without missing targets.
    tr_data = train_data[train_data[label].notna()]
    vl_data = valid_data[valid_data[label].notna()]

    # Standardize: fit the scaler on the training rows only, then reuse it
    # for the validation and test rows.
    scaler = StandardScaler()

    # Passing index=... keeps each scaled frame row-aligned with its target
    # Series; without it the filtered rows got a fresh 0..n-1 index while
    # the targets kept their original index.
    x_train[label] = pd.DataFrame(
        scaler.fit_transform(tr_data.drop(Labels, axis=1)),
        columns=Features, index=tr_data.index)
    y_train[label] = tr_data[label]
    x_valid[label] = pd.DataFrame(
        scaler.transform(vl_data.drop(Labels, axis=1)),
        columns=Features, index=vl_data.index)
    y_valid[label] = vl_data[label]
    x_test[label] = pd.DataFrame(
        scaler.transform(test_data.drop(["ID"], axis=1)),
        columns=Features, index=test_data.index)

<h4>Label 1</h4>

In [8]:
# Baseline model for label_1: SVC constructed with no arguments
svm_model_1 = SVC()

Accuracy: 1.87%


In [9]:
# Initial accuracy with cross-validation (baseline SVC on the standardized features)
crossVal(svm_model_1, x_train["label_1"], y_train["label_1"])

Mean Accuracy: 86.07%


0.8606591865357645

In [12]:
# Feature extraction with PCA, fitted on the label_1 training features only.
# n_components=0.99 with the "full" solver selects components by explained variance.
pca_label_1 = PCA(n_components=0.99, svd_solver="full").fit(x_train["label_1"])

# Project the training and validation features with the same fitted PCA.
x_train_label_1_PCA, x_valid_label_1_PCA = (
    pd.DataFrame(pca_label_1.transform(x_split["label_1"]))
    for x_split in (x_train, x_valid)
)
print(x_train_label_1_PCA.shape)


(28520, 386)


In [13]:
# Accuracy after PCA with cross-validation (same baseline SVC, PCA-projected features)
crossVal(svm_model_1, x_train_label_1_PCA, y_train["label_1"])


Mean Accuracy: 85.70%


0.8570126227208976

In [9]:
# Hyperparameters below come from manual tuning (per the original author's note).
# Model initialized with the tuned parameters: RBF kernel, C=100, gamma=0.001
svm_model_tuned_1 = SVC(kernel="rbf", C=100, gamma=0.001)

In [15]:
# Accuracy after tuning, cross-validated on the standardized features without PCA
crossVal(svm_model_tuned_1, x_train["label_1"], y_train["label_1"])

Mean Accuracy: 91.77%


0.9177068723702665

In [10]:
# Fit the tuned model on the full label_1 training set (no PCA), then
# predict label_1 for the standardized test rows. fit() returns the model.
y_pred_label_1 = svm_model_tuned_1.fit(
    x_train["label_1"], y_train["label_1"]
).predict(x_test["label_1"])

In [21]:
# Start the submission table: one row per test ID with its predicted label_1
result_df = pd.DataFrame({'ID': test_data['ID'], 'label_1': y_pred_label_1})


<h4>Label 2</h4>

In [18]:
# Initial accuracy with cross-validation for label_2.
# This cell previously scored the label_1 data by copy-paste; it now uses
# the label_2 features and targets (rows with a missing label_2 removed).
svm_model_2 = SVC()
crossVal(svm_model_2, x_train["label_2"], y_train["label_2"])

In [19]:
# Feature extraction with PCA, fitted on the label_2 training features only.
# n_components=0.99 with the "full" solver selects components by explained variance.
pca_label_2 = PCA(n_components=0.99, svd_solver="full").fit(x_train["label_2"])

# Project the training and validation features with the same fitted PCA.
x_train_label_2_PCA, x_valid_label_2_PCA = (
    pd.DataFrame(pca_label_2.transform(x_split["label_2"]))
    for x_split in (x_train, x_valid)
)
print(x_train_label_2_PCA.shape)

(28040, 385)


In [20]:
# Accuracy after PCA with cross-validation (baseline SVC, PCA-projected label_2 features)
crossVal(svm_model_2, x_train_label_2_PCA, y_train["label_2"])

Mean Accuracy: 74.59%


0.7459343794579172

In [12]:
# Hyperparameters below come from manual tuning (per the original author's note).
# Model initialized with the tuned parameters: RBF kernel, C=100, gamma=0.001
svm_model_2_tuned = SVC(kernel="rbf", C=100, gamma=0.001)

# Accuracy after tuning, cross-validated on the standardized features without PCA
crossVal(svm_model_2_tuned, x_train["label_2"], y_train["label_2"])


Mean Accuracy: 86.73%


0.8672610556348075

In [23]:
# Fit the tuned model on the full label_2 training set (no PCA), then
# predict label_2 for the standardized test rows. fit() returns the model.
y_pred_label_2 = svm_model_2_tuned.fit(
    x_train["label_2"], y_train["label_2"]
).predict(x_test["label_2"])

In [24]:
# Add the label_2 predictions as a new column of the submission table
result_df['label_2'] = y_pred_label_2


<h4>Label 3</h4>

In [36]:
# Baseline model for label_3: SVC constructed with no arguments
svm_model_3 = SVC()

In [37]:
# Initial accuracy with cross-validation (baseline SVC on the standardized features)

crossVal(svm_model_3, x_train["label_3"], y_train["label_3"])

Mean Accuracy: 98.97%


0.989726507713885

In [29]:
# Feature extraction with PCA, fitted on the label_3 training features only.
# n_components=0.99 with the "full" solver selects components by explained variance.
pca_label_3 = PCA(n_components=0.99, svd_solver="full").fit(x_train["label_3"])

# Project the training and validation features with the same fitted PCA.
x_train_label_3_PCA, x_valid_label_3_PCA = (
    pd.DataFrame(pca_label_3.transform(x_split["label_3"]))
    for x_split in (x_train, x_valid)
)
print(x_train_label_3_PCA.shape)

(28520, 386)


In [30]:
# Accuracy after PCA with cross-validation (baseline SVC, PCA-projected label_3 features)

crossVal(svm_model_3, x_train_label_3_PCA, y_train["label_3"])

Mean Accuracy: 98.96%


0.9895511921458625

In [38]:
# Hyperparameters below come from manual tuning (per the original author's note).
# Model initialized with the tuned parameters and fitted on the label_3 training set (no PCA)
svm_model_3_tuned = SVC(kernel='rbf', C=100, gamma=0.001)
svm_model_3_tuned.fit(x_train["label_3"], y_train["label_3"])

In [39]:
# Predict label_3 for the standardized test rows with the tuned model (no PCA)
y_pred_label_3 = svm_model_3_tuned.predict(x_test["label_3"])

In [40]:
# Add the label_3 predictions as a new column of the submission table
result_df['label_3'] = y_pred_label_3


<h4>Label 4</h4>

In [35]:
# Baseline model for label_4: SVC constructed with no arguments
svm_model_4 = SVC()


In [None]:
# Initial accuracy with cross-validation (baseline SVC on the standardized features)
crossVal(svm_model_4, x_train["label_4"], y_train["label_4"])

In [None]:
# Feature extraction with PCA, fitted on the label_4 training features only.
# NOTE(review): n_components is 0.96 here but 0.99 for the other labels -
# confirm the lower threshold is intentional.
pca_label_4 = PCA(n_components=0.96, svd_solver="full").fit(x_train["label_4"])

# Project the training and validation features with the same fitted PCA.
x_train_label_4_PCA, x_valid_label_4_PCA = (
    pd.DataFrame(pca_label_4.transform(x_split["label_4"]))
    for x_split in (x_train, x_valid)
)
print(x_train_label_4_PCA.shape)

In [None]:
# Accuracy after PCA with cross-validation for label_4.
# Uses the label_4 baseline model; this cell previously passed svm_model_3,
# which made the label_4 section depend on the label_3 cells having run.
crossVal(svm_model_4, x_train_label_4_PCA, y_train["label_4"])

In [41]:
# Hyperparameters below come from manual tuning (per the original author's note).
# Model initialized with the tuned parameters and fitted on the label_4 training set (no PCA)
svm_model_4_tuned = SVC(kernel='rbf', C=100, gamma=0.001)
svm_model_4_tuned.fit(x_train["label_4"], y_train["label_4"])


In [42]:
# Predict label_4 for the standardized test rows with the tuned model (no PCA)
y_pred_label_4 = svm_model_4_tuned.predict(x_test["label_4"])


In [43]:
# Add the label_4 predictions as a new column of the submission table
result_df['label_4'] = y_pred_label_4

In [44]:
# Write the submission table (ID plus the four predicted labels) without the index column
result_df.to_csv("solutions.csv", index=False)
