# Import required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Read CSV

The train, validation, and test datasets are read from their CSV files using the pandas library.


In [2]:
# Input files for the train / validation / test splits
train_path = "layer_12_train.csv"
valid_path = "layer_12_valid.csv"
test_path = "layer_12_test.csv"

# Read each split into its own DataFrame, in train -> valid -> test order
train, valid, test = (
    pd.read_csv(csv_path) for csv_path in (train_path, valid_path, test_path)
)

The label column names and the feature column names are separated into two lists for easy access.

In [3]:
# The four target columns; every other column of the training frame is a feature
labels = ['label_1', 'label_2', 'label_3', 'label_4']
columns = train.columns
features = list(filter(lambda name: name not in labels, columns))

In [4]:
# Work on copies so the frames read from disk are left untouched
train_df, valid_df, test_df = train.copy(), valid.copy(), test.copy()

# Scale the data

Five dictionaries, keyed by label, store the scaled features (`x_train`, `x_valid`, `x_test`) and the matching targets (`y_train`, `y_valid`).

In [None]:
scaler = RobustScaler()

# Per-label containers: scaled feature frames and the matching target series
x_train = {}
y_train = {}
x_valid = {}
y_valid = {}
x_test = {}

for label in labels:
  # Rows with a missing target cannot be used to fit or score a classifier.
  # Previously only label_2 was filtered; applying the mask to every label is a
  # no-op for targets that have no missing values.
  train_df_copy = train_df[train_df[label].notna()]
  valid_df_copy = valid_df[valid_df[label].notna()]

  # The scaler is re-fitted on this label's training rows; validation and test rows
  # are transformed with those training statistics only.
  # Selecting `features` by name (instead of dropping the label columns) keeps the
  # column set and order identical across splits even when a split carries extra
  # columns (the test frame has an ID column) or lacks the label columns.
  # Passing the source index keeps each x_* frame row-aligned with its y_* series.
  x_train[label] = pd.DataFrame(
      scaler.fit_transform(train_df_copy[features]), columns=features, index=train_df_copy.index)
  y_train[label] = train_df_copy[label]
  x_valid[label] = pd.DataFrame(
      scaler.transform(valid_df_copy[features]), columns=features, index=valid_df_copy.index)
  y_valid[label] = valid_df_copy[label]

  x_test[label] = pd.DataFrame(
      scaler.transform(test_df[features]), columns=features, index=test_df.index)

# Check Accuracy

After scaling the dataset, a baseline accuracy is measured on the validation set.

In [None]:
# Baseline: linear-kernel SVC on the scaled features, scored on the validation split
initial_accuracy = {}
for label in labels:
  print(label)
  clf = svm.SVC(kernel='linear')
  clf.fit(x_train[label], y_train[label])
  y_pred = clf.predict(x_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)

print(initial_accuracy)

# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
def PCA_dev(label, threshold=0.95):
  """Project one label's scaled splits onto the principal components of its training set.

  Args:
    label: Key into the notebook-level ``x_train`` / ``x_valid`` / ``x_test`` dictionaries.
    threshold: Passed to ``PCA`` as ``n_components``. Defaults to 0.95, the value that
      was previously hard-coded (the old local ``threshold`` was assigned but never used).

  Returns:
    Tuple ``(pca_train, pca_valid, pca_test)`` with the transformed train, validation
    and test features for ``label``.
  """
  # Fit on the training features only; validation and test reuse the fitted projection
  pca = PCA(n_components=threshold, svd_solver='full')
  pca_train = pca.fit_transform(x_train[label])
  pca_valid = pca.transform(x_valid[label])
  pca_test = pca.transform(x_test[label])

  return pca_train, pca_valid, pca_test

In [None]:
# PCA-reduced features per label
pca_train = {}
pca_valid = {}
pca_test = {}
for label in labels:
  # Unpack into dedicated names: unpacking into `train`, `valid`, `test` would overwrite
  # the DataFrames loaded from CSV under those names and break re-running earlier cells.
  train_reduced, valid_reduced, test_reduced = PCA_dev(label)
  pca_train[label] = train_reduced
  pca_valid[label] = valid_reduced
  pca_test[label] = test_reduced

# Check Accuracy again

After PCA is applied for dimensionality reduction, the validation accuracy is measured again.

In [None]:
# Same linear-kernel baseline, now trained and scored on the PCA-reduced features
initial_accuracy = {}
for label in labels:
  print(label)
  clf = svm.SVC(kernel='linear')
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)

print(initial_accuracy)

# Hyper Parameter Tuning - Manual    

In [None]:
# Manual trial: polynomial kernel on the PCA features, all labels
initial_accuracy = {}
y_predictions = {}
for label in labels:
  print(label)
  clf = svm.SVC(kernel='poly')
  clf.fit(pca_train[label], y_train[label])
  # Validation predictions feed the accuracy; test predictions are kept per label
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test

print(initial_accuracy)

In [None]:
# Manual trial: RBF kernel on the PCA features, all labels
initial_accuracy = {}
y_predictions = {}
for label in labels:
  print(label)
  clf = svm.SVC(kernel='rbf')
  clf.fit(pca_train[label], y_train[label])
  # Validation predictions feed the accuracy; test predictions are kept per label
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test

print(initial_accuracy)

In [None]:
initial_accuracy = {}
y_predictions = {}

# Kernel used for each group of labels, visited in the same order as before:
# poly for label_1 / label_2, then rbf for label_3 / label_4
kernel_plan = [('poly', ['label_1', 'label_2']), ('rbf', ['label_3', 'label_4'])]

for kernel_name, label_group in kernel_plan:
  for label in label_group:
    print(label)
    clf = svm.SVC(kernel=kernel_name)
    clf.fit(pca_train[label], y_train[label])
    y_pred = clf.predict(pca_valid[label])
    initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
    y_pred_test = clf.predict(pca_test[label])
    y_predictions[label] = y_pred_test
    # NOTE(review): the per-label CSV receives the *validation* predictions (y_pred),
    # not the test predictions (y_pred_test) - confirm this is intended.
    output_df = pd.DataFrame({label: y_pred})
    output_df.to_csv(label+".csv", index=False, header=False)

print(initial_accuracy)

In [None]:
initial_accuracy = {}
y_predictions = {}

# Same per-label kernel choice as the previous trial, now with balanced class weights
kernel_for_label = {
    'label_1': 'poly',
    'label_2': 'poly',
    'label_3': 'rbf',
    'label_4': 'rbf',
}

for label, kernel_name in kernel_for_label.items():
  print(label)
  clf = svm.SVC(kernel=kernel_name, class_weight='balanced', gamma='scale')
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test

print(initial_accuracy)

In [None]:
# Random forest trial: 100 trees, depth limit 10, balanced class weights
initial_accuracy = {}
y_predictions = {}
rf_settings = dict(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
for label in labels:
  print(label)
  rf_classifier = RandomForestClassifier(**rf_settings)
  clf = rf_classifier.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(initial_accuracy[label])

print(initial_accuracy)

In [None]:
# Random forest trial: 500 trees, depth limit 10, balanced class weights
initial_accuracy = {}
y_predictions = {}
rf_settings = dict(n_estimators=500, max_depth=10, random_state=42, class_weight='balanced')
for label in labels:
  print(label)
  rf_classifier = RandomForestClassifier(**rf_settings)
  clf = rf_classifier.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(initial_accuracy[label])

print(initial_accuracy)

In [None]:
# Random forest trial: 500 trees, depth limit 20, balanced class weights
initial_accuracy = {}
y_predictions = {}
rf_settings = dict(n_estimators=500, max_depth=20, random_state=42, class_weight='balanced')
for label in labels:
  print(label)
  rf_classifier = RandomForestClassifier(**rf_settings)
  clf = rf_classifier.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(initial_accuracy[label])

print(initial_accuracy)

In [None]:
# SVC trial: degree-5 polynomial kernel, C=100, balanced class weights
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='poly', degree=5, gamma='scale', C=100, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

In [None]:
# SVC trial: RBF kernel, C=1, balanced class weights
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='rbf', gamma='scale', C=1, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

In [None]:
# SVC trial: RBF kernel, C=100, balanced class weights
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='rbf', gamma='scale', C=100, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

In [None]:
# SVC trial: RBF kernel, C=1000, balanced class weights
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='rbf', gamma='scale', C=1000, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

In [None]:
# SVC trial: linear kernel, C=1, balanced class weights
# NOTE(review): gamma is still passed here; presumably it has no effect with a linear kernel - confirm.
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='linear', gamma='scale', C=1, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

In [None]:
# SVC trial: linear kernel, C=100, balanced class weights
# NOTE(review): gamma is still passed here; presumably it has no effect with a linear kernel - confirm.
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='linear', gamma='scale', C=100, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

In [None]:
# SVC trial: linear kernel, C=1000, balanced class weights
# NOTE(review): gamma is still passed here; presumably it has no effect with a linear kernel - confirm.
initial_accuracy = {}
y_predictions = {}
svc_settings = dict(kernel='linear', gamma='scale', C=1000, class_weight='balanced')
for label in labels:
  print(label)
  clf = svm.SVC(**svc_settings)
  clf.fit(pca_train[label], y_train[label])
  y_pred = clf.predict(pca_valid[label])
  initial_accuracy[label] = metrics.accuracy_score(y_valid[label], y_pred)
  y_pred_test = clf.predict(pca_test[label])
  y_predictions[label] = y_pred_test
  print(label, initial_accuracy[label])

# Hyper-parameter Tuning

In [11]:
# Search space sampled by the randomized searches below
param_dist = dict(
    # Regularization strength: 10^-3 ... 10^3, one value per decade
    C=np.logspace(-3, 3, 7),
    # Kernel type
    kernel=['linear', 'poly', 'rbf'],
    # Degree of the polynomial kernel: 1 through 5
    degree=np.arange(1, 6),
    # Kernel coefficient for 'rbf' and 'poly': the two named heuristics plus the same decade grid as C
    gamma=['scale', 'auto', *np.logspace(-3, 3, 7)],
    # Weights associated with classes
    class_weight=['balanced'],
)

In [43]:
# Base estimator passed to every RandomizedSearchCV below; its hyper-parameters are set by the search
svc = SVC()

# Label 1

In [None]:
# Randomized search over `param_dist` for label_1: 5 sampled candidates, 3-fold CV, accuracy scoring
random_search_label_1 = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_iter=5,
    scoring='accuracy',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)
random_search_label_1.fit(pca_train['label_1'], y_train['label_1'])

# Keep the best estimator found and predict the test split with it
best_model_label_1 = random_search_label_1.best_estimator_
y_pred_label_1 = best_model_label_1.predict(pca_test['label_1'])

# Report the selected hyper-parameters and the search's best score as a percentage
print("Best Hyperparameters: ", random_search_label_1.best_params_)
print("Best Accuracy: {:.2f}%".format(random_search_label_1.best_score_ * 100))

In [None]:
# Score the tuned label_1 model on the validation split
y_pred_val_1 = best_model_label_1.predict(pca_valid['label_1'])
val_accuracy_1 = metrics.accuracy_score(y_valid['label_1'], y_pred_val_1)

print("Validation Accuracy: {:.2f}%".format(val_accuracy_1 * 100))

# Label 2

In [None]:
# Randomized search over `param_dist` for label_2: 5 sampled candidates, 3-fold CV, accuracy scoring
random_search_label_2 = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_iter=5,
    scoring='accuracy',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)
random_search_label_2.fit(pca_train['label_2'], y_train['label_2'])

# Keep the best estimator found and predict the test split with it
best_model_label_2 = random_search_label_2.best_estimator_
y_pred_label_2 = best_model_label_2.predict(pca_test['label_2'])

# Report the selected hyper-parameters and the search's best score as a percentage
print("Best Hyperparameters: ", random_search_label_2.best_params_)
print("Best Accuracy: {:.2f}%".format(random_search_label_2.best_score_ * 100))

In [None]:
# Score the tuned label_2 model on the validation split
y_pred_val_2 = best_model_label_2.predict(pca_valid['label_2'])
val_accuracy_2 = metrics.accuracy_score(y_valid['label_2'], y_pred_val_2)

print("Validation Accuracy: {:.2f}%".format(val_accuracy_2 * 100))

In [None]:
# Narrower, RBF-only search space for a second label_2 search
params_ = dict(
    kernel=['rbf'],          # Kernel type
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01],
)

# This search runs on the scaled features (x_train), not the PCA-reduced ones, and
# replaces random_search_label_2 / best_model_label_2 / y_pred_label_2 from the earlier cell.
# No `cv` is passed, so the library default applies.
random_search_label_2 = RandomizedSearchCV(
    estimator=svc,
    param_distributions=params_,
    n_iter=7,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search_label_2.fit(x_train['label_2'], y_train['label_2'])

# Keep the best estimator found and predict the (non-PCA) test features with it
best_model_label_2 = random_search_label_2.best_estimator_
y_pred_label_2 = best_model_label_2.predict(x_test['label_2'])

# Report the selected hyper-parameters and the search's best score as a percentage
print("Best Hyperparameters: ", random_search_label_2.best_params_)
print("Best Accuracy: {:.2f}%".format(random_search_label_2.best_score_ * 100))

# Label 3

In [None]:
# Randomized search over `param_dist` for label_3: 5 sampled candidates, 3-fold CV, accuracy scoring
random_search_label_3 = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_iter=5,
    scoring='accuracy',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)
random_search_label_3.fit(pca_train['label_3'], y_train['label_3'])

# Keep the best estimator found and predict the test split with it
best_model_label_3 = random_search_label_3.best_estimator_
y_pred_label_3 = best_model_label_3.predict(pca_test['label_3'])

# Report the selected hyper-parameters and the search's best score as a percentage
print("Best Hyperparameters: ", random_search_label_3.best_params_)
print("Best Accuracy: {:.2f}%".format(random_search_label_3.best_score_ * 100))

In [None]:
# Score the tuned label_3 model on the validation split
y_pred_val_3 = best_model_label_3.predict(pca_valid['label_3'])
val_accuracy_3 = metrics.accuracy_score(y_valid['label_3'], y_pred_val_3)

print("Validation Accuracy: {:.2f}%".format(val_accuracy_3 * 100))

# Label 4

In [None]:
# Randomized search over `param_dist` for label_4: 5 sampled candidates, 3-fold CV, accuracy scoring
random_search_label_4 = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_iter=5,
    scoring='accuracy',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)
random_search_label_4.fit(pca_train['label_4'], y_train['label_4'])

# Keep the best estimator found and predict the test split with it
best_model_label_4 = random_search_label_4.best_estimator_
y_pred_label_4 = best_model_label_4.predict(pca_test['label_4'])

# Report the selected hyper-parameters and the search's best score as a percentage
print("Best Hyperparameters: ", random_search_label_4.best_params_)
print("Best Accuracy: {:.2f}%".format(random_search_label_4.best_score_ * 100))

In [None]:
# Score the tuned label_4 model on the validation split
y_pred_val_4 = best_model_label_4.predict(pca_valid['label_4'])
val_accuracy_4 = metrics.accuracy_score(y_valid['label_4'], y_pred_val_4)

print("Validation Accuracy: {:.2f}%".format(val_accuracy_4 * 100))

# Save data to csv output

In [53]:
# Assemble the submission frame: test IDs followed by one prediction column per label
output_df = pd.DataFrame({
    'ID': test_df['ID'],
    'label_1': y_pred_label_1,
    'label_2': y_pred_label_2,
    'label_3': y_pred_label_3,
    'label_4': y_pred_label_4,
})

In [55]:
# Write the assembled predictions to disk, omitting the DataFrame index column
output_df.to_csv('Output_layer_12.csv', index=False)