In [None]:
# Step 1: Data Pre-processing
import pandas as pd
import numpy as np

# Folder holding the three CSV splits. Kept in one place so the notebook can
# be pointed at a different location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Project_Phase_1'

# Load the train / validation / test splits as DataFrames.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
valid_data = pd.read_csv(f'{DATA_DIR}/valid.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# All four label columns are removed from the feature matrices; only
# 'label_1' is used as the prediction target in this notebook.
label_columns = ['label_1', 'label_2', 'label_3', 'label_4']

df_train_X, df_train_y = train_data.drop(columns=label_columns), train_data['label_1']
df_valid_X, df_valid_y = valid_data.drop(columns=label_columns), valid_data['label_1']

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and validation features.
scaler = StandardScaler().fit(df_train_X)
df_train_X_scaled = scaler.transform(df_train_X)
df_valid_X_scaled = scaler.transform(df_valid_X)

# Sanity check: (n_samples, n_features) of the scaled training matrix.
print(df_train_X_scaled.shape)

(28520, 768)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

# Baseline: 5-nearest-neighbours trained on every scaled feature, before any
# feature selection or dimensionality reduction.
model_initial = KNeighborsClassifier(n_neighbors=5).fit(df_train_X_scaled, df_train_y)

# Score the baseline on the validation split.
y_pred = model_initial.predict(df_valid_X_scaled)
# NOTE: the variable name's spelling ("intial") is kept as-is so that any
# other cell referring to it still resolves.
accuracy_intial = accuracy_score(df_valid_y, y_pred)

print("Accuracy:", accuracy_intial)

Accuracy: 0.864


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selection: keep the 200 features with the highest
# f_classif score against the training labels (tune k as needed).
# The selector is fitted on the training split only and reused for validation.
k_best = SelectKBest(f_classif, k=200).fit(df_train_X_scaled, df_train_y)
df_train_X_selected = k_best.transform(df_train_X_scaled)
df_valid_X_selected = k_best.transform(df_valid_X_scaled)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Same 5-NN classifier as the baseline, now trained on the selected features.
model_after = KNeighborsClassifier(n_neighbors=5).fit(df_train_X_selected, df_train_y)

# Validation predictions for the feature-selected model.
y_pred = model_after.predict(df_valid_X_selected)

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the model trained on the selected features.
accuracy = accuracy_score(y_true=df_valid_y, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.896


In [None]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (98.5%) of the variance. Fitted on the training
# features only; validation is projected with the same fitted model.
pca = PCA(n_components=0.985).fit(df_train_X_selected)

pca_train_X = pca.transform(df_train_X_selected)
pca_valid_X = pca.transform(df_valid_X_selected)

# Sanity check: (n_samples, n_components kept).
print(pca_train_X.shape)

(28520, 165)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-NN trained on the PCA-reduced features. This rebinds `model_after`, so
# the model fitted on the SelectKBest features is no longer reachable by name.
model_after = KNeighborsClassifier(n_neighbors=5).fit(pca_train_X, df_train_y)

# "before" here means before hyper-parameter tuning.
y_pred_before = model_after.predict(pca_valid_X)

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the untuned 5-NN model on the PCA features.
accuracy = accuracy_score(y_true=df_valid_y, y_pred=y_pred_before)
print("Accuracy:", accuracy)

Accuracy: 0.8906666666666667


In [86]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for KNN: 2 x 2 x 2 = 8 combinations.
param_grid = dict(
    n_neighbors=[3, 5],                 # neighbourhood sizes to try
    weights=['uniform', 'distance'],    # equal vote vs. distance-weighted vote
    metric=['euclidean', 'manhattan'],  # distance metrics to try
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, using all
# available CPU cores.
# NOTE(review): the scaler, SelectKBest and PCA were fitted on the whole
# training set before this search, so every CV fold's held-out part has
# already influenced the preprocessing. Treat the CV scores as optimistic;
# wrapping the preprocessing and the classifier in a Pipeline would avoid this.
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the PCA-reduced training data.
grid_search.fit(pca_train_X, df_train_y)

In [87]:
# Winning hyper-parameter combination and the corresponding fitted estimator
# exposed by the finished grid search.
best_params, best_estimator = grid_search.best_params_, grid_search.best_estimator_

In [88]:
# Display the hyper-parameters selected by the grid search.
best_params

{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}

In [89]:
# Evaluate the tuned estimator on the PCA-projected validation features.
y_pred_tuned = best_estimator.predict(pca_valid_X)
accuracy_tuned = accuracy_score(y_true=df_valid_y, y_pred=y_pred_tuned)

print("Tuned Accuracy:", accuracy_tuned)

Tuned Accuracy: 0.8933333333333333


In [91]:
# Drop the "ID" column from the test data so only feature columns remain.
# errors="ignore" keeps this cell re-runnable: `test_data` is rebound here, so
# on a second run "ID" is already gone and a plain drop would raise KeyError.
test_data = test_data.drop(columns=["ID"], errors="ignore")

# Push the test features through the exact preprocessing chain that was
# fitted on the training data (nothing is re-fitted here):
# 1) scale with the fitted StandardScaler
df_test_X_scaled = scaler.transform(test_data)

# 2) keep the same features chosen by the fitted SelectKBest
df_test_X_selected = k_best.transform(df_test_X_scaled)

# 3) project with the fitted PCA model
pca_test_X = pca.transform(df_test_X_selected)

In [92]:
# Predict test labels with the estimator selected by the grid search, using
# the PCA-projected test features.
y_pred_test = best_estimator.predict(X=pca_test_X)

In [94]:
# Wrap the predicted labels in a single-column frame named 'Predictions'.
# Assumes `y_pred_test` is a 1-D numpy array or list (as returned by predict).
predictions_df = pd.DataFrame(y_pred_test, columns=['Predictions'])

# Write the predictions to CSV without the row index.
predictions_df.to_csv('predictions1_layer9.csv', index=False)