In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

RANDOM_STATE = 42

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
X = train_df.iloc[:, :-1]
y = train_df.iloc[:, -1]

In [4]:
print("Training data shape:", X.shape)
print("Test data shape:", test_df.shape)

Training data shape: (3053, 71)
Test data shape: (764, 71)


In [5]:

rf = RandomForestClassifier(random_state=RANDOM_STATE)
knn = KNeighborsClassifier()
svm = SVC(random_state=RANDOM_STATE)

In [6]:

f1_scorer = make_scorer(f1_score)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:

rf_baseline_score = np.mean(cross_val_score(rf, X, y, cv=5, scoring=f1_scorer))
knn_baseline_score = np.mean(cross_val_score(knn, X_scaled, y, cv=5, scoring=f1_scorer))
svm_baseline_score = np.mean(cross_val_score(svm, X_scaled, y, cv=5, scoring=f1_scorer))

print("Baseline Random Forest F1 score:", rf_baseline_score)
print("Baseline k-NN F1 score:", knn_baseline_score)
print("Baseline SVM F1 score:", svm_baseline_score)

Baseline Random Forest F1 score: 0.3537587050630529
Baseline k-NN F1 score: 0.22309402749002594
Baseline SVM F1 score: 0.1665437788018433


In [8]:
#Random Forest Tuning
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}
rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='f1', n_jobs=-1)
rf_grid.fit(X, y)
print("Best Random Forest parameters:", rf_grid.best_params_)
print("Best Random Forest F1 score:", rf_grid.best_score_)

Best Random Forest parameters: {'max_depth': None, 'n_estimators': 300}
Best Random Forest F1 score: 0.36828276219134864


In [9]:
#k-NN Tuning (using a pipeline to include scaling)
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
knn_param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance']
}
knn_grid = GridSearchCV(knn_pipeline, knn_param_grid, cv=5, scoring='f1')
knn_grid.fit(X, y)
print("Best k-NN parameters:", knn_grid.best_params_)
print("Best k-NN F1 score:", knn_grid.best_score_)

Best k-NN parameters: {'knn__n_neighbors': 3, 'knn__weights': 'distance'}
Best k-NN F1 score: 0.34594689009073976


In [10]:
#SVM Tuning (using a pipeline to include scaling)
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=RANDOM_STATE))
])
svm_param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['rbf', 'linear']
}
svm_grid = GridSearchCV(svm_pipeline, svm_param_grid, cv=5, scoring='f1')
svm_grid.fit(X, y)
print("Best SVM parameters:", svm_grid.best_params_)
print("Best SVM F1 score:", svm_grid.best_score_)

Best SVM parameters: {'svm__C': 10, 'svm__kernel': 'rbf'}
Best SVM F1 score: 0.4371006341740598


In [11]:

final_model = svm_grid.best_estimator_

In [12]:

final_model.fit(X, y)

In [13]:

X_test = test_df

predictions = final_model.predict(X_test)

In [14]:

with open('predictions.txt', 'w') as f:
    for pred in predictions:
        f.write(str(pred) + '\n')

print("Predictions saved to predictions.txt")

Predictions saved to predictions.txt
