In [None]:
# import sys
# sys.path.append("..")
# from common_utils import DATA_HOME
import pandas as pd
import sklearn
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, classification_report, confusion_matrix
from collections import Counter
# dataset = "playground-series-s4e2"


In [None]:
# DATA_HOME
# Read the training CSV; its first column is used as the row index.
train_data = pd.read_csv(
    "/Users/anyxling/datasets/playground-series-s4e2/train.csv",
    index_col=0,
)
# Summary statistics are printed as plain text.
print(train_data.describe())
train_data.head()  # not the last expression of the cell, so it is not rendered
# Cell output: the distinct values present in the CALC column.
set(train_data["CALC"])

In [None]:
# Read the competition test CSV; its first column is used as the row index.
test_data = pd.read_csv(
    '/Users/anyxling/datasets/playground-series-s4e2/test.csv',
    index_col=0,
)
# Summary statistics are printed as plain text.
print(test_data.describe())
test_data.head()  # not the last expression of the cell, so it is not rendered
# Cell output: the test rows whose CALC value is "Always".
test_data.loc[test_data["CALC"] == "Always"]

In [None]:
# Split the training frame into features (every column but the last) and the
# label (the last column). `.copy()` detaches both from `train_data`, so the
# column assignments below do not raise SettingWithCopyWarning and cannot
# write through to the raw frame.
X_train_data = train_data.iloc[:, :-1].copy()
y_train_data = train_data.iloc[:, -1].copy()

# Integer-encode every object-dtype feature column of the training features.
str_cols_X_train = X_train_data.select_dtypes(include=['object']).columns
le_features = LabelEncoder()
for col in str_cols_X_train:
    # The single encoder is refit for each column, so after the loop it only
    # remembers the classes of the last column; the mapping is printed here
    # for reference.
    X_train_data[col] = le_features.fit_transform(X_train_data[col])
    print(col, dict(zip(le_features.classes_, range(len(le_features.classes_)))))
X_train_data.head()

In [None]:
# Map the training labels to integer codes. `le_labels` stays fitted so that
# integer predictions can later be mapped back to the original label values.
le_labels = LabelEncoder()
le_labels.fit(y_train_data)
y_train_data_encoded = le_labels.transform(y_train_data)
y_train_data_encoded.shape, X_train_data.shape  # not rendered (not the last expression)
# Cell output: the original, un-encoded label column.
y_train_data

In [None]:
# Encode the categorical (object-dtype) test features with the SAME
# category -> integer mapping that the training features received.
#
# Refitting the encoder on the test column would number categories by the
# sorted set of *test* values, so a category could get a different code than
# it had during training whenever the two sets differ.
str_cols_test = test_data.select_dtypes(include=['object']).columns
for col in str_cols_test:
    # Fit on the raw training column (`train_data`, not `X_train_data`, whose
    # categorical columns have already been replaced by integers). This
    # reproduces the mapping used when the training features were encoded.
    le_features.fit(train_data[col])
    mapping = {cls: code for code, cls in enumerate(le_features.classes_)}

    # Categories that never occurred in training get fresh codes after the
    # training range, leaving every known category's code untouched.
    unseen = sorted(set(test_data[col].unique()) - set(mapping))
    for cls in unseen:
        mapping[cls] = len(mapping)
    if unseen:
        print(col, "categories not present in the training data:", unseen)

    test_data[col] = test_data[col].map(mapping).astype(int)
    print(col, mapping)
test_data.head()

In [None]:
# Split into train / validation / test (90% / 5% / 5%) and scale the features.
#
# The split is done on the unscaled features so that the scaler can be fitted
# on the training rows only; fitting it on all rows would leak validation and
# test statistics into the preprocessing. The same random_state and test_size
# values are used, so row membership of each split is unchanged.
X_train_raw, X_val_test_raw, y_train, y_val_test = train_test_split(
    X_train_data, y_train_data_encoded, test_size=0.1, random_state=42
)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_val_test_raw, y_val_test, test_size=0.5, random_state=42
)

sc = StandardScaler()
X_train = sc.fit_transform(X_train_raw)      # learn mean/std from the training split
X_val = sc.transform(X_val_raw)              # reuse the training statistics
X_test = sc.transform(X_test_raw)
X_val_test = sc.transform(X_val_test_raw)    # kept so the existing name stays defined

# Scaled version of the complete training frame, using the same statistics.
X_train_data_scaled = sc.transform(X_train_data)

X_train.shape, X_val.shape, X_test.shape  # not rendered (not the last expression)
pd.DataFrame(X_train_data_scaled).describe()

In [None]:
# Count how many training samples fall into each encoded class, to see
# whether the classes are balanced.
train_class_distribution = Counter()
train_class_distribution.update(y_train_data_encoded)
print(train_class_distribution)

In [None]:
# Grid-search the SVM hyper-parameters with cross-validation.
#
# The grid is given per kernel: `gamma` is a coefficient of the RBF kernel and
# has no effect on the linear kernel, so crossing it with 'linear' would only
# repeat identical fits. The set of distinct models searched is unchanged.
param_grid = [
    {'kernel': ['linear'], 'C': [1, 10, 100]},
    {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': ['scale', 'auto', 1, 0.1]},
]

# Metrics recorded for every candidate; precision and recall are
# macro-averaged over the classes.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

svm = SVC()
# refit='accuracy': the candidate with the best mean CV accuracy is retrained
# on all of X_train and exposed through best_estimator_/best_params_/best_score_.
grid_search = GridSearchCV(svm, param_grid, scoring=scoring, refit='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameter combination and its score as stored on
# the fitted grid search.
for label, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best score:", grid_search.best_score_),
):
    print(label, value)

In [None]:
# Evaluate the refitted best estimator on the validation split.
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val)
# For a classifier, `score` is the accuracy of its predictions.
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation accuracy:", val_accuracy)

In [None]:
# Baseline for the validation accuracy: a naive classifier that always
# predicts the most frequent class seen in the training split.
dummy_clf = DummyClassifier(strategy='most_frequent') # naive classifier, majority vote
dummy_clf.fit(X_train, y_train)
# Score the baseline on the validation split so the number is directly
# comparable with the model's validation accuracy, and so the held-out test
# split stays untouched until the final evaluation.
y_pred_dummy = dummy_clf.predict(X_val)
accuracy_dummy = accuracy_score(y_val, y_pred_dummy)
print("accuracy score for dummy classifier:", accuracy_dummy)

In [None]:
# Final check on the held-out test split: accuracy, per-class report and
# confusion matrix of the best model's predictions.
test_pred = best_model.predict(X_test)
# For a classifier, `score` is the accuracy of its predictions.
test_accuracy = accuracy_score(y_test, test_pred)
print("test accuracy:", test_accuracy)
report = classification_report(y_test, test_pred)
print(report)
# Cell output: the confusion matrix.
confusion_matrix(y_test, test_pred)

In [None]:
# Scale the competition test features with the scaler that was already fitted
# on the training data. Calling `fit_transform` here would re-estimate the
# mean/std from the test set, putting the features on a different scale than
# the one the model was trained on.
test_data_scaled = sc.transform(test_data)
# Predict encoded class labels for the competition test set
# (this rebinds `test_pred`, previously the held-out split predictions).
test_pred = best_model.predict(test_data_scaled)


In [None]:
# Map the integer predictions back to the original label values using the
# encoder fitted on the training labels.
decoded_pred = le_labels.inverse_transform(y=test_pred)
decoded_pred

In [None]:
# Build the submission table: one row per test sample with its id and the
# predicted label.
# `test_data` was read with `index_col=0`, which moves the first CSV column
# into the index, so the ids are taken from the index; `test_data['id']`
# would raise KeyError. The column lookup is kept as a fallback in case the
# frame was loaded with `id` as a regular column.
test_ids = test_data["id"] if "id" in test_data.columns else test_data.index
sub = pd.DataFrame({
    "id": test_ids,
    "NObeyesdad": decoded_pred
})
sub

In [None]:
# Write the submission table to disk; the row index is not written.
sub.to_csv(path_or_buf='submission.csv', index=False)