In [None]:
# import sys
# sys.path.append("..")
# from common_utils import DATA_HOME
import pandas as pd
import sklearn
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
    classification_report,
    confusion_matrix,
)
from collections import Counter
import xgboost as xgb

# dataset = "playground-series-s4e2"

In [None]:
# Load the training split; the first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train_data = pd.read_csv(
    filepath_or_buffer="/Users/anyxling/datasets/playground-series-s4e2/train.csv",
    index_col=0,
)
# Display (rows, columns) as the cell output.
train_data.shape

In [None]:
# Preview the first five training rows.
train_data.head(5)

In [None]:
# Count the distinct (non-missing) values in the last column of the training frame,
# i.e. the number of classes.
train_data.iloc[:, -1].nunique(dropna=True)

In [None]:
# Load the test split; the first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_data = pd.read_csv(
    filepath_or_buffer="/Users/anyxling/datasets/playground-series-s4e2/test.csv",
    index_col=0,
)
# Display (rows, columns) as the cell output.
test_data.shape

In [None]:
# Stack train and test row-wise so later preprocessing is applied to both at once.
train_test = pd.concat(objs=[train_data, test_data], axis=0)
# Show the dtype of every column in the combined frame.
train_test.dtypes

In [None]:
# Summary statistics of the combined train+test frame.
train_test.describe()

In [None]:
# Detect outliers: for every float column, record (column, row label) for each
# value lying more than three standard deviations away from the column mean.
outliers = []
for col in train_test.select_dtypes(include="float").columns:
    col_mean = train_test[col].mean()
    col_std = train_test[col].std()
    upper_bound = col_mean + 3 * col_std
    lower_bound = col_mean - 3 * col_std
    col_outliers = train_test[
        (train_test[col] > upper_bound) | (train_test[col] < lower_bound)
    ]
    outliers.extend((col, idx) for idx in col_outliers.index)

print(outliers)

In [None]:
# handle outliers: cap every flagged column at mean ± 3 standard deviations.
#
# The bounds are computed once per column, so they are the same thresholds that
# were used to detect the outliers; they do not drift as values get replaced.
# The column is assigned back as a whole rather than through chained indexing
# (`train_test[col][idx] = ...`), which pandas does not guarantee writes to the
# original frame (it has no effect under Copy-on-Write).
for col in dict.fromkeys(name for name, _ in outliers):  # unique columns, order kept
    col_mean = train_test[col].mean()
    col_std = train_test[col].std()
    train_test[col] = train_test[col].clip(
        lower=col_mean - 3 * col_std,
        upper=col_mean + 3 * col_std,
    )

In [None]:
# Summary statistics of the combined frame, to inspect the value ranges again.
train_test.describe()

In [None]:
# Standardize every float column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on train and test rows together — confirm
# this is intended rather than fitting on the training rows only.
num_cols = train_test.select_dtypes(include="float").columns
sc = StandardScaler()
sc.fit(train_test[num_cols])
train_test[num_cols] = sc.transform(train_test[num_cols])
# Show the summary statistics of the scaled frame.
train_test.describe()

In [None]:
# Integer-encode every object-dtype column except the last column of the frame.
# The same encoder instance is re-fitted for each column, so after the loop
# `le_features` only holds the classes of the last encoded column.
str_cols = train_test.iloc[:, :-1].select_dtypes(include=["object"]).columns
le_features = LabelEncoder()
for col in str_cols:
    train_test[col] = le_features.fit_transform(train_test[col])
    # Print the original-value -> integer-code mapping for this column.
    print(col, {label: code for code, label in enumerate(le_features.classes_)})

In [None]:
# Separate the combined frame again: rows whose "NObeyesdad" value is missing
# are treated as the test split, rows with a value as the training split.
test_encoded = train_test[train_test["NObeyesdad"].isna()]
train_encoded = train_test[train_test["NObeyesdad"].notna()]

# Integer-encode the last column of the training rows (the labels); the encoder
# is kept so predictions can be decoded later.
le_labels = LabelEncoder()
y_train_encoded = le_labels.fit_transform(train_encoded.iloc[:, -1])
print(len(y_train_encoded))

In [None]:
# Count how often each encoded label occurs to judge class balance.
label_dist = Counter()
label_dist.update(y_train_encoded)
print(label_dist)

In [None]:
# First hold out 20% of the labelled rows (all columns but the last are features) ...
X_train, X_val_test, y_train, y_val_test = train_test_split(
    train_encoded.iloc[:, :-1],
    y_train_encoded,
    test_size=0.2,
    random_state=42,
)
# ... then cut the hold-out in half: one part for validation, one for testing.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    random_state=42,
)
# Display the shapes of the three feature sets.
(X_train.shape, X_val.shape, X_test.shape)

In [None]:
# Multi-class XGBoost classifier with early stopping on the validation split.
# `early_stopping_rounds` is set on the estimator (next to `eval_metric`):
# passing it to `fit()` was deprecated in xgboost 1.6 and removed in 2.0,
# where it raises a TypeError.
clf = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=7,
    eval_metric="mlogloss",
    early_stopping_rounds=10,  # stop once validation mlogloss stalls for 10 rounds
)
# The validation split is the evaluation set monitored for early stopping.
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

In [None]:
# Score the fitted classifier on the held-out test split.
pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print("accuracy score for test data:", accuracy)

In [None]:
# Baseline for comparison: a naive classifier that always predicts the most
# frequent training label, scored on the same held-out test split.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_pred_dummy = dummy_clf.predict(X_test)
accuracy_dummy = accuracy_score(y_true=y_test, y_pred=y_pred_dummy)
print("accuracy score for dummy classifier:", accuracy_dummy)

In [None]:
# Predict encoded labels for the unlabelled rows, using every column except the last.
y_pred = clf.predict(X=test_encoded.iloc[:, :-1])

In [None]:
# Map the integer predictions back to the original label values.
decoded_pred = le_labels.inverse_transform(y=y_pred)
# Display how many predictions were produced.
decoded_pred.shape

In [None]:
# Build the submission frame from the predicted rows themselves instead of
# re-reading test.csv from a hard-coded path. `test_encoded` is the frame that
# was passed to `clf.predict`, so its index is aligned with `decoded_pred`.
# NOTE(review): the index comes from `index_col=0` at load time — presumably the
# CSV's `id` column; confirm.
sub = pd.DataFrame({"id": test_encoded.index, "NObeyesdad": decoded_pred})
sub

In [None]:
# Write the submission file to the working directory without the row index.
sub.to_csv(path_or_buf="submission.csv", index=False)