In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

column_names = ["age", "workclass", "final_weight", "education", "education-num", "marital-status", 
                "occupation", "relationship", "race", "sex", "cap-gain", "cap-loss", "hours-per-week", 
                "native-country", "smaller/bigger"]

train_data = pd.read_csv("train.txt", header=None, names=column_names, na_values=" ?")
test_data = pd.read_csv("test.txt", header=None, names=column_names[:-1], na_values=" ?")

original_test_data = test_data.copy()

for column in train_data.columns:
    if train_data[column].dtype == "object":
        train_data[column] = train_data[column].fillna("?")
    else:
        train_data[column] = train_data[column].fillna(train_data[column].median())

train_data["smaller/bigger"] = train_data["smaller/bigger"].apply(lambda x: True if x.strip() == ">N" else False)

label_encoders = {}
for column in train_data.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le

X_train = train_data.drop("smaller/bigger", axis=1)
y_train = train_data["smaller/bigger"]

model = CatBoostClassifier(random_state=42)
model.fit(X_train, y_train)

for column in test_data.columns:
    if test_data[column].dtype == "object":
        test_data[column] = test_data[column].fillna("?")
    else:
        test_data[column] = test_data[column].fillna(test_data[column].median()) 
for column, le in label_encoders.items():
    test_data[column] = test_data[column].apply(lambda x: x if x in le.classes_ else "UNKNOWN")
    le.classes_ = np.append(le.classes_, "UNKNOWN")
    test_data[column] = le.transform(test_data[column])

y_pred = model.predict(test_data)

y_pred = np.where(y_pred, 'bigger', 'smaller')

report = classification_report(y_train, model.predict(X_train))
accuracy = accuracy_score(y_train, model.predict(X_train))

print(f"Classification Report:\n{report}")
print(f"Accuracy: {accuracy}")

with open("results.html", "w") as f:
    f.write(f"<h1>Classification Report</h1><pre>{report}</pre>")
    f.write(f"<h1>Accuracy</h1><p>{accuracy}</p>")


original_test_data["predicted_smaller/bigger"] = y_pred

original_test_data.to_html("test_predictions.html", index=False)


Learning rate set to 0.045596
0:	learn: 0.6513095	total: 142ms	remaining: 2m 21s
1:	learn: 0.6116545	total: 152ms	remaining: 1m 15s
2:	learn: 0.5785679	total: 161ms	remaining: 53.5s
3:	learn: 0.5481180	total: 170ms	remaining: 42.4s
4:	learn: 0.5254011	total: 181ms	remaining: 36.1s
5:	learn: 0.5021812	total: 193ms	remaining: 31.9s
6:	learn: 0.4836906	total: 204ms	remaining: 28.9s
7:	learn: 0.4651332	total: 214ms	remaining: 26.6s
8:	learn: 0.4522535	total: 225ms	remaining: 24.8s
9:	learn: 0.4386282	total: 235ms	remaining: 23.3s
10:	learn: 0.4283670	total: 245ms	remaining: 22s
11:	learn: 0.4162995	total: 254ms	remaining: 20.9s
12:	learn: 0.4073602	total: 264ms	remaining: 20.1s
13:	learn: 0.3999978	total: 274ms	remaining: 19.3s
14:	learn: 0.3931831	total: 284ms	remaining: 18.6s
15:	learn: 0.3882777	total: 293ms	remaining: 18s
16:	learn: 0.3837721	total: 303ms	remaining: 17.5s
17:	learn: 0.3784673	total: 314ms	remaining: 17.1s
18:	learn: 0.3740088	total: 323ms	remaining: 16.7s
19:	learn: 0.