In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

column_names = ["age", "workclass", "final_weight", "education", "education-num", "marital-status", 
                "occupation", "relationship", "race", "sex", "cap-gain", "cap-loss", "hours-per-week", 
                "native-country", "smaller/bigger"]

train_data = pd.read_csv("train.txt", header=None, names=column_names, na_values=" ?")
test_data = pd.read_csv("test.txt", header=None, names=column_names[:-1], na_values=" ?")

original_test_data = test_data.copy()

for column in train_data.columns:
    if train_data[column].dtype == "object":
        train_data[column] = train_data[column].fillna("?")
    else:
        train_data[column] = train_data[column].fillna(train_data[column].median())

train_data["smaller/bigger"] = train_data["smaller/bigger"].apply(lambda x: True if x.strip() == ">N" else False)

label_encoders = {}
for column in train_data.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le

X_train = train_data.drop("smaller/bigger", axis=1)
y_train = train_data["smaller/bigger"]

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

for column in test_data.columns:
    if test_data[column].dtype == "object":
        test_data[column] = test_data[column].fillna("?")
    else:
        test_data[column] = test_data[column].fillna(test_data[column].median())

for column, le in label_encoders.items():
    test_data[column] = test_data[column].apply(lambda x: x if x in le.classes_ else "UNKNOWN")
    le.classes_ = np.append(le.classes_, "UNKNOWN")
    test_data[column] = le.transform(test_data[column])

y_pred = model.predict(test_data)

y_pred = np.where(y_pred, 'bigger', 'smaller')

report = classification_report(y_train, model.predict(X_train))
accuracy = accuracy_score(y_train, model.predict(X_train))

print(f"Classification Report:\n{report}")
print(f"Accuracy: {accuracy}")

with open("results.html", "w") as f:
    f.write(f"<h1>Classification Report</h1><pre>{report}</pre>")
    f.write(f"<h1>Accuracy</h1><p>{accuracy}</p>")

original_test_data["predicted_smaller/bigger"] = y_pred

original_test_data.to_html("test_predictions.html", index=False)

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     24722
        True       1.00      1.00      1.00      7851

    accuracy                           1.00     32573
   macro avg       1.00      1.00      1.00     32573
weighted avg       1.00      1.00      1.00     32573

Accuracy: 0.9999692997267675
