In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

In [2]:
# Load the labelled training data; the path is resolved relative to the
# current working directory.
df = pd.read_csv("train.csv")

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# Target column the model is trained to predict.
labels = df["NObeyesdad"]

# Model inputs: every column except the target and the "id" column.
features = df.drop(columns=["NObeyesdad", "id"])

In [5]:
# Feature columns that get one-hot encoded further down. In the df.head()
# preview these columns hold string values rather than numbers.
categorical_columns = [
    "Gender", "family_history_with_overweight",
    "FAVC", "CAEC", "SMOKE", "SCC",
    "CALC", "MTRANS",
]


class OneHotEncoder():
    """Minimal one-hot encoder for a single DataFrame column.

    ``fit`` records the distinct values found in ``column_name``;
    ``transform`` replaces that column with one boolean indicator column per
    recorded value, named ``"<column_name>_<value>"``. Fitting once and
    re-using the instance guarantees that every frame passed to ``transform``
    gets the same indicator columns in the same order.
    """

    def __init__(self, column_name):
        """Create an unfitted encoder for the column called ``column_name``."""
        self.column_name = column_name
        self.values = []

    def fit(self, data):
        """Remember the distinct non-missing values of the column in ``data``.

        Missing values are skipped: they can never match an equality test, so
        an indicator column for them would be all ``False``.
        """
        self.values = data[self.column_name].dropna().unique()

    def transform(self, data):
        """Return a new frame with the column replaced by indicator columns.

        The input frame is left untouched. Indicator columns are appended
        after the remaining original columns, in the order the values were
        recorded by ``fit``. A value that was not seen during ``fit`` yields
        ``False`` in every indicator column.

        Raises:
            KeyError: if ``data`` has no column named ``column_name``.
        """
        source = data[self.column_name]
        # ``drop`` returns a new frame, so adding columns below does not
        # modify the caller's DataFrame.
        encoded = data.drop(columns=[self.column_name])
        for value in self.values:
            # f-string instead of ``+`` so non-string categories do not raise.
            encoded[f"{self.column_name}_{value}"] = source == value
        return encoded
    
# One encoder per categorical column. The fitted encoders are kept in this
# list so the same encoding can be replayed on other data later.
one_hot_encoders = [OneHotEncoder(name) for name in categorical_columns]

# Fit each encoder on the training features and immediately apply it, so
# `features` ends up with every categorical column replaced by indicators.
for encoder in one_hot_encoders:
    encoder.fit(features)
    features = encoder.transform(features)


In [28]:
# One-hot encode the target: one indicator column per class. `drop_first` is
# left at its default (False) so every class keeps its own column.
labels = pd.get_dummies(labels)

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. Note that the held-out targets are named `y_val` while the
# held-out features are named `X_test`.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_val = split_parts

In [30]:
# Convert all four splits to float32 NumPy arrays. The tuple on the right is
# built from the current objects before any name is rebound.
X_train, X_test, y_train, y_val = (
    split.to_numpy(dtype=np.float32)
    for split in (X_train, X_test, y_train, y_val)
)

In [31]:
# Standardise the features, then fit the classifier.

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the held-out rows leak into training.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Small random forest with a fixed seed. `y_train` is the one-hot label
# matrix, so the forest is fitted against a 2-D target.
forest_params = {"n_estimators": 10, "max_depth": 20, "random_state": 0}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)


In [32]:
# Score on the held-out split. NOTE(review): `y_val` is one-hot encoded, so
# this is presumably subset accuracy over all indicator columns - confirm.
model.score(X=X_test, y=y_val)

0.802504816955684

In [34]:
test_data = pd.read_csv('test.csv')

# Keep the ids for the submission file; "id" itself is not a model input.
# (Reading the column already raises KeyError if it is missing, so no
# try/except is needed around the drop.)
ids = test_data["id"]
test_data = test_data.drop(columns=["id"])

# Replay the encoders fitted on the training data so the test frame gets the
# same indicator columns as the training features.
for encoder in one_hot_encoders:
    test_data = encoder.transform(test_data)

# Enforce the exact training column order before dropping the column names.
# A missing column fails loudly here instead of silently shifting features.
test_data = test_data[features.columns]

test_data = test_data.to_numpy(dtype=np.float32)
test_data = scaler.transform(test_data)

# The model was fitted on one-hot labels, so `predict` returns one hard 0/1
# decision per class and a row can come back all zeros; argmax would then
# silently pick class 0. Decode from the per-class probabilities instead:
# `predict_proba` yields one array per label column, and the column of
# interest is the one for indicator value 1.
class_scores = []
for proba, classes in zip(model.predict_proba(test_data), model.classes_):
    positive = np.flatnonzero(classes == 1)
    # A class absent from the training split has no "1" column; score it 0.
    class_scores.append(
        proba[:, positive[0]] if positive.size else np.zeros(len(proba))
    )
outputs = np.column_stack(class_scores)

# Class names in the SAME order as the one-hot label columns the model was
# trained on. `Series.unique()` returns order of appearance, which does not
# match the column order produced by `pd.get_dummies`.
label_values = pd.get_dummies(df["NObeyesdad"]).columns.to_numpy()

output_indices = np.argmax(outputs, axis=1)
output_labels = [label_values[i] for i in output_indices]

In [35]:
# Two-column submission: the test ids paired with the predicted class names.
submission_columns = {"id": ids, "NObeyesdad": output_labels}
submission = pd.DataFrame(submission_columns)

# Write without the DataFrame index so the file contains only the two columns.
submission.to_csv("submission.csv", index=False)