<a href="https://colab.research.google.com/github/Su-ok/MT2025124_ML_Project/blob/main/Kag_Obesity_1_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# Mount Google Drive at /content/drive; the train/test CSVs are read from
# /content/drive/MyDrive later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ================================
#  AIT 511 Course Project 1 - Checkpoint 1
#  Multiclass Classification: Obesity Risk
#  Author: <MT2025124>
#  Model: Random Forest (RF)
# ================================

# --- Step 1: Imports and settings ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier

# Reproducibility: a single seed, used for NumPy's global RNG here and passed
# as `random_state=` wherever this notebook splits data or builds a model.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# --- Step 2: Load data ---
# One place to repoint the notebook at a different copy of the Kaggle CSVs.
DATA_DIR = "/content/drive/MyDrive/ML kaggle data"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test  = pd.read_csv(f"{DATA_DIR}/test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
# Only the last expression of a cell is rendered automatically; a bare
# `train.head()` in the middle of the cell is evaluated and thrown away.
# display() (provided by IPython in notebook kernels) shows the preview.
display(train.head())

# --- Step 3: Basic preprocessing ---
# Drop id column for training, keep for submission
X = train.drop(["WeightCategory", "id"], axis=1)
y = train["WeightCategory"]

# Encode categorical (object-dtype) features as integer codes.
# Each encoder is fit on the combined train + test values of the column:
# LabelEncoder.transform raises ValueError on a label it did not see during
# fit, so an encoder fit on train alone would crash on any category that only
# occurs in test. When test adds no new category the codes are identical to a
# train-only fit, because LabelEncoder assigns codes in sorted label order.
# NOTE: `test` is encoded in place, so later steps see the encoded columns.
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    if col in test.columns:
        le.fit(pd.concat([X[col], test[col]], ignore_index=True))
        test[col] = le.transform(test[col])
    else:
        # Column absent from test: nothing to align with, fit on train only.
        le.fit(X[col])
    X[col] = le.transform(X[col])

# Encode target labels; y_le is kept so predictions can be mapped back to the
# original WeightCategory strings with y_le.inverse_transform.
y_le = LabelEncoder()
y = y_le.fit_transform(y)

# --- Step 4: Split train/validation ---
# Split the *unscaled* features first so that preprocessing statistics for the
# validation estimate are learned from the training split only.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Feature scaling for the validation run: fit on the training split and apply
# the same transform to the held-out split. Fitting the scaler on all rows
# before splitting would let validation rows influence the mean/variance.
val_scaler = StandardScaler()
X_train = val_scaler.fit_transform(X_train_raw)
X_val = val_scaler.transform(X_val_raw)

# Full-data scaler, used when the model is refit on every labelled row and
# applied to the test set. Test columns are selected in the training column
# order so both frames are guaranteed to line up feature by feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test[X.columns])

# --- Step 5: Model training and evaluation ---
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
y_train_pred=model.predict(X_train)
y_val_pred=model.predict(X_val)

# Compare the two numbers: a large gap between training and validation
# accuracy indicates the forest is memorising the training split.
train_acc=accuracy_score(y_train, y_train_pred)
val_acc=accuracy_score(y_val, y_val_pred)
print("Training Accuracy:", train_acc)
print("Validation Accuracy:", val_acc)

# --- Final fit and submission ---
# Refit the same estimator on every labelled row before predicting the test set.
model.fit(X_scaled, y)

# Predict encoded class ids, then map them back to the original label strings.
test_preds = model.predict(test_scaled)
test_preds_labels = y_le.inverse_transform(test_preds)

# Two-column submission frame: the test ids plus the decoded predictions.
submission = test[["id"]].assign(WeightCategory=test_preds_labels)
submission.to_csv("submission_rf.csv", index=False)

print("\n✅ Submission file created: submission_rf.csv")
print("submission_rf shape:", submission.shape)
submission.head()


Train shape: (15533, 18)
Test shape: (5225, 17)
Training Accuracy: 1.0
Validation Accuracy: 0.8966849050531059

✅ Submission file created: submission_rf.csv
submission_rf shape: (5225, 2)


Unnamed: 0,id,WeightCategory
0,15533,Obesity_Type_III
1,15534,Overweight_Level_I
2,15535,Overweight_Level_II
3,15536,Obesity_Type_II
4,15537,Normal_Weight


In [None]:
# Report validation metrics under the original WeightCategory names instead of
# the integer codes produced by y_le. Passing `labels` explicitly keeps the
# report rows and the matrix rows/columns in encoder order even if a class
# were missing from the validation split.
class_names = [str(name) for name in y_le.classes_]
class_codes = list(range(len(class_names)))

print("Classification Report:")
print(classification_report(y_val, y_val_pred, labels=class_codes, target_names=class_names))

# Rows are true classes, columns are predicted classes, both in the order of
# `class_names` above.
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred, labels=class_codes))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       374
           1       0.86      0.89      0.88       469
           2       0.89      0.86      0.87       441
           3       0.96      0.98      0.97       481
           4       0.99      1.00      0.99       597
           5       0.80      0.73      0.76       369
           6       0.79      0.82      0.80       376

    accuracy                           0.90      3107
   macro avg       0.89      0.89      0.89      3107
weighted avg       0.90      0.90      0.90      3107

Confusion Matrix:
[[348  24   0   0   0   2   0]
 [ 26 417   0   0   0  21   5]
 [  0   0 379  17   4  13  28]
 [  0   0   9 469   1   0   2]
 [  0   0   1   0 596   0   0]
 [  4  37  12   0   0 269  47]
 [  0   5  27   4   0  32 308]]
