# 🏆 Machine Learning Competition Notebook
This notebook contains a full ML pipeline including preprocessing, EDA, model training, tuning, and submission file generation.

## 📌 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Printed only after every import above has resolved without error.
print('Libraries imported successfully!')

## 📌 2. Load Dataset

In [None]:
# File locations and the two key column names used throughout the notebook.
train_path, test_path = "train.csv", "test.csv"
target_col, id_col = "NObeyesdad", "id"

# Read both competition files into DataFrames.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Report (rows, columns) for each frame.
print(f"Train Shape: {train.shape}")
print(f"Test Shape: {test.shape}")

## 📌 3. Data Overview

In [None]:
# Show the first five rows of the training frame.
train.head(n=5)

## 📌 4. Identify Numeric & Categorical Columns

In [None]:
# Target vector, feature matrix, and the test ids needed for the submission.
y = train[target_col]
# id_col is a row identifier, not a feature: leaving it in X would let the
# model train on it. errors="ignore" tolerates a train file without that column.
X = train.drop(columns=[target_col, id_col], errors="ignore")
test_ids = test[id_col]

# Classify by dtype family instead of comparing against "object": categorical
# and string-dtype columns are not "object" and would otherwise be sent to the
# numeric (median + scaling) branch. Booleans stay numeric, as before.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

cat_cols, num_cols

## 📊 5. Exploratory Data Analysis (Optional)

In [None]:
# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train, x=target_col, ax=ax)
plt.xticks(rotation=45)
plt.show()

# One histogram (with a KDE overlay) per numeric column.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(train[feature], kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    plt.show()

## 📈 6. Outlier Analysis (Optional)

In [None]:
# One boxplot per numeric column to make extreme values visible.
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=train[feature], ax=ax)
    ax.set_title(f"Boxplot of {feature}")
    plt.show()

## 🔥 7. Correlation Heatmap (Optional)

In [None]:
# Work on a copy of the numeric columns so `train` itself is never modified.
corr_df = train[num_cols].copy()
# Median-impute the whole frame in one assignment. The per-column form
# `corr_df[col].fillna(..., inplace=True)` operates on a temporary Series:
# it raises a FutureWarning on pandas 2.x and leaves corr_df unchanged once
# Copy-on-Write is active.
corr_df = corr_df.fillna(corr_df.median())

# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(12,8))
sns.heatmap(corr_df.corr(), cmap='coolwarm')
plt.show()

## ⚙️ 8. Preprocessing Pipeline

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

## 🤖 9. Build Random Forest Model

In [None]:
# Preprocessing followed by a random forest with a fixed seed; the step name
# "clf" is the prefix the tuning grid uses for its parameters.
forest = RandomForestClassifier(random_state=42)
model = Pipeline([("preprocessor", preprocessor), ("clf", forest)])

## ✂️ 10. Train/Validation Split

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the target keeps
# the class proportions equal in both parts, so the validation metrics are
# not distorted by an unlucky draw of a small class.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 🏋️ 11. Train Model & Evaluate

In [None]:
# Fit on the training part, then predict the held-out part.
fitted = model.fit(X_train, y_train)
preds = fitted.predict(X_valid)

# Overall accuracy followed by the per-class report.
valid_accuracy = accuracy_score(y_valid, preds)
valid_report = classification_report(y_valid, preds)
print("Accuracy:", valid_accuracy)
print(valid_report)

## 🎯 12. Hyperparameter Tuning

In [None]:
# Candidate values for the forest; the "clf__" prefix addresses the
# classifier step inside the pipeline.
param_grid = dict(
    clf__n_estimators=[200, 300, 500],
    clf__max_depth=[10, 20, None],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

# Keep the winning pipeline and display it as the cell output.
best_model = grid.best_estimator_
best_model

## 🧪 13. Generate Predictions

In [None]:
# The grid search was fitted on X_train only, so best_model has not seen the
# validation rows. Refit the chosen configuration on every labelled row before
# predicting, so the submission model learns from all available data.
best_model.fit(X, y)
test_preds = best_model.predict(test)

## 📤 14. Create Submission File

In [None]:
# Two-column frame: the test ids first, then the predicted class.
submission_columns = {id_col: test_ids, target_col: test_preds}
submission = pd.DataFrame(submission_columns)

# Write without the index so the file holds only those two columns.
submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully!")