In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Input locations on the mounted Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/train.csv'
PREDICT_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/test.csv'

# `df` is the data that gets split into train / hold-out below;
# `df_pred` is the separate file that final predictions are produced for.
df = pd.read_csv(TRAIN_CSV_PATH)
df_pred = pd.read_csv(PREDICT_CSV_PATH)

# **1. Train and Test Dataset Split:**

In [None]:
# Hold out 30% of the rows of `df` for evaluation. The fixed seed keeps the
# shuffled split reproducible from run to run.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,       # 30% for test
    random_state=42,
    shuffle=True
)

# Later cells add engineered columns to both frames. Take explicit copies so
# those assignments act on independent frames rather than on slice-derived
# objects (which can raise pandas' SettingWithCopyWarning, depending on the
# pandas version) and can never write through to `df`.
df_train = df_train.copy()
df_test = df_test.copy()

# **2. Exploratory Data Analysis:**

Target Distribution:

In [None]:
# Number of training rows per WeightCategory value.
sns.countplot(data=df_train, x='WeightCategory')

Bivariate Analysis (WeightCategory vs. numerical features):

In [None]:
# Distribution of Age within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='Age')

In [None]:
# Distribution of Height within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='Height')

In [None]:
# Distribution of Weight within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='Weight')

In [None]:
# Distribution of FCVC within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='FCVC')

In [None]:
# Distribution of NCP within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='NCP')

In [None]:
# Distribution of CH2O within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='CH2O')

In [None]:
# Distribution of FAF within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='FAF')

In [None]:
# Distribution of TUE within each WeightCategory (training split).
sns.boxplot(data=df_train, x='WeightCategory', y='TUE')

Bivariate Analysis (WeightCategory vs. categorical features):

In [None]:
# WeightCategory counts, broken down by Gender.
sns.countplot(data=df_train, x='WeightCategory', hue='Gender')

In [None]:
# WeightCategory counts, broken down by family_history_with_overweight.
sns.countplot(data=df_train, x='WeightCategory', hue='family_history_with_overweight')

In [None]:
# WeightCategory counts, broken down by FAVC.
sns.countplot(data=df_train, x='WeightCategory', hue='FAVC')

In [None]:
# WeightCategory counts, broken down by SMOKE.
sns.countplot(data=df_train, x='WeightCategory', hue='SMOKE')

In [None]:
# WeightCategory counts, broken down by SCC.
sns.countplot(data=df_train, x='WeightCategory', hue='SCC')

In [None]:
# WeightCategory counts, broken down by MTRANS.
sns.countplot(data=df_train, x='WeightCategory', hue='MTRANS')

Numerical/categorical column split and correlation heatmap for numerical features:

In [None]:
# Numeric (non-boolean) columns of the training split.
numerical_df_train = df_train.select_dtypes(include=np.number, exclude=['bool'])

# Object / category / boolean columns of the training split, without CAEC and CALC.
categorical_df_train = (
    df_train
    .select_dtypes(include=['object', 'category', 'bool'])
    .drop(columns=['CAEC', 'CALC'])
)

In [None]:
# Peek at 10 randomly chosen rows of the numeric columns.
numerical_df_train.sample(n=10)

In [None]:
# Peek at 10 randomly chosen rows of the categorical columns.
categorical_df_train.sample(n=10)

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between the numeric training columns, drawn as an
# annotated heatmap on an explicitly created figure/axes pair.
num_df = numerical_df_train.select_dtypes(include=np.number)
correlation_matrix = num_df.corr()

fig, ax = plt.subplots(figsize=(10, 7))
heatmap_options = dict(
    annot=True,      # print the coefficient inside every cell
    cmap='viridis',
    fmt=".2f",
    linewidths=.5,
    cbar=True,
    square=True,
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Heatmap of Pairwise Correlation Between Numerical Features', fontsize=16)

plt.show()

# **3. Feature Engineering:**

In [None]:
# Body-mass index: weight divided by the SQUARE of height. The previous
# Weight / Height ratio was not BMI despite the column name.
# NOTE(review): assumes Weight is in kilograms and Height in metres — confirm
# the units of the source data.
# The same definition is applied to the training, hold-out and prediction
# frames so that all three expose an identical `BMI` column.
for frame in (df_train, df_test, df_pred):
    frame['BMI'] = frame['Weight'] / frame['Height'] ** 2

In [None]:
# `AI` = FAF minus TUE, added identically to the training, hold-out and
# prediction frames.
for frame in (df_train, df_test, df_pred):
    frame['AI'] = frame['FAF'] - frame['TUE']

In [None]:
# `FoodConsumption` = FCVC times NCP, added identically to the training,
# hold-out and prediction frames.
for frame in (df_train, df_test, df_pred):
    frame['FoodConsumption'] = frame['FCVC'] * frame['NCP']

In [None]:
# Combined categorical feature: the CAEC and CALC values rendered as strings
# and joined with an underscore ("<CAEC>_<CALC>").
for frame in (df_train, df_test, df_pred):
    caec_text = frame['CAEC'].astype(str)
    calc_text = frame['CALC'].astype(str)
    frame['snacking_alcohol_habit'] = caec_text + '_' + calc_text

# **4. Data Preprocessing:**

Encoding and scaling features (for train dataset):

In [None]:
# Target and predictors for the training split. `id` and `CALC` are left out
# of the model inputs; errors='ignore' tolerates any listed column being absent.
y_train = df_train['WeightCategory']
x_train = df_train.drop(columns=['WeightCategory', 'id', 'CALC'], errors='ignore')

# Column groups are picked purely by dtype.
categorical_features = x_train.select_dtypes(include=['object']).columns
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns

# One-hot encode the object columns (first level dropped, unseen levels
# ignored at transform time) and standardise the int64/float64 columns.
# Any column in neither group is passed through untouched.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
standard_scaler = StandardScaler()
transformer = ColumnTransformer(
    transformers=[
        ('tnf', one_hot_encoder, categorical_features),
        ('num', standard_scaler, numerical_features),
    ],
    remainder='passthrough',
)

# Fit on the training split only; later cells call transform() with this
# fitted object on the hold-out and prediction frames.
x_train_processed_array = transformer.fit_transform(x_train)
x_train_column_names = transformer.get_feature_names_out()
df_train_processed_x = pd.DataFrame(
    data=x_train_processed_array,
    columns=x_train_column_names,
)

pd.set_option('display.max_columns', None)
df_train_processed_x.sample(10)

Label Encoding:

In [None]:
# Encode the string class labels as integer codes. `le` is kept so that
# hold-out labels can be encoded and predictions decoded with the same mapping.
le = LabelEncoder()
y_train_processed = le.fit_transform(y_train)

df_train_processed_y = pd.DataFrame({'WeightCategory': y_train_processed})

Encoding and scaling features (for test dataset):

In [None]:
# Target and predictors for the hold-out split, mirroring the training cell.
y_test = df_test['WeightCategory']
x_test = df_test.drop(columns=['WeightCategory', 'id', 'CALC'], errors='ignore')

# Column lists recomputed from the hold-out frame. The transformer below was
# already fitted, so transform() does not read these rebinding results.
categorical_features = x_test.select_dtypes(include=['object']).columns
numerical_features = x_test.select_dtypes(include=['int64', 'float64']).columns

# Apply (never re-fit) the transformer learned on the training split.
x_test_processed_array = transformer.transform(x_test)
x_test_column_names = transformer.get_feature_names_out()
df_test_processed_x = pd.DataFrame(
    data=x_test_processed_array,
    columns=x_test_column_names,
)

pd.set_option('display.max_columns', None)
df_test_processed_x.sample(10)

In [None]:
# Encode the hold-out labels with the mapping learned from the training labels.
y_test_processed = le.transform(y_test)

df_test_processed_y = pd.DataFrame({'WeightCategory': y_test_processed})

In [None]:
# Predictors for the prediction file. errors='ignore' lets the same drop list
# be used whether or not the target / id / CALC columns are present.
df_pred_x = df_pred.drop(columns=['WeightCategory', 'id', 'CALC'], errors='ignore')

# Column lists recomputed from the prediction frame. The transformer below was
# already fitted, so transform() does not read these rebinding results.
categorical_features = df_pred_x.select_dtypes(include=['object']).columns
numerical_features = df_pred_x.select_dtypes(include=['int64', 'float64']).columns

# Apply (never re-fit) the transformer learned on the training split.
# The stray `x_pred_processed_array.shape` expression was removed: it was not
# the last expression of the cell, so its value was computed and discarded.
x_pred_processed_array = transformer.transform(df_pred_x)
x_pred_column_names = transformer.get_feature_names_out()
df_pred_processed_x = pd.DataFrame(x_pred_processed_array, columns=x_pred_column_names)


In [None]:
# Show every column, then peek at 10 random rows of the processed prediction features.
pd.set_option('display.max_columns', None)
df_pred_processed_x.sample(n=10)

# **5. Hyperparameter Tuning:**

Randomized Search CV:

In [None]:
# Base estimator for the randomized search.
# The multiclass objective and the number of classes are deliberately NOT set
# here: XGBClassifier derives both from the labels at fit time. The previous
# hard-coded `num_class=3` contradicted the seven WeightCategory classes used
# elsewhere in this notebook, and leaving the objective unset matches the
# final model defined further down.
# `use_label_encoder` is omitted as well: the labels are already integer codes
# produced by `le`, and the option is deprecated in current XGBoost releases.
xgb_base = XGBClassifier(
    tree_method='hist',
    random_state=42,
    eval_metric='mlogloss'
)

# Search space. Only parameters that XGBoost actually consumes are listed.
# `min_child_samples` and `num_leaves` were removed: they are LightGBM names,
# XGBoost reports them as unused, and sampling them only wasted search budget.
# (The XGBoost counterparts are `min_child_weight`, already searched below,
# and `max_leaves`.)
param_dist = {
    'n_estimators': np.arange(400, 700, 20),
    'learning_rate': np.linspace(0.01, 0.05, 10),
    'max_depth': np.arange(4, 12),
    'min_child_weight': np.arange(1, 5),
    'gamma': np.linspace(0, 1, 20),
    'subsample': np.linspace(0.4, 1, 10),
    'colsample_bytree': np.linspace(0.4, 1, 10),
    'reg_alpha': np.linspace(0, 1, 20),
    'reg_lambda': np.linspace(0.5, 3, 20)
}

# 100 random candidates, each scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Estimators expect a 1-D label vector; pull the single column out of the
# one-column label frame instead of passing the frame itself.
y_train_labels = df_train_processed_y['WeightCategory'].to_numpy()

random_search.fit(df_train_processed_x, y_train_labels)

print("Best Parameters:", random_search.best_params_)

# Final model. These hyperparameters are hard-coded rather than read from
# `random_search.best_params_`, so re-running the search above does not change
# the model that is evaluated and used for prediction below.
xgb_model = XGBClassifier(
    n_estimators=582,
    learning_rate=0.035,
    max_depth=9,
    min_child_weight=2,
    gamma=0.5916,
    reg_alpha=0.4493,
    reg_lambda=2.0999,
    subsample=0.4760,
    colsample_bytree=0.55,
    tree_method='hist',
    grow_policy='depthwise',
    random_state=42,
    eval_metric='mlogloss',
    n_jobs=-1
)

xgb_model.fit(df_train_processed_x, y_train_labels)

In [None]:
# Best Hyperparameter
#
# Full-precision record of tuned hyperparameter values, kept commented out for
# reference only; the active XGBClassifier definition hard-codes rounded values.
# NOTE(review): `min_child_samples` and `num_leaves` are LightGBM parameter
# names that XGBoost does not use — confirm whether `min_child_weight` /
# `max_leaves` were intended.

# xgb_model = XGBClassifier(
#     # Core Learning:
#     n_estimators = 582,
#     learning_rate = 0.035,
#     max_depth = 9,
#     min_child_samples = 26,
#     min_child_weight=2,
#     num_leaves = 99,

#     # Regularization:
#     gamma = 0.5915864928857023,
#     reg_alpha = 0.44930608551240997,
#     reg_lambda = 2.099943285494622,

#     # Sampling:
#     subsample = 0.4760427193925438,
#     colsample_bytree = 0.55,

#     # Tree Method:
#     tree_method='hist',
#     grow_policy='depthwise',

#     # Other stable params:
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss',
#     n_jobs=-1
# )

# # Fit and evaluate
# xgb_model.fit(df_train_processed_x, df_train_processed_y)


# **6. Evaluation and Prediction:**

In [None]:
# Predicted integer class codes for the hold-out split, wrapped in a
# one-column frame that parallels the hold-out label frame.
y_test_pred = xgb_model.predict(df_test_processed_x)

df_test_pred_y = pd.DataFrame(y_test_pred, columns=['WeightCategory'])

In [None]:
# Fraction of hold-out rows whose predicted code equals the true code.
accuracy = accuracy_score(y_true=df_test_processed_y, y_pred=df_test_pred_y)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# True and predicted integer class codes for the hold-out split, as 1-D arrays.
y_true_numerical = df_test_processed_y['WeightCategory'].to_numpy()
y_pred_numerical = df_test_pred_y['WeightCategory'].to_numpy()

# Derive the class codes and their display names from the fitted label
# encoder instead of hard-coding them: code i is, by construction, the i-th
# entry of `le.classes_`, so the tick labels can never drift out of sync with
# the encoding. Labels keep the "<code> - <name>" format.
label_codes = np.arange(len(le.classes_))
class_names = [f"{code} - {name}" for code, name in zip(label_codes, le.classes_)]

# Passing `labels=` fixes the matrix to one row/column per known class, in
# encoder order, even when a class is missing from both the true and the
# predicted labels (otherwise the matrix would shrink and no longer match
# `class_names`).
cm = confusion_matrix(y_true_numerical, y_pred_numerical, labels=label_codes)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_names
)

fig, ax = plt.subplots(figsize=(5, 5))
disp.plot(
    cmap=plt.cm.Blues,
    ax=ax,
    values_format='d',            # integer counts
    xticks_rotation='vertical'    # class names are long
)

ax.set_title("Confusion Matrix with Weight Categories")
plt.show()

In [None]:
# Predict integer class codes for the prediction file and decode them back to
# the original string labels with the fitted label encoder.
y_pred = xgb_model.predict(df_pred_processed_x)
y_pred_labels = le.inverse_transform(y_pred)

# Two-column result: the row's `id` alongside its predicted WeightCategory.
df_pred_y = df_pred[['id']].assign(WeightCategory=y_pred_labels)

In [None]:
# First five rows of the prediction table.
df_pred_y.head(n=5)

In [None]:
# Optional export: uncomment the lines below to write the prediction table to
# Output_Data.csv and download it through the Colab files helper.

# df_pred_y.to_csv('Output_Data.csv', index=False)
# from google.colab import files
# files.download('Output_Data.csv')