# Hold-Out Cross-Validation

### Configuration and Imports 

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, GridSearchCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline

import warnings

# Ignore warnings for cleaner output
warnings.filterwarnings('ignore')

### Data Loading

In [33]:
# Load the two CSV files used by this notebook. Both paths are relative, so
# they are resolved against the kernel's current working directory.
# LargeTrain is later split into training and validation sets; SmallTest is
# used only as the held-out test set.
LargeTrain = pd.read_csv('train.csv')  # Train dataset
SmallTest = pd.read_csv('Original_ObesityDataSet.csv')  # Test dataset

### Data Preprocessing

In [34]:
# Drop the identifier column, which is not a feature.
# errors='ignore' keeps this cell re-runnable: LargeTrain is overwritten in
# place, so on a second execution 'id' is already gone and a plain drop()
# would raise KeyError.
LargeTrain = LargeTrain.drop(columns='id', errors='ignore')

# Separate the target column ('NObeyesdad') from the features for both the
# training file and the held-out test file.
y_train = LargeTrain['NObeyesdad']
X_train = LargeTrain.drop(columns='NObeyesdad')
y_test = SmallTest['NObeyesdad']
X_test = SmallTest.drop(columns='NObeyesdad')

# Hold out 20% of the training file as a validation set. The fixed
# random_state makes the split reproducible. X_train / y_train are rebuilt
# from LargeTrain above, so re-running this cell does not shrink them further.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Column groups used by the imputers and the custom transformers below.
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'CALC', 'SCC', 'MTRANS']


### Custom Transformers

In [35]:
# Custom Transformers
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Estimator-style wrapper that delegates to a ``LabelEncoder``.

    Note that ``fit`` takes the labels as its first positional argument
    (``y``) and accepts ``X`` only as an optional, unused second argument.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, y, X=None):
        """Fit the wrapped encoder on the labels ``y`` and return ``self``."""
        encoder = self.label_encoder
        encoder.fit(y)
        return self

    def transform(self, y):
        """Return ``y`` encoded by the wrapped, already fitted encoder."""
        encoded_labels = self.label_encoder.transform(y)
        return encoded_labels

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize the input with a ``StandardScaler``.

    NOTE(review): ``numerical_features`` is stored on the instance but is not
    used by ``fit`` or ``transform``; the scaler is fit on, and applied to,
    every column of ``X``. That behaviour is kept unchanged here because
    restricting it to the numerical columns would alter the fitted models.

    Parameters
    ----------
    numerical_features : list of str
        Names of the numerical columns (currently informational only).
    """

    def __init__(self, numerical_features):
        self.numerical_features = numerical_features
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on all columns of ``X`` and return ``self``."""
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Return the scaled version of ``X`` produced by the fitted scaler."""
        # The previous defensive ``X.copy()`` was dead code: its result was
        # overwritten immediately by the scaler output.
        return self.scaler.transform(X)

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Encode the categorical columns as numbers.

    The input is interpreted as having the columns
    ``numerical_features + categorical_features`` (module-level lists), in
    that order when ``X`` is an array.

    * yes/no columns and ``Gender`` are mapped to 0/1,
    * ``CAEC`` and ``CALC`` are mapped to an ordinal 1-4 scale,
    * ``MTRANS`` is one-hot encoded and the original column is dropped.

    The one-hot encoder is learned in ``fit`` and only applied in
    ``transform``. Validation and test data therefore always get the same
    output columns as the training data, even when a category is absent from
    (or new in) the data being transformed.
    """

    # Columns holding 'yes' / 'no' answers.
    _YES_NO_COLUMNS = ('family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC')
    _YES_NO_MAPPING = {'yes': 1, 'no': 0}
    _GENDER_MAPPING = {'Male': 1, 'Female': 0}
    # Ordinal scale shared by the CAEC and CALC columns.
    _FREQUENCY_MAPPING = {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4}

    def _to_frame(self, X):
        """Return ``X`` as a DataFrame with the expected column names."""
        return pd.DataFrame(X, columns=numerical_features + categorical_features)

    def fit(self, X, y=None):
        """Learn the ``MTRANS`` categories from ``X`` and return ``self``."""
        frame = self._to_frame(X)
        # handle_unknown='ignore': a category unseen during fit is encoded as
        # all zeros at transform time instead of raising.
        self.one_hot_encoder_ = OneHotEncoder(handle_unknown='ignore')
        self.one_hot_encoder_.fit(frame[['MTRANS']])
        return self

    def transform(self, X):
        """Return a DataFrame with every categorical column encoded numerically.

        Values that are not keys of the mapping dictionaries become NaN
        (behaviour of ``Series.map``).
        """
        X_copy = self._to_frame(X)

        # Map categorical features to numerical values
        for column in self._YES_NO_COLUMNS:
            X_copy[column] = X_copy[column].map(self._YES_NO_MAPPING)
        X_copy['Gender'] = X_copy['Gender'].map(self._GENDER_MAPPING)

        X_copy['CAEC'] = X_copy['CAEC'].map(self._FREQUENCY_MAPPING)
        X_copy['CALC'] = X_copy['CALC'].map(self._FREQUENCY_MAPPING)

        # Reuse the encoder fitted in fit(). The encoded frame carries the
        # same index as X_copy so that join() aligns row by row even when X
        # is a DataFrame with a non-default index.
        means_of_trns_encoded = pd.DataFrame(
            self.one_hot_encoder_.transform(X_copy[['MTRANS']]).toarray(),
            columns=self.one_hot_encoder_.get_feature_names_out(['MTRANS']),
            index=X_copy.index)

        transformed_df = X_copy.drop(columns='MTRANS').join(means_of_trns_encoded)

        return transformed_df


In [36]:

# Integer-encode the target labels. The encoded labels are used for the
# XGBoost pipeline; the other pipelines are fit on the original labels.
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only. LabelEncoder.transform rejects
# labels it did not see during fit, so this assumes every class present in
# y_val / y_test also occurs in y_train.
label_encoder.fit(y_train)

# Transform the labels of all three splits with the same fitted encoder
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

### Model Pipelines

In [41]:
def _make_preprocessor():
    """Return a new, unfitted imputation step.

    Numerical columns are imputed with the column mean, categorical columns
    with the most frequent value.
    """
    return ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), numerical_features),
            ('cat', SimpleImputer(strategy='most_frequent'), categorical_features)
        ])


def build_model_pipeline(classifier):
    """Return the shared preprocessing + SMOTE pipeline ending in ``classifier``.

    Every call creates fresh transformer objects, so fitting one pipeline
    never refits a transformer instance that another pipeline also holds.
    Step names match the original hand-written pipelines.
    """
    return imPipeline(steps=[
        ('preprocessor', _make_preprocessor()),
        ('custom_transformer', CustomTransformer()),
        ('custom_scaler', CustomScaler(numerical_features)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ])


# Define preprocessing for numerical and categorical data.
# Kept as a standalone object for any later cell that refers to it; the
# pipelines below each receive their own instance.
preprocessor = _make_preprocessor()

# Define the pipelines with preprocessing and classifier
dt_pipeline = build_model_pipeline(
    DecisionTreeClassifier(max_depth=20, random_state=42))

# The multi_class argument is omitted: with the lbfgs solver a multiclass
# problem already uses the multinomial formulation by default, and the
# explicit argument is deprecated in recent scikit-learn releases.
logreg_pipeline = build_model_pipeline(
    LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42))

svm_pipeline = build_model_pipeline(
    SVC(random_state=42))

rf_pipeline = build_model_pipeline(
    RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42))

xgboost_pipeline = build_model_pipeline(
    XGBClassifier(n_estimators=200, max_depth=10, random_state=42))


### Training and Evaluation

In [42]:
# For each model: fit on the training split, predict the validation split and
# score the predictions. The fit order is the same as before.

# Logistic Regression
logreg_pipeline.fit(X_train, y_train)
y_val_pred_logreg = logreg_pipeline.predict(X_val)
val_accuracy_logreg = accuracy_score(y_val, y_val_pred_logreg)

# SVM
svm_pipeline.fit(X_train, y_train)
y_val_pred_svm = svm_pipeline.predict(X_val)
val_accuracy_svm = accuracy_score(y_val, y_val_pred_svm)

# Random Forest
rf_pipeline.fit(X_train, y_train)
y_val_pred_rf = rf_pipeline.predict(X_val)
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)

# Decision Tree
dt_pipeline.fit(X_train, y_train)
y_val_pred_dt = dt_pipeline.predict(X_val)
val_accuracy_dt = accuracy_score(y_val, y_val_pred_dt)

# XGBoost is fit on the integer-encoded labels, so its predictions are
# scored against the encoded validation labels as well.
xgboost_pipeline.fit(X_train, y_train_encoded)
y_val_pred_xgb = xgboost_pipeline.predict(X_val)
val_accuracy_xgb = accuracy_score(y_val_encoded, y_val_pred_xgb)

# Report the validation accuracies
print(f'Validation Accuracy - Decision Tree: {val_accuracy_dt}')
print(f'Validation Accuracy - Logistic Regression: {val_accuracy_logreg}')
print(f'Validation Accuracy - SVM: {val_accuracy_svm}')
print(f'Validation Accuracy - RandomForest: {val_accuracy_rf}')
print(f'Validation Accuracy - XGBoost: {val_accuracy_xgb}')


Validation Accuracy - Decision Tree: 0.8436897880539499
Validation Accuracy - Logistic Regression: 0.8644026974951831
Validation Accuracy - SVM: 0.8595857418111753
Validation Accuracy - RandomForest: 0.8923410404624278
Validation Accuracy - XGBoost: 0.8966763005780347


In [43]:


# Score every fitted pipeline on the held-out test set, one model at a time.

# Logistic Regression
y_test_pred_logreg = logreg_pipeline.predict(X_test)
test_accuracy_logreg = accuracy_score(y_test, y_test_pred_logreg)

# SVM
y_test_pred_svm = svm_pipeline.predict(X_test)
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)

# Random Forest
y_test_pred_rf = rf_pipeline.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)

# Decision Tree
y_test_pred_dt = dt_pipeline.predict(X_test)
test_accuracy_dt = accuracy_score(y_test, y_test_pred_dt)

# XGBoost predicts integer codes, so compare against the encoded test labels.
y_test_pred_xgb = xgboost_pipeline.predict(X_test)
test_accuracy_xgb = accuracy_score(y_test_encoded, y_test_pred_xgb)

# Report the test accuracies
print(f'Test Accuracy - Decision Tree: {test_accuracy_dt}')
print(f'Test Accuracy - Logistic Regression: {test_accuracy_logreg}')
print(f'Test Accuracy - SVM: {test_accuracy_svm}')
print(f'Test Accuracy - RandomForest: {test_accuracy_rf}')
print(f'Test Accuracy - XGBoost: {test_accuracy_xgb}')




Test Accuracy - Decision Tree: 0.8872572240644244
Test Accuracy - Logistic Regression: 0.9062055897678826
Test Accuracy - SVM: 0.9047844623401232
Test Accuracy - RandomForest: 0.9379441023211748
Test Accuracy - XGBoost: 0.9478919943154903


In [44]:
# Generate confusion matrices on the test set for the models trained on the
# original string labels.
conf_matrix_logreg = confusion_matrix(y_test, y_test_pred_logreg)
conf_matrix_svm = confusion_matrix(y_test, y_test_pred_svm)
conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)

print('Confusion Matrix - Logistic Regression:')
print(conf_matrix_logreg)
print('Confusion Matrix - SVM:')
print(conf_matrix_svm)
print('Confusion Matrix - RandomForest:')
print(conf_matrix_rf)

# Generate classification reports.
# The XGBoost report is computed from the integer-encoded labels, so its rows
# are labelled with the encoded class codes rather than the class names.
class_report_logreg = classification_report(y_test, y_test_pred_logreg)
class_report_svm = classification_report(y_test, y_test_pred_svm)
class_report_rf = classification_report(y_test, y_test_pred_rf)
class_report_dt = classification_report(y_test, y_test_pred_dt)
class_report_xgb = classification_report(y_test_encoded, y_test_pred_xgb)

print('Classification Report - Logistic Regression:')
print(class_report_logreg)
print('Classification Report - SVM:')
print(class_report_svm)
print('Classification Report - RandomForest:')
print(class_report_rf)

print('Classification Report - Decision Tree:')
print(class_report_dt)

# Fixed: this previously printed the RandomForest report under the XGBoost
# heading.
print('Classification Report - XGBoost:')
print(class_report_xgb)

Confusion Matrix - Logistic Regression:
[[270   2   0   0   0   0   0]
 [ 27 215   0   0   0  43   2]
 [  0   0 322  18   0   0  11]
 [  0   0   4 293   0   0   0]
 [  0   0   1   1 322   0   0]
 [  0   4   0   0   0 251  35]
 [  0   0  27   0   0  23 240]]
Confusion Matrix - SVM:
[[261  11   0   0   0   0   0]
 [ 17 220   1   0   0  38  11]
 [  0   3 319  18   0   3   8]
 [  0   1   2 291   0   3   0]
 [  0   1   1   1 321   0   0]
 [  0  16   2   0   0 242  30]
 [  0   5   9   4   0  16 256]]
Confusion Matrix - RandomForest:
[[264   8   0   0   0   0   0]
 [  6 253   0   0   0  23   5]
 [  0   0 329  14   0   2   6]
 [  0   1   1 294   1   0   0]
 [  0   0   1   1 322   0   0]
 [  0  12   2   0   0 260  16]
 [  0   4   8   3   0  17 258]]
Classification Report - Logistic Regression:
                     precision    recall  f1-score   support

Insufficient_Weight       0.91      0.99      0.95       272
      Normal_Weight       0.97      0.75      0.85       287
     Obesity_Type_I 