# Approach 3 

## Using large dataset for training and small dataset for testing and doing train-test split

### Configuration and Imports 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

### Data Loading

In [2]:
LargeTrain = pd.read_csv('train.csv')  # Original dataset
SmallTest = pd.read_csv('Original_ObesityDataSet.csv')  # Test dataset

### Data Preprocessing

In [3]:
# Drop unnecessary columns
LargeTrain = LargeTrain.drop(columns='id')

# Extract features and target
y_train = LargeTrain['NObeyesdad']
X_train = LargeTrain.drop(columns='NObeyesdad')
y_test = SmallTest['NObeyesdad']
X_test = SmallTest.drop(columns='NObeyesdad')

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define numerical and categorical features
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'CALC', 'SCC', 'MTRANS']


### Custom Transformers

In [4]:
# Custom Transformers
class CustomLabelEncoder(BaseEstimator, TransformerMixin): #Extending BaseEstimator and TransformerMixin classes from sklearn.base
    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, y, X=None):
        self.label_encoder.fit(y)
        return self

    def transform(self, y):
        return self.label_encoder.transform(y)

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_features):
        self.numerical_features = numerical_features
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy = self.scaler.transform(X)
        return X_copy

class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = pd.DataFrame(X, columns=numerical_features + categorical_features)

        # Map categorical features to numerical values
        X_copy['family_history_with_overweight'] = X_copy['family_history_with_overweight'].map({'yes': 1, 'no': 0})
        X_copy['FAVC'] = X_copy['FAVC'].map({'yes': 1, 'no': 0})
        X_copy['SMOKE'] = X_copy['SMOKE'].map({'yes': 1, 'no': 0})
        X_copy['SCC'] = X_copy['SCC'].map({'yes': 1, 'no': 0})
        X_copy['Gender'] = X_copy['Gender'].map({'Male': 1, 'Female': 0})

        custom_mapping = {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4}
        X_copy['CAEC'] = X_copy['CAEC'].map(custom_mapping)
        X_copy['CALC'] = X_copy['CALC'].map(custom_mapping)

        one_hot_encoder = OneHotEncoder()
        means_of_trns_encoded = pd.DataFrame(
            one_hot_encoder.fit_transform(X_copy[['MTRANS']]).toarray(),
            columns=one_hot_encoder.get_feature_names_out(['MTRANS'])) 
        
        transformed_df = X_copy.join(means_of_trns_encoded) 
        transformed_df = transformed_df.drop(["MTRANS"], axis=1)
        
        return transformed_df


### Model Pipelines

In [5]:
# Define preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_features)
    ])

# Define the pipelines with preprocessing and classifier
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('custom_transformer', CustomTransformer()),
    ('custom_scaler', CustomScaler(numerical_features)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('custom_transformer', CustomTransformer()),
    ('custom_scaler', CustomScaler(numerical_features)),
    ('classifier', SVC(random_state=42))
])

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('custom_transformer', CustomTransformer()),
    ('custom_scaler', CustomScaler(numerical_features)),
    ('classifier', RandomForestClassifier(random_state=42))
])

### Training and Evaluation

In [6]:
# Fit the pipelines on training data
logreg_pipeline.fit(X_train, y_train)
svm_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)

# Predict on validation data
y_val_pred_logreg = logreg_pipeline.predict(X_val)
y_val_pred_svm = svm_pipeline.predict(X_val)
y_val_pred_rf = rf_pipeline.predict(X_val)

# Calculate validation accuracies
val_accuracy_logreg = accuracy_score(y_val, y_val_pred_logreg)
val_accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)

print(f'Validation Accuracy - Logistic Regression: {val_accuracy_logreg}')
print(f'Validation Accuracy - SVM: {val_accuracy_svm}')
print(f'Validation Accuracy - RandomForest: {val_accuracy_rf}')

# Predict on test data
y_test_pred_logreg = logreg_pipeline.predict(X_test)
y_test_pred_svm = svm_pipeline.predict(X_test)
y_test_pred_rf = rf_pipeline.predict(X_test)

# Evaluate performance on test data
test_accuracy_logreg = accuracy_score(y_test, y_test_pred_logreg)
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)

print(f'Test Accuracy - Logistic Regression: {test_accuracy_logreg}')
print(f'Test Accuracy - SVM: {test_accuracy_svm}')
print(f'Test Accuracy - RandomForest: {test_accuracy_rf}')

# Generate confusion matrices
conf_matrix_logreg = confusion_matrix(y_test, y_test_pred_logreg)
conf_matrix_svm = confusion_matrix(y_test, y_test_pred_svm)
conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)

print('Confusion Matrix - Logistic Regression:')
print(conf_matrix_logreg)
print('Confusion Matrix - SVM:')
print(conf_matrix_svm)
print('Confusion Matrix - RandomForest:')
print(conf_matrix_rf)

# Generate classification reports
class_report_logreg = classification_report(y_test, y_test_pred_logreg)
class_report_svm = classification_report(y_test, y_test_pred_svm)
class_report_rf = classification_report(y_test, y_test_pred_rf)

print('Classification Report - Logistic Regression:')
print(class_report_logreg)
print('Classification Report - SVM:')
print(class_report_svm)
print('Classification Report - RandomForest:')
print(class_report_rf)


Validation Accuracy - Logistic Regression: 0.8622350674373795
Validation Accuracy - SVM: 0.8562138728323699
Validation Accuracy - RandomForest: 0.8942678227360308
Test Accuracy - Logistic Regression: 0.9024159166271909
Test Accuracy - SVM: 0.8995736617716722
Test Accuracy - RandomForest: 0.9341544291804832
Confusion Matrix - Logistic Regression:
[[270   2   0   0   0   0   0]
 [ 25 229   0   0   0  30   3]
 [  0   0 318  22   0   0  11]
 [  0   0   3 294   0   0   0]
 [  0   0   1   1 322   0   0]
 [  0   8   0   0   0 241  41]
 [  0   0  33   4   0  22 231]]
Confusion Matrix - SVM:
[[261  11   0   0   0   0   0]
 [ 15 223   1   0   0  33  15]
 [  0   4 315  22   0   5   5]
 [  0   1   2 291   0   3   0]
 [  0   1   1   1 321   0   0]
 [  0  20   2   0   0 231  37]
 [  0   7   9   4   0  13 257]]
Confusion Matrix - RandomForest:
[[264   8   0   0   0   0   0]
 [  6 255   0   0   0  23   3]
 [  0   0 330  14   0   1   6]
 [  0   1   2 294   0   0   0]
 [  0   0   2   0 322   0   0]
 [  

In [7]:
# Function to calculate TP, FP, TN, FN for each class
def calculate_class_metrics(y_test, y_pred, classes):
    TP = {cls: np.sum((y_test == cls) & (y_pred == cls)) for cls in classes}
    FP = {cls: np.sum((y_test != cls) & (y_pred == cls)) for cls in classes}
    TN = {cls: np.sum((y_test != cls) & (y_pred != cls)) for cls in classes}
    FN = {cls: np.sum((y_test == cls) & (y_pred != cls)) for cls in classes}

    return TP, FP, TN, FN

# Get unique classes
classes = np.unique(y_test)

# Calculate metrics for each model
print("Logistic Regression:")
TP_logreg, FP_logreg, TN_logreg, FN_logreg = calculate_class_metrics(y_test, y_test_pred_logreg, classes)
print("TP:", TP_logreg, "FP:", FP_logreg, "TN:", TN_logreg, "FN:", FN_logreg)

print("\nSupport Vector Machine:")
TP_svm, FP_svm, TN_svm, FN_svm = calculate_class_metrics(y_test, y_test_pred_svm, classes)
print("TP:", TP_svm, "FP:", FP_svm, "TN:", TN_svm, "FN:", FN_svm)

print("\nRandom Forest:")
TP_rf, FP_rf, TN_rf, FN_rf = calculate_class_metrics(y_test, y_test_pred_rf, classes)
print("TP:", TP_rf, "FP:", FP_rf, "TN:", TN_rf, "FN:", FN_rf)


Logistic Regression:
TP: {'Insufficient_Weight': 270, 'Normal_Weight': 229, 'Obesity_Type_I': 318, 'Obesity_Type_II': 294, 'Obesity_Type_III': 322, 'Overweight_Level_I': 241, 'Overweight_Level_II': 231} FP: {'Insufficient_Weight': 25, 'Normal_Weight': 10, 'Obesity_Type_I': 37, 'Obesity_Type_II': 27, 'Obesity_Type_III': 0, 'Overweight_Level_I': 52, 'Overweight_Level_II': 55} TN: {'Insufficient_Weight': 1814, 'Normal_Weight': 1814, 'Obesity_Type_I': 1723, 'Obesity_Type_II': 1787, 'Obesity_Type_III': 1787, 'Overweight_Level_I': 1769, 'Overweight_Level_II': 1766} FN: {'Insufficient_Weight': 2, 'Normal_Weight': 58, 'Obesity_Type_I': 33, 'Obesity_Type_II': 3, 'Obesity_Type_III': 2, 'Overweight_Level_I': 49, 'Overweight_Level_II': 59}

Support Vector Machine:
TP: {'Insufficient_Weight': 261, 'Normal_Weight': 223, 'Obesity_Type_I': 315, 'Obesity_Type_II': 291, 'Obesity_Type_III': 321, 'Overweight_Level_I': 231, 'Overweight_Level_II': 257} FP: {'Insufficient_Weight': 15, 'Normal_Weight': 44, 'O