In [None]:
# Section 1 — Imports & setup
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
import joblib

# One seed for the whole notebook. Seeding NumPy's legacy global RNG here makes
# the np.random.* draws in later cells repeatable on Restart & Run All.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

print("Imports complete")

In [None]:
# Section 2 — Load and inspect data
csv_path = 'ObesityDataset.csv'
if not os.path.exists(csv_path):
    # Dataset missing: fabricate a small random frame so the rest of the
    # notebook can still be executed end to end.
    print('ObesityDataset.csv not found — creating a small synthetic sample')
    n_synthetic_rows = 100
    # Each call below advances NumPy's global RNG, so the order of the entries
    # determines the generated values; keep it stable.
    synthetic_columns = {
        'Gender': np.random.choice(['Male', 'Female'], n_synthetic_rows),
        'Age': np.random.randint(18, 70, n_synthetic_rows),
        'family_history_with_overweight': np.random.choice(['yes', 'no'], n_synthetic_rows),
        'FAVC': np.random.choice(['yes', 'no'], n_synthetic_rows),
        'FCVC': np.random.randint(1, 5, n_synthetic_rows),
        'NObeyesdad': np.random.choice(
            ['Underweight', 'Normal', 'Overweight', 'Obesity I', 'Obesity II', 'Obesity III'],
            n_synthetic_rows,
        ),
    }
    df = pd.DataFrame(synthetic_columns)
else:
    df = pd.read_csv(csv_path)
    print('Loaded ObesityDataset.csv')

# Quick structural overview, then a rich preview as the cell's output.
print('Shape:', df.shape)
print(df.dtypes)
df.head()

In [None]:
# Section 3 — Transformers and Pipeline helper classes
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    Parameters
    ----------
    columns : column label or list of column labels
        Passed unchanged to ``X[...]`` inside ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step chains inside a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.columns``."""
        subset = X[self.columns]
        return subset

class DropUnused(BaseEstimator, TransformerMixin):
    """Drop columns not needed for modeling (placeholder).

    Parameters
    ----------
    cols_to_drop : list of column labels or None, default=None
        Columns removed by ``transform``. ``None`` means "drop nothing".
        Labels that are absent from the input are silently ignored.
    """

    def __init__(self, cols_to_drop=None):
        # Store the argument untouched. scikit-learn's ``clone`` rebuilds the
        # estimator from ``get_params()`` and checks that every stored
        # parameter is the very object that was passed to ``__init__``.
        # Replacing None / [] with a fresh list here (the former
        # ``cols_to_drop or []``) made ``clone(DropUnused())`` raise
        # RuntimeError, which breaks any meta-estimator or CV helper that
        # clones its estimator.
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        # Resolve the None default at use time instead of in ``__init__``.
        to_drop = [] if self.cols_to_drop is None else self.cols_to_drop
        return X.drop(columns=to_drop, errors='ignore')


class PipelineBuilder:
    """Helper to construct scikit-learn Pipelines with consistent preprocessing.

    Parameters
    ----------
    numeric_cols : list of column labels
        Columns that are standardized with ``StandardScaler``.
    categorical_cols : list of column labels
        Columns that are one-hot encoded (dense output, unknown categories
        ignored at transform time).
    random_state : int, default=42
        Stored on the instance. None of the methods in this class read it;
        estimators passed to ``build_pipeline`` carry their own seed.
    """

    def __init__(self, numeric_cols, categorical_cols, random_state=42):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols
        self.random_state = random_state

    @staticmethod
    def _make_dense_one_hot_encoder():
        """Return a dense-output ``OneHotEncoder`` that ignores unseen categories.

        scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
        the old keyword, so the new spelling is tried first and the legacy one
        is used only when the installed release rejects it.
        """
        from sklearn.preprocessing import OneHotEncoder
        try:
            return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # scikit-learn < 1.2 only knows the original keyword.
            return OneHotEncoder(handle_unknown='ignore', sparse=False)

    def build_preprocessor(self):
        """Build the ``ColumnTransformer`` that scales numerics and encodes categoricals.

        Returns
        -------
        ColumnTransformer
            Unfitted transformer with a ``'num'`` and a ``'cat'`` branch.
        """
        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler

        # NOTE(review): ColumnTransformer already passes each branch only its
        # own columns, so the 'selector' steps re-select the same labels. They
        # are kept so existing step names stay stable; they require DataFrame
        # input because ColumnSelector indexes by label.
        numeric_transformer = Pipeline(steps=[
            ('selector', ColumnSelector(self.numeric_cols)),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('selector', ColumnSelector(self.categorical_cols)),
            ('ohe', self._make_dense_one_hot_encoder())
        ])
        preprocessor = ColumnTransformer(transformers=[
            ('num', numeric_transformer, self.numeric_cols),
            ('cat', categorical_transformer, self.categorical_cols)
        ])
        return preprocessor

    def build_pipeline(self, estimator):
        """Wrap ``estimator`` behind a freshly built preprocessor.

        Parameters
        ----------
        estimator : scikit-learn estimator
            Used as the final ``'clf'`` step as-is (not cloned here).

        Returns
        -------
        Pipeline
            Unfitted pipeline with steps ``'preproc'`` and ``'clf'``.
        """
        from sklearn.pipeline import Pipeline
        preproc = self.build_preprocessor()
        pipe = Pipeline(steps=[('preproc', preproc), ('clf', estimator)])
        return pipe

In [None]:
# Section 4 — Target encoding, column selection, and train/test split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os

# Encode target: string class labels -> integer codes.
label_encoder_target = LabelEncoder()
df['NObeyesdad_encoded'] = label_encoder_target.fit_transform(df['NObeyesdad'])
# Class label -> integer code, kept so predictions can be mapped back to labels.
target_mapping = dict(zip(label_encoder_target.classes_, label_encoder_target.transform(label_encoder_target.classes_)))

# Features / target: drop both the raw and the encoded target from the features.
X = df.drop(columns=['NObeyesdad', 'NObeyesdad_encoded']).copy()
y = df['NObeyesdad_encoded']

# Column lists, derived from dtypes so they follow whatever data was loaded.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print('Numeric cols:', numeric_cols)
print('Categorical cols:', categorical_cols)

# Stratified split (80/20). The seed comes from the notebook-wide RANDOM_STATE
# constant instead of a second hard-coded 42, so changing the seed in the
# setup cell changes the split as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print('Split sizes — train:', X_train.shape, 'test:', X_test.shape)

In [None]:
# Section 5 — Build & train pipelines; save artifacts
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import joblib

os.makedirs('pipelines', exist_ok=True)

# Candidate estimators. Every stochastic model takes its seed from the
# notebook-wide RANDOM_STATE constant rather than a repeated literal 42, so the
# whole experiment is re-seeded from one place.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Support Vector Machine': SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB()
}

results = []
builder = PipelineBuilder(numeric_cols=numeric_cols, categorical_cols=categorical_cols, random_state=RANDOM_STATE)

# Fit one pipeline per estimator and persist it together with the target
# encoder and label mapping needed to decode predictions later.
# NOTE(review): only training accuracy is recorded and used for ranking below;
# X_test / y_test are not touched in this cell, so the summary says nothing
# about generalization.
for name, estimator in models.items():
    print(f"Training pipeline: {name} ...")
    pipe = builder.build_pipeline(estimator)
    pipe.fit(X_train, y_train)
    train_acc = pipe.score(X_train, y_train)
    results.append({'Model': name, 'Train_Accuracy': train_acc})
    # File name is the model name with spaces replaced by underscores.
    fname = f"pipelines/{name.replace(' ', '_')}.joblib"
    joblib.dump({'pipeline': pipe, 'target_encoder': label_encoder_target, 'target_mapping': target_mapping}, fname)
    print(f"Saved pipeline to {fname} (train_acc={train_acc:.4f})")

# Build the summary frame once from the collected records, best first.
results_df = pd.DataFrame(results).sort_values('Train_Accuracy', ascending=False)
results_df.to_csv('pipelines/results_summary.csv', index=False)
print("Saved results summary to pipelines/results_summary.csv")
print(results_df)