# Task 2: Model Training
This notebook implements data preprocessing, feature engineering, model training, and evaluation for obesity classification, and saves the trained models and deployment artifacts.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library diagnostics (presumably including
# scikit-learn convergence/deprecation warnings) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG for reproducibility. The train/test split and the
# estimators that accept it later in this notebook also pass random_state=42 explicitly.
np.random.seed(42)

## 1. Load and Prepare Data

In [None]:
# Read the dataset from the CSV next to the notebook. `df` is kept as loaded;
# preprocessing later works on a copy of it.
df = pd.read_csv('ObesityDataset.csv')

# Sanity report: dimensions, the distinct target labels, and the row count per label.
print("Dataset loaded successfully!", f"Shape: {df.shape}", sep="\n")
print(f"\nTarget classes: {df['NObeyesdad'].unique()}")
print(f"\nClass distribution:", df['NObeyesdad'].value_counts(), sep="\n")

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Preprocess a copy so the raw frame `df` stays exactly as loaded.
df_processed = df.copy()

# Turn the class names in 'NObeyesdad' into integer ids, stored in an extra column.
label_encoder_target = LabelEncoder()
df_processed['NObeyesdad_encoded'] = label_encoder_target.fit_transform(df_processed['NObeyesdad'])

# Keep the class-name -> id lookup and list it ordered by id.
class_names = label_encoder_target.classes_
target_mapping = dict(zip(class_names, label_encoder_target.transform(class_names)))
print("Target variable encoding:")
for class_name, class_id in sorted(target_mapping.items(), key=lambda item: item[1]):
    print(f"  {class_id}: {class_name}")

# Features are every column except the two target columns; the target is the encoded id.
y = df_processed['NObeyesdad_encoded']
X = df_processed.drop(columns=['NObeyesdad', 'NObeyesdad_encoded'])

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

### 2.1 Encode Categorical Features

In [None]:
# Columns stored with object dtype are treated as categorical.
categorical_cols = list(X.select_dtypes(include=['object']).columns)
print(f"Categorical columns: {categorical_cols}")

# Fit one LabelEncoder per categorical column and keep it (keyed by column name)
# so the same category -> integer mapping can be reapplied later.
X_encoded = X.copy()
label_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder().fit(X[col])
    X_encoded[col] = encoder.transform(X[col])
    label_encoders[col] = encoder

    # A fitted LabelEncoder numbers its classes_ by position, so enumerate() yields the codes.
    print(f"\n{col} encoding:")
    for code, category in enumerate(encoder.classes_):
        print(f"  {code}: {category}")

print(f"\nEncoded feature matrix shape: {X_encoded.shape}")
print("\nFirst few rows after encoding:")
print(X_encoded.head())

## 3. Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

# Report how many samples landed in each part ...
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# ... and the per-class counts, ordered by class id.
train_class_counts = y_train.value_counts().sort_index()
test_class_counts = y_test.value_counts().sort_index()
print(f"\nTraining set class distribution:")
print(train_class_counts)
print(f"\nTesting set class distribution:")
print(test_class_counts)

## 4. Feature Scaling

In [None]:
# Standardize the features (important for distance-based algorithms).
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")
print(f"\nScaled training set shape: {X_train_scaled.shape}")
print(f"Scaled testing set shape: {X_test_scaled.shape}")

# Per-feature statistics of the scaled training data as a quick sanity check.
print(f"\nFeature means after scaling (should be close to 0):")
print(X_train_scaled.mean(axis=0))
print(f"\nFeature stds after scaling (should be close to 1):")
print(X_train_scaled.std(axis=0))

## 5. Train Multiple Classification Models

In [None]:
# Candidate classifiers, keyed by the display name used throughout the notebook.
# Every estimator that accepts a seed gets random_state=42.
model_specs = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Support Vector Machine', SVC(kernel='rbf', random_state=42)),
    ('Naive Bayes', GaussianNB()),
]
models = dict(model_specs)

print(f"Training {len(models)} models...")
print("=" * 80)

In [None]:
# Fit every candidate and record its accuracy on the data it was trained on.
trained_models = {}
training_scores = {}

# These models are fitted on the standardized features; all others use the raw encoded features.
fit_on_scaled = {'Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine', 'Naive Bayes'}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Pick the feature matrix once so fitting and scoring always see the same data.
    X_fit = X_train_scaled if name in fit_on_scaled else X_train
    train_score = model.fit(X_fit, y_train).score(X_fit, y_train)

    trained_models[name] = model
    training_scores[name] = train_score

    print(f"  Training accuracy: {train_score:.4f}")

print("\n" + "=" * 80)
print("All models trained successfully!")

## 6. Training Accuracy Comparison

In [None]:
# One row per model, best training accuracy first.
training_results = pd.DataFrame(
    list(training_scores.items()),
    columns=['Model', 'Training Accuracy'],
).sort_values('Training Accuracy', ascending=False)

divider = "=" * 60
print("\nTraining Accuracy Summary:")
print(divider)
print(training_results.to_string(index=False))
print(divider)

# Horizontal bar chart of the same numbers, each bar annotated with its value.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(training_results['Model'], training_results['Training Accuracy'], color='skyblue')
ax.set_xlabel('Training Accuracy')
ax.set_title('Model Training Accuracy Comparison')
ax.set_xlim([0, 1])
for position, accuracy in enumerate(training_results['Training Accuracy']):
    # Bars are drawn at y = 0, 1, 2, ... in row order, so the row position is the y coordinate.
    ax.text(accuracy + 0.01, position, f'{accuracy:.4f}', va='center')
plt.tight_layout()
plt.show()

## 7. Save Models and Preprocessing Objects

In [None]:
# Persist everything Task 3 needs: the fitted models, the preprocessing objects,
# and the exact train/test split used here.
import pickle

# Encoders, scaler and lookups needed to preprocess new data the same way.
preprocessing_objects = dict(
    label_encoders=label_encoders,
    label_encoder_target=label_encoder_target,
    scaler=scaler,
    target_mapping=target_mapping,
    feature_names=X_encoded.columns.tolist(),
)

# Raw and scaled versions of both splits, plus their targets.
split_data = dict(
    X_train=X_train,
    X_test=X_test,
    X_train_scaled=X_train_scaled,
    X_test_scaled=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

# Write each payload to its own pickle file.
pickle_targets = [
    ('trained_models.pkl', trained_models),
    ('preprocessing_objects.pkl', preprocessing_objects),
    ('split_data.pkl', split_data),
]
for pickle_path, payload in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

print("✓ Models saved to 'trained_models.pkl'")
print("✓ Preprocessing objects saved to 'preprocessing_objects.pkl'")
print("✓ Train/test split saved to 'split_data.pkl'")
print("\nAll data ready for evaluation in Task 3!")

## 8. Evaluation

This section computes the required evaluation metrics (Accuracy, Confusion Matrix, Macro Precision, Macro Recall, Macro F1-score, Macro ROC–AUC), saves per-model predictions/probabilities, writes confusion matrix plots, exports `pipelines/results_summary_eval.csv`, and prepares deployment artifacts (`pipelines/best_model.joblib`, `pipelines/feature_order.json`, `pipelines/target_mapping.json`, and `pipelines/task3.py`).


In [None]:
# Section 8 — Evaluation (compute required metrics, save predictions/probas, and prepare deployment artifacts)
#
# This cell can run right after training (using the in-memory objects) or on a fresh
# kernel (falling back to the pickles written in Section 7). To make the fresh-kernel
# path actually work it carries all of its own imports, including pandas/numpy.
#
# Outputs written to pipelines/:
#   <Model>_predictions.csv, <Model>_probas.csv, <Model>_confusion_matrix.png (per model)
#   results_summary_eval.csv   one row of metrics per model, best macro-F1 first
#   best_model.joblib          the model with the highest macro-F1
#   preprocessing_objects.pkl  copy of the encoders/scaler (when available)
#   feature_order.json         column order expected by the model
#   target_mapping.json        encoded class id (as string) -> class name
#   model_meta.json            best model name and whether it expects scaled features
#   task3.py                   Gradio app template that consumes the files above
import json
import os
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.preprocessing import label_binarize

os.makedirs('pipelines', exist_ok=True)

# ---------------------------------------------------------------------------
# Inputs: prefer in-memory objects, otherwise fall back to the saved pickles.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this notebook.
# ---------------------------------------------------------------------------
try:
    models_dict = trained_models
    print('Using in-memory trained models')
except NameError:
    with open('trained_models.pkl', 'rb') as f:
        models_dict = pickle.load(f)
    print('Loaded models from trained_models.pkl')

if not models_dict:
    raise ValueError('No trained models found to evaluate')

try:
    X_test
    y_test
    X_test_scaled
    print('Using in-memory split data')
except NameError:
    with open('split_data.pkl', 'rb') as f:
        split_data = pickle.load(f)
    X_test = split_data.get('X_test')
    y_test = split_data.get('y_test')
    X_test_scaled = split_data.get('X_test_scaled')
    print('Loaded split_data.pkl')

# Preprocessing objects are optional: they provide class names and are copied for deployment.
preproc_obj = None
if os.path.exists('preprocessing_objects.pkl'):
    with open('preprocessing_objects.pkl', 'rb') as f:
        preproc_obj = pickle.load(f)
    print('Loaded preprocessing_objects.pkl')
target_encoder = preproc_obj.get('label_encoder_target') if preproc_obj else None

# Plain array of the encoded true labels (works whether y_test is a Series or an array).
y_true = np.asarray(y_test)

# ---------------------------------------------------------------------------
# Class ids (as seen by the models) and human-readable names (for plots).
# ---------------------------------------------------------------------------
first_model = next(iter(models_dict.values()))
if hasattr(first_model, 'classes_'):
    clf_classes = first_model.classes_
else:
    # fallback: infer the encoded ids from the test labels so they stay comparable with y_true
    clf_classes = np.unique(y_true)

n_classes = len(clf_classes)
print('Detected classes:', clf_classes)

class_names = [str(c) for c in clf_classes]
if target_encoder is not None:
    try:
        class_names = [str(c) for c in target_encoder.inverse_transform(np.asarray(clf_classes))]
    except (ValueError, TypeError):
        # The encoder does not know these ids; keep the raw ids as axis labels.
        pass

# Define which models were trained on scaled data in this notebook (mirror training logic)
scaled_models = ['Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine', 'Naive Bayes']

results = []

for name, model in models_dict.items():
    print(f"Evaluating {name} ...")

    # Select appropriate test input (scaled vs unscaled) according to training logic
    if name in scaled_models and X_test_scaled is not None:
        X_eval = X_test_scaled
    else:
        if name in scaled_models:
            print(f"  Warning: no scaled test set available for {name}; evaluating on unscaled features")
        X_eval = X_test

    # Predictions (best effort: a model that cannot predict is reported and skipped)
    try:
        y_pred = model.predict(X_eval)
    except Exception as e:
        print(f"Failed to predict with {name}: {e}")
        continue

    # Probability columns / decision scores follow the model's own class order.
    model_classes = getattr(model, 'classes_', clf_classes)

    # Probabilities, or decision scores for models without predict_proba
    y_proba = None
    y_score = None
    if hasattr(model, 'predict_proba'):
        try:
            y_proba = model.predict_proba(X_eval)
        except Exception as e:
            print(f"  predict_proba failed for {name}: {e}")
    if y_proba is None and hasattr(model, 'decision_function'):
        try:
            y_score = model.decision_function(X_eval)
        except Exception as e:
            print(f"  decision_function failed for {name}: {e}")

    slug = name.replace(' ', '_')

    # Save predictions
    preds_path = f"pipelines/{slug}_predictions.csv"
    pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}).to_csv(preds_path, index=False)

    # Save probabilities if available
    proba_path = None
    if y_proba is not None:
        proba_path = f"pipelines/{slug}_probas.csv"
        pd.DataFrame(y_proba, columns=[str(c) for c in model_classes]).to_csv(proba_path, index=False)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # ROC-AUC (macro, one-vs-rest) from probabilities if available, else decision scores.
    # The labels are binarized against the model's own classes so columns line up with the scores.
    # NaN (not None) marks "unavailable" so the summary column stays numeric and sortable.
    auc_macro = np.nan
    scores = y_proba if y_proba is not None else y_score
    if scores is not None:
        try:
            y_true_bin = label_binarize(y_true, classes=model_classes)
            auc_macro = roc_auc_score(y_true_bin, scores, average='macro', multi_class='ovr')
        except Exception as e:
            print(f"  ROC-AUC unavailable for {name}: {e}")

    # Confusion matrix and figure; labels= pins rows/columns to the same order as the tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=clf_classes)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix — {name}')
    cm_path = f"pipelines/{slug}_confusion_matrix.png"
    fig.tight_layout()
    fig.savefig(cm_path)
    plt.close(fig)  # figures are only saved, not displayed; close to free memory

    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision_macro': prec,
        'Recall_macro': rec,
        'F1_macro': f1,
        'ROC_AUC_macro': auc_macro,
        'Predictions_CSV': preds_path,
        'Probas_CSV': proba_path,
        'Confusion_Matrix_PNG': cm_path
    })
    print(f"  Saved predictions to {preds_path}")
    if proba_path:
        print(f"  Saved probabilities to {proba_path}")

if not results:
    raise RuntimeError('No model could be evaluated; see the prediction errors above')

# Summary CSV
results_df = pd.DataFrame(results).sort_values(['F1_macro', 'ROC_AUC_macro'], ascending=False, na_position='last')
results_df.to_csv('pipelines/results_summary_eval.csv', index=False)
print('\nSaved evaluation summary to pipelines/results_summary_eval.csv')
print(results_df)

# Save the best model for deployment (choose by F1_macro primary)
best_name = results_df.iloc[0]['Model']
best_model = models_dict[best_name]

# Save best model and copy preprocessing objects
joblib.dump(best_model, 'pipelines/best_model.joblib')
if preproc_obj is not None:
    with open('pipelines/preprocessing_objects.pkl', 'wb') as f:
        pickle.dump(preproc_obj, f)

# Save feature order for deployment
feature_order = list(X_test.columns)
with open('pipelines/feature_order.json', 'w') as f:
    json.dump(feature_order, f)

# Record whether the deployed model expects standardized inputs, so the app applies the
# scaler only when the model was actually trained on scaled features.
model_meta = {'model_name': best_name, 'uses_scaled_features': best_name in scaled_models}
with open('pipelines/model_meta.json', 'w') as f:
    json.dump(model_meta, f)

# Target mapping for deployment: encoded id (as string) -> class name.
# Stored under its own name so the notebook-level `target_mapping` (class name -> id)
# created during preprocessing is not overwritten.
if target_encoder is not None:
    deploy_target_mapping = {str(i): str(c) for i, c in enumerate(target_encoder.classes_)}
else:
    deploy_target_mapping = {str(i): str(c) for i, c in enumerate(np.unique(y_true).tolist())}

with open('pipelines/target_mapping.json', 'w') as f:
    json.dump(deploy_target_mapping, f)

print(f"Saved best model: pipelines/best_model.joblib (model: {best_name})")
print('Saved feature_order.json and target_mapping.json for deployment')
print('Saved model_meta.json (records whether the best model expects scaled features)')

# Write a minimal Gradio app template (task3.py) into pipelines/
app_template = r"""
import json
import pickle
from pathlib import Path

import gradio as gr
import joblib
import pandas as pd

# Resolve artifacts next to this script so the app can be launched from any working directory
ARTIFACT_DIR = Path(__file__).resolve().parent

# Load artifacts
model = joblib.load(ARTIFACT_DIR / 'best_model.joblib')
with open(ARTIFACT_DIR / 'feature_order.json') as f:
    feature_order = json.load(f)
with open(ARTIFACT_DIR / 'target_mapping.json') as f:
    target_mapping = json.load(f)
with open(ARTIFACT_DIR / 'model_meta.json') as f:
    model_meta = json.load(f)

# If preprocessing_objects.pkl exists, load it to apply the same preprocessing used in training.
# NOTE: only load pickles produced by the training notebook (pickle can execute arbitrary code).
try:
    with open(ARTIFACT_DIR / 'preprocessing_objects.pkl', 'rb') as f:
        preproc = pickle.load(f)
except FileNotFoundError:
    preproc = None

label_encoders = (preproc or {}).get('label_encoders') or {}
scaler = (preproc or {}).get('scaler')
uses_scaled_features = bool(model_meta.get('uses_scaled_features'))

if uses_scaled_features and scaler is None:
    raise RuntimeError('The deployed model expects scaled features but no scaler was found '
                       'in preprocessing_objects.pkl')


def preprocess_input(row: pd.DataFrame):
    '''Apply the same transformations used during training.

    Categorical columns are label-encoded, the remaining columns are converted to numbers,
    and the scaler is applied only when the deployed model was trained on scaled features.
    Raises ValueError when a field is empty or a value cannot be encoded/converted.
    '''
    df = row[feature_order].copy()

    first = df.iloc[0]
    missing = [col for col in feature_order if pd.isna(first[col]) or first[col] == '']
    if missing:
        raise ValueError('Please provide a value for: ' + ', '.join(missing))

    for col, le in label_encoders.items():
        if col in df:
            df[col] = le.transform(df[col].astype(str))

    numeric_cols = [col for col in feature_order if col not in label_encoders]
    if numeric_cols:
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

    if uses_scaled_features:
        return scaler.transform(df)
    return df


def predict_fn(*args):
    # Build single-row DataFrame in the column order the model was trained with
    row = pd.DataFrame([dict(zip(feature_order, args))])
    features = preprocess_input(row)
    pred = model.predict(features)[0]
    # Translate the encoded class id back to its name
    label = target_mapping.get(str(pred), str(pred))
    prob = None
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(features)[0]
        prob = {target_mapping.get(str(c), str(c)): float(p) for c, p in zip(model.classes_, proba)}
    return label, prob


# Categorical features become dropdowns of the categories seen in training; the rest are numbers.
# Without saved label encoders, fall back to free-text boxes for every feature.
if label_encoders:
    inputs = [
        gr.Dropdown(choices=[str(c) for c in label_encoders[f].classes_], label=f)
        if f in label_encoders else gr.Number(label=f)
        for f in feature_order
    ]
else:
    inputs = [gr.Textbox(label=f) for f in feature_order]
outputs = [gr.Label(num_top_classes=1), gr.JSON()]

iface = gr.Interface(fn=predict_fn, inputs=inputs, outputs=outputs, title='Obesity Level Prediction')

if __name__ == '__main__':
    iface.launch()
"""

with open('pipelines/task3.py', 'w', encoding='utf-8') as f:
    f.write(app_template)

print('Wrote pipelines/task3.py template for Gradio deployment')