# Task 2: Model Training
This notebook implements data preprocessing, feature engineering, and model training for obesity classification.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
# Suppress every warning for the rest of the session. This also hides any
# warnings raised while the models are being fitted, so narrow or remove this
# filter when debugging.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so NumPy-based randomness repeats
# between runs (the estimators that take a seed also receive random_state=42).
np.random.seed(42)

## 1. Load and Prepare Data

In [None]:
# Read the raw obesity dataset from the working directory
df = pd.read_csv('ObesityDataset.csv')

# Quick sanity report: dimensions, then the labels we are going to predict
print("Dataset loaded successfully!")
print("Shape: {}".format(df.shape))
print("\nTarget classes: {}".format(df['NObeyesdad'].unique()))
print("\nClass distribution:")
print(df['NObeyesdad'].value_counts())

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Work on a copy so the raw frame `df` stays untouched
df_processed = df.copy()

# Turn the string class labels into integer codes
label_encoder_target = LabelEncoder()
df_processed['NObeyesdad_encoded'] = label_encoder_target.fit_transform(
    df_processed['NObeyesdad']
)

# Keep a label -> code lookup so predictions can be decoded later
class_labels = label_encoder_target.classes_
class_codes = label_encoder_target.transform(class_labels)
target_mapping = dict(zip(class_labels, class_codes))

print("Target variable encoding:")
for class_label in sorted(target_mapping, key=target_mapping.get):
    print(f"  {target_mapping[class_label]}: {class_label}")

# Features are every column except the raw and the encoded target
y = df_processed['NObeyesdad_encoded']
X = df_processed.drop(columns=['NObeyesdad', 'NObeyesdad_encoded'])

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

### 2.1 Encode Categorical Features

In [None]:
# Collect the names of all object-dtype feature columns
categorical_cols = list(X.select_dtypes(include=['object']).columns)
print(f"Categorical columns: {categorical_cols}")

# Integer-encode each categorical column, keeping its fitted encoder so the
# same mapping can be reapplied later
X_encoded = X.copy()
label_encoders = {}

for column_name in categorical_cols:
    encoder = LabelEncoder()
    X_encoded[column_name] = encoder.fit_transform(X[column_name])
    label_encoders[column_name] = encoder

    # Show which integer each category was assigned
    print(f"\n{column_name} encoding:")
    category_codes = encoder.transform(encoder.classes_)
    for category, code in zip(encoder.classes_, category_codes):
        print(f"  {code}: {category}")

print(f"\nEncoded feature matrix shape: {X_encoded.shape}")
print("\nFirst few rows after encoding:")
print(X_encoded.head())

## 3. Split Data into Training and Testing Sets

In [None]:
# Stratified hold-out split: 80% for training, 20% for testing, with the class
# proportions of `y` preserved in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# Per-class counts for each part, ordered by encoded class id
for split_label, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"\n{split_label} set class distribution:")
    print(split_target.value_counts().sort_index())

## 4. Feature Scaling

In [None]:
# Standardize the features (important for distance-based algorithms).
# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")
print(f"\nScaled training set shape: {X_train_scaled.shape}")
print(f"Scaled testing set shape: {X_test_scaled.shape}")

# Per-feature statistics of the scaled training matrix as a sanity check
print("\nFeature means after scaling (should be close to 0):")
print(X_train_scaled.mean(axis=0))
print("\nFeature stds after scaling (should be close to 1):")
print(X_train_scaled.std(axis=0))

## 5. Train Multiple Classification Models

In [None]:
# Candidate classifiers keyed by display name; insertion order is the training order
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=100, random_state=42)
models['K-Nearest Neighbors'] = KNeighborsClassifier(n_neighbors=5)
models['Support Vector Machine'] = SVC(kernel='rbf', random_state=42)
models['Naive Bayes'] = GaussianNB()

print(f"Training {len(models)} models...")
print("=" * 80)

In [None]:
# Fit every candidate and record its accuracy on the training data
trained_models = {}
training_scores = {}

# These estimators are fitted on the standardized matrix; every other model
# is fitted on the unscaled training frame
scaled_model_names = {
    'Logistic Regression',
    'K-Nearest Neighbors',
    'Support Vector Machine',
    'Naive Bayes',
}

for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")

    # Pick the feature matrix once, then fit and score on that same matrix
    train_features = X_train_scaled if model_name in scaled_model_names else X_train
    estimator.fit(train_features, y_train)
    accuracy = estimator.score(train_features, y_train)

    trained_models[model_name] = estimator
    training_scores[model_name] = accuracy

    print(f"  Training accuracy: {accuracy:.4f}")

print("\n" + "=" * 80)
print("All models trained successfully!")

## 6. Training Accuracy Comparison

In [None]:
# Tabulate the training accuracy of every model, best first.
# These are scores on the data the models were fitted on, not held-out scores.
training_results = pd.DataFrame({
    'Model': list(training_scores.keys()),
    'Training Accuracy': list(training_scores.values())
}).sort_values('Training Accuracy', ascending=False)

print("\nTraining Accuracy Summary:")
print("=" * 60)
print(training_results.to_string(index=False))
print("=" * 60)

# Horizontal bar chart of the same numbers, one bar per model
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(training_results['Model'], training_results['Training Accuracy'], color='skyblue')
ax.set_xlabel('Training Accuracy')
ax.set_title('Model Training Accuracy Comparison')
# Accuracy tops out at 1.0 and each value label starts 0.01 to the right of its
# bar, so leave headroom past 1.0; with an upper limit of exactly 1 the labels
# of (near-)perfect models are drawn outside the axes.
ax.set_xlim(0, 1.1)
for bar_position, accuracy_value in enumerate(training_results['Training Accuracy']):
    ax.text(accuracy_value + 0.01, bar_position, f'{accuracy_value:.4f}', va='center')
fig.tight_layout()
plt.show()

## 7. Save Models and Preprocessing Objects

In [None]:
# Persist everything the evaluation notebook (Task 3) needs
import pickle

# Fitted encoders and scaler, plus the lookups needed to interpret them
preprocessing_objects = dict(
    label_encoders=label_encoders,
    label_encoder_target=label_encoder_target,
    scaler=scaler,
    target_mapping=target_mapping,
    feature_names=X_encoded.columns.tolist(),
)

# The exact train/test partition, in both unscaled and scaled form
split_data = dict(
    X_train=X_train,
    X_test=X_test,
    X_train_scaled=X_train_scaled,
    X_test_scaled=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

# One pickle file per artifact, written in a fixed order
artifacts = (
    ('trained_models.pkl', trained_models),
    ('preprocessing_objects.pkl', preprocessing_objects),
    ('split_data.pkl', split_data),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)

print("✓ Models saved to 'trained_models.pkl'")
print("✓ Preprocessing objects saved to 'preprocessing_objects.pkl'")
print("✓ Train/test split saved to 'split_data.pkl'")
print("\nAll data ready for evaluation in Task 3!")