# Obesity Classification Project

This notebook covers:
1. Exploratory Data Analysis (EDA)
2. Data Preprocessing
3. Model Training (Logistic Regression, Random Forest, SVM, Gradient Boosting)
4. Model Evaluation
5. Saving the Best Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [None]:
# Load Data
# Read the obesity dataset from a CSV path resolved against the current working directory.
data_path = 'Obesity Classification.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Every later cell needs `df`. Printing and carrying on would only defer the
    # failure to a NameError on `df.head()` (fresh kernel) or silently reuse a
    # stale `df` (warm kernel), so report the problem and re-raise.
    print("File not found. Please check the path.")
    raise
else:
    print("Data loaded successfully.")

# Preview the first rows as the cell output.
df.head()

In [None]:
# Data Info: print a concise summary of the loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Statistical Summary: descriptive statistics of the frame, shown as the cell output
df.describe()

In [None]:
# Check for missing values: number of null entries in each column
df.isnull().sum()

## Visualizations

In [None]:
# Distribution of Target Variable
# Bar chart with one bar per distinct value of the 'Label' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Label', ax=ax)
ax.set_title('Distribution of Obesity Levels')
# Rotate the x tick labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Age Distribution
# Histogram of the 'Age' column with a kernel density estimate drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
ages = df['Age']
sns.histplot(ages, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Correlation Heatmap
# Annotated heatmap of the pairwise correlations between the numeric columns
# (numeric_only=True leaves non-numeric columns out of the matrix).
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Preprocessing

In [None]:
# Encoding Categorical Variables
# One LabelEncoder per categorical column; each present column is overwritten
# in `df` with its integer codes.
# NOTE: this mutates `df` in place, so re-running the cell fits the encoders on
# the already-encoded integers. Re-run the load cell first.
le_gender = LabelEncoder()
le_label = LabelEncoder()

if 'Gender' in df.columns:
    gender_codes = le_gender.fit_transform(df['Gender'])
    df['Gender'] = gender_codes

if 'Label' in df.columns:
    label_codes = le_label.fit_transform(df['Label'])
    df['Label'] = label_codes
    # classes_[i] is the original value that was mapped to code i.
    print("Classes:", le_label.classes_)

# Show the encoded frame as the cell output.
df.head()

In [None]:
# Splitting Data
# Features are every column except the target; the 'ID' column, when present,
# is dropped from the features as well.
X = df.drop(columns=['Label'])
y = df['Label']

if 'ID' in X:
    X = X.drop(columns=['ID'])

# 80/20 hold-out split with a fixed seed. NOTE(review): the split is not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling
# Fit the scaler on the training rows only, then apply the same transform to
# both splits. After this, X_train / X_test hold the scaled arrays.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)

## Model Training and Evaluation

In [None]:
# Train every candidate classifier on the scaled training split, report its
# test-set metrics, and remember the one with the highest test accuracy.
RANDOM_STATE = 42  # same value as the random_state used for the train/test split

models = {
    'Logistic Regression': LogisticRegression(),
    # Random forest and gradient boosting are randomized estimators. Without a
    # fixed random_state their scores, and therefore the selected model, can
    # change between runs.
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

best_model = None
# Start below any achievable accuracy so that a model is always selected, even
# if every candidate scores 0.0. Starting at 0 would leave best_model as None.
best_score = float('-inf')
best_model_name = ""

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    # zero_division=0 reports 0 for classes that were never predicted instead
    # of emitting an undefined-metric warning for each of them.
    print(classification_report(y_test, y_pred, zero_division=0))

    # Strict '>' keeps the earlier model when two candidates tie.
    if acc > best_score:
        best_score = acc
        best_model = model
        best_model_name = name

# NOTE(review): the winner is picked by accuracy on X_test, the same split that
# is reported here, so this figure is an optimistic estimate.
print(f"\nBest Model: {best_model_name} with Accuracy: {best_score:.4f}")

In [None]:
# Confusion Matrix for Best Model
y_pred_best = best_model.predict(X_test)

# LabelEncoder maps classes_[i] to code i, so the encoded ids are 0..n_classes-1.
class_names = le_label.classes_
class_ids = np.arange(len(class_names))

# Pass every class id explicitly. Otherwise a class that is absent from both
# y_test and the predictions is dropped from the matrix, and the remaining
# rows/columns no longer line up with the class codes.
cm = confusion_matrix(y_test, y_pred_best, labels=class_ids)

plt.figure(figsize=(8,6))
# Tick labels show the original class names instead of bare matrix indices.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix: {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Save Model, Scaler and Encoders
# Refuse to pickle a missing model: dumping None would "succeed" and leave an
# unusable best_model.pkl behind.
if best_model is None:
    raise RuntimeError("No model was selected; run the training cell before saving.")

joblib.dump(best_model, 'best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Best model and scaler saved successfully!")

# The model predicts the integer codes produced by le_label, and 'Gender' was
# turned into codes by le_gender before scaling. Persist the fitted encoders so
# new inputs can be encoded the same way and predictions decoded to class names.
joblib.dump(le_label, 'label_encoder.pkl')
if hasattr(le_gender, 'classes_'):  # only fitted when a 'Gender' column existed
    joblib.dump(le_gender, 'gender_encoder.pkl')
print("Label encoders saved successfully!")