# Task 1: Exploratory Data Analysis (EDA)
This notebook performs exploratory data analysis on the Obesity Dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 inch figure size
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Load and Inspect Data

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('ObesityDataset.csv')

# Display basic information: shape, a sample of rows, column dtypes,
# per-column missing-value counts and summary statistics
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nBasic Statistics:")
print(df.describe())

## 2. Target Variable Analysis

In [None]:
# Summarise the target column (NObeyesdad): absolute and relative class frequencies
target = df['NObeyesdad']
class_counts = target.value_counts()

print("Target Variable Distribution:")
print(class_counts)
print("\nTarget Variable Percentages:")
print(target.value_counts(normalize=True) * 100)

# Two panels side by side: class counts as bars, class shares as a pie
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))

class_counts.plot.bar(ax=ax_bar, color='skyblue')
ax_bar.set_title('Obesity Level Distribution')
ax_bar.set_xlabel('Obesity Level')
ax_bar.set_ylabel('Count')
plt.setp(ax_bar.get_xticklabels(), rotation=45, ha='right')

class_counts.plot.pie(ax=ax_pie, autopct='%1.1f%%')
ax_pie.set_title('Obesity Level Percentage')
ax_pie.set_ylabel('')  # drop the automatic y-label pandas adds to pie charts

plt.tight_layout()
plt.show()

## 3. Categorical Features Analysis

In [None]:
# Identify categorical columns (object dtype), leaving out the target variable.
# Filtering instead of list.remove() avoids a ValueError if the target column
# is not among the object-dtype columns.
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'NObeyesdad'
]

print("Categorical Features:")
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())

# Visualize categorical features: one bar chart of value counts per column.
# The grid is sized from the number of columns so that no feature is silently
# dropped and no empty panel is drawn (a fixed 3x3 grid did both).
if categorical_cols:
    grid_cols = 3
    grid_rows = (len(categorical_cols) + grid_cols - 1) // grid_cols  # ceiling division
    fig, axes = plt.subplots(grid_rows, grid_cols,
                             figsize=(15, 4 * grid_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, categorical_cols):
        df[col].value_counts().plot(kind='bar', ax=ax, color='coral')
        ax.set_title(f'{col} Distribution')
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45)

    # Hide the unused trailing panels of the last row
    for ax in axes[len(categorical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 4. Numerical Features Analysis

In [None]:
# Identify numerical columns (int64 / float64 dtypes)
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Numerical Features:")
print(df[numerical_cols].describe())

# Distribution of numerical features: one 20-bin histogram per column.
# The grid is sized from the number of columns so that no feature is silently
# dropped and no empty panel is drawn (a fixed 2x4 grid did both).
if numerical_cols:
    grid_cols = 4
    grid_rows = (len(numerical_cols) + grid_cols - 1) // grid_cols  # ceiling division
    fig, axes = plt.subplots(grid_rows, grid_cols,
                             figsize=(16, 4 * grid_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_cols):
        ax.hist(df[col], bins=20, color='lightgreen', edgecolor='black')
        ax.set_title(f'{col} Distribution')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused trailing panels of the last row
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between the numerical columns, drawn as an annotated heatmap
correlation_matrix = df[numerical_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# List each unordered pair once (upper triangle only, diagonal excluded)
print("\nHighly correlated features (|correlation| > 0.5):")
feature_names = correlation_matrix.columns
for row_pos, first_name in enumerate(feature_names):
    for col_pos in range(row_pos + 1, len(feature_names)):
        coefficient = correlation_matrix.iloc[row_pos, col_pos]
        if abs(coefficient) > 0.5:
            print(f"{first_name} - {feature_names[col_pos]}: {coefficient:.3f}")

## 6. Relationship between Features and Target

In [None]:
# Age distribution by obesity level (one box per target class)
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='NObeyesdad', y='Age', palette='Set2')
plt.title('Age Distribution by Obesity Level')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Gender distribution by obesity level (stacked bars of the class x gender counts).
# DataFrame.plot() opens a figure of its own unless it is handed an `ax`, so a
# preceding plt.figure() would only leave an empty figure behind and its
# figsize would never reach the chart. Create the axes explicitly and draw on it.
fig, ax = plt.subplots(figsize=(12, 6))
pd.crosstab(df['NObeyesdad'], df['Gender']).plot(
    kind='bar', stacked=True, colormap='viridis', ax=ax
)
ax.set_title('Gender Distribution by Obesity Level')
ax.set_xlabel('Obesity Level')
ax.set_ylabel('Count')
ax.legend(title='Gender')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 7. Key Insights and Summary

In [None]:
# Print a recap of the figures gathered in the cells above.
# Relies on `categorical_cols` and `numerical_cols` from the earlier cells.
banner = "=" * 80
print(banner)
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print(banner)

# df.shape counts every column, so the "features" figure includes the target
sample_count, column_count = df.shape
print(f"\n1. Dataset Size: {sample_count} samples, {column_count} features")

# value_counts() sorts by descending frequency: first = most common, last = rarest
target = df['NObeyesdad']
print(f"\n2. Target Classes: {target.nunique()} obesity levels")
ranked_classes = target.value_counts().index
print(f"   - Most common: {ranked_classes[0]}")
print(f"   - Least common: {ranked_classes[-1]}")

total_missing = df.isnull().sum().sum()
print(f"\n3. Data Quality: {total_missing} missing values")

print("\n4. Feature Types:")
print(f"   - Categorical: {len(categorical_cols)} features")
print(f"   - Numerical: {len(numerical_cols)} features")

age = df['Age']
print(f"\n5. Age Range: {age.min():.0f} to {age.max():.0f} years")
print(f"   - Mean Age: {age.mean():.1f} years")
print("\n" + banner)