# 📊 Phase 2: Exploratory Data Analysis (EDA)

This notebook demonstrates comprehensive exploratory data analysis techniques for MLOps, covering data exploration, selection, filtering, and visualization.

## Table of Contents
1. [Exploratory Data Analysis](#1-exploratory-data-analysis)
2. [Data Selection & Filtering](#2-data-selection--filtering)
3. [Data Visualization](#3-data-visualization)

---

## Prerequisites
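Make sure you have the required libraries installed (`pyarrow` is the engine `pd.read_parquet` uses to load the Phase 1 dataset):
```bash
pip install pandas numpy matplotlib seaborn plotly scipy scikit-learn pyarrow
```

**Note:** The setup cell (library imports and data loading) is located at the end of this notebook. Run it first — every other cell relies on the `pd`, `np`, `plt`, `sns`, `px`, and `df` names it defines.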


## 1. Exploratory Data Analysis

**Purpose**: Understand data characteristics, patterns, and potential issues before modeling.


In [None]:
# 1.1 Basic Data Summary
# Prints a structural overview of `df`: shape, dtypes, descriptive statistics,
# memory footprint, per-column missing values and an overall quality recap.
print("🔍 Step 7: Exploratory Data Analysis")
print("=" * 50)

# Quantities that are reported more than once are computed a single time
n_rows, n_cols = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

# Structure and descriptive statistics
print("Dataset Shape:", df.shape)
print("\nData Types:", df.dtypes, sep="\n")
print("\nStatistical Summary:", df.describe(), sep="\n")

# Deep memory footprint (object columns are measured by content)
print(f"\nMemory Usage: {memory_mb:.2f} MB")

# Per-column missing values: absolute count and share of all rows
missing_data = df.isna().sum()
missing_percent = missing_data / n_rows * 100
missing_df = pd.concat(
    [
        missing_data.rename('Missing Count'),
        missing_percent.rename('Missing Percentage'),
    ],
    axis=1,
)
has_missing = missing_df['Missing Count'] > 0
print("\nMissing Values Analysis:")
print(missing_df.loc[has_missing])

# Overall data quality recap
print("\nData Quality Summary:")
print(f"Total rows: {n_rows}")
print(f"Total columns: {n_cols}")
print(f"Missing values: {missing_data.sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")
print(f"Memory usage: {memory_mb:.2f} MB")


In [None]:
# 1.2 Distribution Analysis
# Draws one histogram per numerical column of `df` and prints the most
# frequent values of every categorical column.
print("\n📊 Distribution Analysis")
print("-" * 30)

# Numerical columns distribution
numerical_cols = df.select_dtypes(include=[np.number]).columns
print(f"Numerical columns: {list(numerical_cols)}")

# Create distribution plots for numerical columns.
# The grid is sized from the number of columns so that no numerical column is
# silently dropped (a fixed 2x3 grid only ever showed the first six).
plots_per_row = 3
if len(numerical_cols) > 0:
    grid_rows = -(-len(numerical_cols) // plots_per_row)  # ceiling division
    fig, axes = plt.subplots(
        grid_rows,
        plots_per_row,
        figsize=(15, 5 * grid_rows),
        squeeze=False,  # always a 2-D array, even for a single row
    )
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_cols):
        df[col].hist(bins=30, ax=ax, alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused panels of the last row instead of showing empty axes
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns to plot.")

# Categorical columns analysis
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(f"\nCategorical columns: {list(categorical_cols)}")

for col in categorical_cols:
    # Top 10 categories by frequency, plus the total number of distinct values
    print(f"\n{col} value counts:")
    print(df[col].value_counts().head(10))
    print(f"Unique values: {df[col].nunique()}")


In [None]:
# 1.3 Correlation Analysis
# Shows the correlation matrix of the numerical columns as a heatmap and lists
# every pair of distinct columns whose absolute correlation exceeds 0.7.
print("\n🔗 Correlation Analysis")
print("-" * 30)

# Correlation matrix, rendered as an annotated heatmap
correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# Pairwise correlations: walk the upper triangle only, so each pair is
# visited once and the diagonal (self-correlation) is skipped
corr_columns = correlation_matrix.columns
n_features = len(corr_columns)
high_corr_pairs = [
    (corr_columns[row], corr_columns[col], correlation_matrix.iloc[row, col])
    for row in range(n_features)
    for col in range(row + 1, n_features)
    if abs(correlation_matrix.iloc[row, col]) > 0.7  # High correlation threshold
]

print("High Correlation Pairs:")
for first_col, second_col, corr_val in high_corr_pairs:
    print(f"{first_col} - {second_col}: {corr_val:.3f}")

# 1.4 Outlier Detection
# Section banner: title line followed by a 30-character rule
print("\n🔍 Outlier Detection", "-" * 30, sep="\n")

def detect_outliers_iqr(df: "pd.DataFrame", column: str, iqr_multiplier: float = 1.5) -> "pd.DataFrame":
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - iqr_multiplier * IQR``
    or strictly above ``Q3 + iqr_multiplier * IQR``, where ``Q1`` and ``Q3`` are
    the 25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: Frame to scan.
        column: Name of the column to evaluate.
        iqr_multiplier: Width of the fences in IQR units. Defaults to 1.5.

    Returns:
        The subset of ``df`` (all columns, original index) holding the flagged
        rows. Missing values fail both comparisons and are never flagged.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = iqr_multiplier * (q3 - q1)
    is_outlier = (values < q1 - fence) | (values > q3 + fence)
    return df[is_outlier]

# Detect outliers in numerical columns
# For each numerical column, record how many rows the IQR rule flags and what
# share of the dataset that represents.
# NOTE(review): identifier-like or binary columns are numerical too, so their
# counts are reported here as well — interpret those entries with care.
total_rows = len(df)
outlier_summary = {}
for col in numerical_cols:
    outliers = detect_outliers_iqr(df, col)
    outlier_count = len(outliers)
    outlier_summary[col] = {
        'count': outlier_count,
        # An empty frame has no outliers; guard against dividing by zero
        'percentage': (outlier_count / total_rows) * 100 if total_rows else 0.0,
    }

print("Outlier Summary:")
for col, info in outlier_summary.items():
    print(f"{col}: {info['count']} outliers ({info['percentage']:.2f}%)")


## 2. Data Selection & Filtering

**Purpose**: Create targeted datasets for specific analysis and model training.


In [None]:
# 2.1 Data Selection Techniques
# Demonstrates column projection, boolean-mask row filtering, and the
# label-based (.loc) and position-based (.iloc) indexers on `df`.
print("✂️ Step 8: Data Selection & Filtering")
print("=" * 50)

# Column projection: keep only the listed features
selected_columns = ['age', 'income', 'education', 'default_risk']
df_selected = df[selected_columns]
print(f"Selected columns: {selected_columns}")
print(f"Selected dataset shape: {df_selected.shape}")

# Row filtering with a single boolean mask
is_adult = df['age'] > 18
df_filtered = df[is_adult]
print(f"\nFiltered by age > 18: {len(df_filtered)} rows")

# Rows strictly above the 80th income percentile
income_p80 = df['income'].quantile(0.8)
df_high_income = df[df['income'] > income_p80]
print(f"High income (top 20%): {len(df_high_income)} rows")

# Several masks combined with logical AND
earns_over_50k = df['income'] > 50000
has_bachelor = df['education'] == 'Bachelor'
df_filtered = df[is_adult & earns_over_50k & has_bachelor]
print(f"Age > 18 AND income > 50k AND Bachelor's: {len(df_filtered)} rows")

# .loc[]: row mask plus column labels in one label-based lookup
is_employed = df['employment_status'] == 'Employed'
df_subset = df.loc[is_employed, ['age', 'income', 'credit_score']]
print(f"Employed customers subset: {df_subset.shape}")

# .iloc[]: purely positional slice (first 100 rows, first 5 columns)
df_subset = df.iloc[:100, :5]
print(f"First 100 rows, first 5 columns: {df_subset.shape}")

# 2.2 Advanced Filtering
# Filters `df` by dtype, by completeness, by substring, by date and by quantile.
print("\n🔍 Advanced Filtering Techniques")
print("-" * 30)

# Split the columns by data type
numerical_data = df.select_dtypes(include=np.number)
categorical_data = df.select_dtypes(include=['object', 'category'])
print(f"Numerical data shape: {numerical_data.shape}")
print(f"Categorical data shape: {categorical_data.shape}")

# Keep only the rows that have no missing value in any column
df_no_nulls = df.dropna()
print(f"Rows with no nulls: {len(df_no_nulls)}")

# Substring match on a text column (rows with a missing city never match)
if 'city' in df.columns:
    in_new_york = df['city'].str.contains('New York', na=False)
    df_ny_customers = df[in_new_york]
    print(f"New York customers: {len(df_ny_customers)}")

# Date range: applications dated 2020-06-01 or later (only if the column exists)
if 'application_date' in df.columns:
    is_recent = df['application_date'] >= '2020-06-01'
    df_recent = df[is_recent]
    print(f"Recent applications (after June 2020): {len(df_recent)}")

# Quantile cut: rows at or above the 90th income percentile
income_p90 = df['income'].quantile(0.9)
df_top_10_percent = df[df['income'] >= income_p90]
print(f"Top 10% income: {len(df_top_10_percent)} rows")


## 3. Data Visualization

**Purpose**: Create visual representations to understand data patterns and relationships.


In [None]:
# 3.1 Basic Plotting with Matplotlib
# Draws a daily application-count line plot (when a date column is present),
# a bar chart of education levels and a histogram of ages.
print("📊 Step 9: Data Visualization")
print("=" * 50)

# Time series: number of applications per calendar day
if 'application_date' in df.columns:
    plt.figure(figsize=(12, 6))
    ts_ax = plt.gca()
    application_days = df['application_date'].dt.date
    daily_counts = df.groupby(application_days).size()
    ts_ax.plot(daily_counts.index, daily_counts.values)
    ts_ax.set_title('Applications Over Time')
    ts_ax.set_xlabel('Date')
    ts_ax.set_ylabel('Number of Applications')
    plt.xticks(rotation=45)
    ts_ax.grid(True, alpha=0.3)
    plt.show()

# Categorical data: frequency of each education level as bars
plt.figure(figsize=(10, 6))
education_counts = df['education'].value_counts()
bar_ax = education_counts.plot(kind='bar', ax=plt.gca())
bar_ax.set_title('Education Distribution')
bar_ax.set_xlabel('Education Level')
bar_ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Numerical data: age histogram with 30 bins
plt.figure(figsize=(12, 8))
hist_ax = df['age'].hist(bins=30, ax=plt.gca(), alpha=0.7, edgecolor='black')
hist_ax.set_title('Age Distribution')
hist_ax.set_xlabel('Age')
hist_ax.set_ylabel('Frequency')
plt.show()

# 3.2 Advanced Visualization with Seaborn
# Scatter, box, violin and heatmap views of `df`; each seaborn call draws on
# the figure created just before it and returns the Axes it used.
print("\n🎨 Advanced Visualizations")
print("-" * 30)

# Scatter plot of income against age, coloured by education level
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(x='age', y='income', hue='education', data=df)
scatter_ax.set_title('Age vs Income by Education')
plt.show()

# Box plot: income spread (and outliers) per education level
plt.figure(figsize=(12, 8))
box_ax = sns.boxplot(x='education', y='income', data=df)
box_ax.set_title('Income Distribution by Education')
plt.xticks(rotation=45)
plt.show()

# Violin plot: shape of the credit score distribution per employment status
plt.figure(figsize=(10, 6))
violin_ax = sns.violinplot(x='employment_status', y='credit_score', data=df)
violin_ax.set_title('Credit Score Distribution by Employment Status')
plt.xticks(rotation=45)
plt.show()

# Heatmap of the correlations between the numerical columns
plt.figure(figsize=(10, 8))
correlation_matrix = df[numerical_cols].corr()
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# 3.3 Interactive Visualizations with Plotly
# Builds and displays three Plotly Express figures from `df`.
print("\n🔄 Interactive Visualizations")
print("-" * 30)

# Age vs income scatter, one colour per education level
fig = px.scatter(
    data_frame=df,
    x='age',
    y='income',
    color='education',
    title='Interactive Age vs Income by Education',
)
fig.show()

# Credit score histogram, split by the default_risk flag
fig = px.histogram(
    data_frame=df,
    x='credit_score',
    color='default_risk',
    title='Credit Score Distribution by Default Risk',
)
fig.show()

# Income box plot per education level
fig = px.box(
    data_frame=df,
    x='education',
    y='income',
    title='Income Distribution by Education Level',
)
fig.show()

print("✅ Data visualization completed successfully!")


In [None]:
# Import required libraries
import glob
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Set display options
pd.set_option('display.max_columns', None)  # never truncate the column list
pd.set_option('display.width', None)
# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')

# Set plotting style
# The seaborn style sheets were renamed to 'seaborn-v0_8*' in matplotlib 3.6;
# fall back to the legacy name so the notebook also runs on older releases.
preferred_style = 'seaborn-v0_8'
plt.style.use(preferred_style if preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Set random seed for reproducibility
np.random.seed(42)

print("✅ Libraries imported successfully!")
print(f"📅 Current date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Load data from Phase 1 (if available)
# pd.read_parquet() needs a concrete path and does not expand wildcards, so the
# versioned file name is resolved with glob first; when several versions exist
# the most recently modified one is loaded.
df = None
phase1_candidates = glob.glob('data/processed/dataset_v*.parquet')
if phase1_candidates:
    latest_phase1_file = max(phase1_candidates, key=os.path.getmtime)
    try:
        df = pd.read_parquet(latest_phase1_file)
        print("✅ Loaded processed data from Phase 1")
    except Exception as exc:
        # Best-effort load: report why it failed, then fall back to sample data
        print(f"⚠️  Could not read {latest_phase1_file}: {exc}")

if df is None:
    # Create sample data if Phase 1 data not available
    print("⚠️  Phase 1 data not found, creating sample data...")

    # Re-seed so the synthetic dataset is identical on every run of this cell
    np.random.seed(42)
    n_samples = 1000

    # Synthetic loan-application records: one row per customer
    data = {
        'customer_id': range(1, n_samples + 1),
        'age': np.random.normal(35, 12, n_samples).astype(int),
        'income': np.random.lognormal(10, 0.5, n_samples),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
        'employment_status': np.random.choice(['Employed', 'Unemployed', 'Self-employed', 'Retired'], n_samples, p=[0.6, 0.1, 0.2, 0.1]),
        'credit_score': np.random.normal(650, 100, n_samples).astype(int),
        'loan_amount': np.random.exponential(50000, n_samples),
        'loan_duration': np.random.choice([12, 24, 36, 48, 60], n_samples, p=[0.2, 0.3, 0.3, 0.15, 0.05]),
        'interest_rate': np.random.normal(8.5, 2, n_samples),
        'default_risk': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'application_date': pd.date_range('2020-01-01', periods=n_samples, freq='D'),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_samples, p=[0.2, 0.15, 0.15, 0.15, 0.35]),
        'marital_status': np.random.choice(['Single', 'Married', 'Divorced', 'Widowed'], n_samples, p=[0.4, 0.4, 0.15, 0.05]),
        'dependents': np.random.poisson(1.5, n_samples),
        'previous_loans': np.random.poisson(2, n_samples),
        'late_payments': np.random.poisson(0.5, n_samples),
        'debt_to_income_ratio': np.random.beta(2, 5, n_samples),
        'credit_utilization': np.random.beta(3, 2, n_samples),
        'home_ownership': np.random.choice(['Rent', 'Own', 'Mortgage'], n_samples, p=[0.4, 0.2, 0.4]),
        'purpose': np.random.choice(['Debt Consolidation', 'Home Improvement', 'Business', 'Education'], n_samples, p=[0.4, 0.2, 0.2, 0.2])
    }

    df = pd.DataFrame(data)
    print("✅ Sample dataset created")

print("\n📊 Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
