In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent directory importable so the project-local `src` package resolves.
# NOTE: '..' is resolved against the current working directory, so this presumably
# expects the kernel to be started from the notebook's own directory -- confirm.
project_root = os.path.abspath('..')
if project_root not in sys.path:  # keeps the cell idempotent when it is re-run
    sys.path.append(project_root)

from src.utils.seed import set_seed
from src.utils.config import Config

# Seed early, before any stochastic work (what gets seeded is defined in src.utils.seed).
set_seed(42)

# Global plot styling; the figure size is the default for plots that pass no figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Load the project configuration and read the processed dataset it points to.
config = Config('../configs/config.yaml')
data_path = config.data['processed_data_path']

print(f"Loading data from: {data_path}")
df = pd.read_csv(data_path)

# Report the dimensions, then preview the top rows as the cell's rich output.
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head(n=5)

In [None]:
# Structural overview of the frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
print("Dataset Info:")
df.info()

# Per-column count of missing values.
print("\nMissing values:")
print(df.isnull().sum())

# Left as the last expression so the table renders as the cell's rich output.
print("\nSummary statistics:")
df.describe()

In [None]:
target_col = config.data['target_col']

# Count each class once and reuse the result for both the printout and the plot.
target_counts = df[target_col].value_counts()
print(f"Target distribution ({target_col}):")
print(target_counts)

# Relative class frequencies (proportions), so the header below has content.
print("\nClass balance:")
print(df[target_col].value_counts(normalize=True))

# Bar chart of the absolute class counts; rot=0 keeps the class labels horizontal.
fig, ax = plt.subplots(figsize=(6, 4))
target_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('Target Distribution')
ax.set_xlabel(target_col)
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Read both configured feature lists from the data section of the config.
numerical_features, categorical_features = (
    config.data['numerical_features'],
    config.data['categorical_features'],
)

# Echo the configured lists.
print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

In [None]:
# One 30-bin histogram per numerical feature, laid out side by side in one row.
if numerical_features:
    # squeeze=False always yields a 2-D array of axes, so a single feature
    # needs no special-casing; row 0 holds every panel.
    fig, axes = plt.subplots(
        1, len(numerical_features), figsize=(15, 4), squeeze=False
    )

    for feat, ax in zip(numerical_features, axes[0]):
        df[feat].hist(bins=30, ax=ax)
        ax.set(title=f'{feat} Distribution', xlabel=feat, ylabel='Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# Bar chart of value counts for each categorical feature, one panel per feature.
if categorical_features:
    n_panels = len(categorical_features)
    # squeeze=False guarantees an array of axes even when there is one panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(12, 4), squeeze=False)

    for ax, feat in zip(axes.flat, categorical_features):
        level_counts = df[feat].value_counts()
        level_counts.plot(kind='bar', ax=ax)
        ax.set_title(f'{feat} Distribution')
        ax.set_xlabel(feat)
        ax.set_ylabel('Count')
        # Tilt the category labels on the x axis.
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Only drawn when at least two numerical features are configured.
if numerical_features and len(numerical_features) > 1:
    correlation_matrix = df[numerical_features].corr()

    # Annotated heatmap with the colour scale centred on zero.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        ax=ax,
    )
    ax.set_title('Feature Correlation Matrix')
    fig.tight_layout()
    plt.show()

In [None]:
# Final status line; the leading check mark was previously stored as mis-decoded bytes.
print("✓ Data exploration complete!")
# df.shape[1] counts every column in the frame (the target included), so the
# number is reported as columns rather than features.
print(f"Dataset ready for training: {df.shape[0]} samples, {df.shape[1]} columns")