In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.abspath('.')))

from preprocessing.preprocess import load_csv, clean_data, preprocess_for_model

# Set display options
pd.set_option('display.max_columns', None)
sns.set_style('darkgrid')
%matplotlib inline

print("Libraries imported successfully")

## Load & Inspect Dataset

In [None]:
# Path of the CSV file to explore -- edit this to point at your own dataset.
dataset_path = '../datasets/sample_toy.csv'

# load_csv is the project helper imported from preprocessing.preprocess.
df = load_csv(dataset_path)

# Short report: where the data came from, its shape, and its column names.
print(
    f"Dataset loaded from: {dataset_path}",
    f"Dataset shape: {df.shape}",
    f"\nColumn names: {df.columns.tolist()}",
    sep="\n",
)

In [None]:
# Peek at both ends of the frame (head/tail default to 5 rows each).
print("First 5 rows:", df.head(), sep="\n")
print(f"\nLast 5 rows:", df.tail(), sep="\n")

In [None]:
# Data types and per-column memory footprint
print("Data types:")
print(df.dtypes)

# deep=True asks pandas to introspect object-dtype columns so the reported
# bytes include the Python objects they hold; without it such columns are
# under-reported.
print(f"\nMemory usage:")
print(df.memory_usage(deep=True))

In [None]:
# Check for missing values: per-column NaN counts, then percentages.
missing = df.isnull().sum()
missing_only = missing[missing > 0]

print("Missing values:")
if missing_only.empty:
    # Nothing is missing: say so instead of printing an empty Series repr.
    print("No missing values found!")
else:
    print(missing_only)

    missing_pct = (missing / len(df) * 100).round(2)
    print(f"\nMissing percentage:")
    # Filter on the raw counts rather than the rounded percentages, so a
    # column whose share rounds down to 0.00 is still reported.
    print(missing_pct[missing > 0])

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with defaults.
print("Summary statistics:")
summary_stats = df.describe()
print(summary_stats)

## Data Quality Analysis

In [None]:
# Count fully duplicated rows; the first occurrence of a row is not counted.
duplicates = df.duplicated(keep='first').sum()
print(f"Duplicate rows: {duplicates}")

# Report the share only when at least one duplicate row exists.
if duplicates:
    print(f"Duplicate percentage: {(duplicates / len(df) * 100):.2f}%")

In [None]:
# How many columns there are of each dtype.
dtype_counts = df.dtypes.value_counts()
print("Data type distribution:", dtype_counts, sep="\n")

# Split the column names by dtype family; later cells read both lists.
numeric_frame = df.select_dtypes(include=np.number)
object_frame = df.select_dtypes(include='object')
numeric_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

print(f"\nNumeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

## Explore Target Variable

In [None]:
# Absolute and relative frequency of each value in the 'label' column.
print("Label distribution:")
target = df['label']
label_counts = target.value_counts()
print(label_counts)

# Proportions are rounded to 4 decimals and then scaled to percent.
label_pct = target.value_counts(normalize=True).round(4) * 100
print(f"\nLabel percentages:", label_pct, sep="\n")

In [None]:
# Two side-by-side views of the label distribution: counts and shares.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
label_freq = df['label'].value_counts()

# Left panel: bar chart of raw counts.
label_freq.plot.bar(ax=axes[0], color='steelblue')
axes[0].set(title='Label Distribution (Count)', xlabel='Label', ylabel='Count')
axes[0].grid(axis='y')

# Right panel: pie chart with percentage annotations; the y-label is blanked.
label_freq.plot.pie(ax=axes[1], autopct='%1.1f%%')
axes[1].set(title='Label Distribution (Percentage)', ylabel='')

fig.tight_layout()
plt.show()

In [None]:
# Check class imbalance: ratio of the most frequent to the least frequent class.
# Ratios above this threshold trigger the imbalance warning.
IMBALANCE_THRESHOLD = 3

label_counts = df['label'].value_counts().sort_values()
# Ignore zero-count entries (e.g. unused categories of a categorical column)
# so the ratio never divides by zero.
present_counts = label_counts[label_counts > 0]

if len(present_counts) < 2:
    # With fewer than two observed classes a max/min ratio is meaningless;
    # previously this case printed a ratio of 1.00 and "well-balanced".
    imbalance_ratio = float('inf')
    print(f"Warning: only {len(present_counts)} class(es) present in 'label'; "
          "imbalance ratio is undefined.")
else:
    imbalance_ratio = present_counts.max() / present_counts.min()
    print(f"Class imbalance ratio: {imbalance_ratio:.2f}")

    if imbalance_ratio > IMBALANCE_THRESHOLD:
        print("Warning: Dataset is imbalanced. Consider using class weights or resampling.")
    else:
        print("Dataset is well-balanced.")

## Feature Analysis

In [None]:
# Correlation with target
if 'label' in numeric_cols:
    # Numeric label: read its column out of the pairwise correlation matrix.
    corr_with_label = df[numeric_cols].corr()['label'].drop('label').sort_values(ascending=False)
    print("Feature correlation with label:")
    print(corr_with_label)
else:
    # This branch runs exactly when the label is NOT numeric (strings, bools,
    # categories), so the raw labels cannot be handed to pointbiserialr as-is.
    # Encode them as integer codes first; missing labels get code -1.
    from scipy.stats import pointbiserialr

    label_codes, label_classes = pd.factorize(df['label'])
    corr_dict = {}

    if len(label_classes) != 2:
        # Point-biserial correlation is only defined for a dichotomous variable.
        print("Point-biserial correlation requires exactly 2 label classes; "
              f"found {len(label_classes)}. Skipping.")
        corr_series = pd.Series(corr_dict, dtype=float)
    else:
        has_label = label_codes >= 0
        for col in numeric_cols:
            values = df[col].to_numpy(dtype=float, na_value=np.nan)
            # Use only rows where both the label and the feature are present;
            # otherwise a single NaN would turn the whole coefficient into NaN.
            usable = has_label & ~np.isnan(values)
            if usable.sum() < 2:
                continue
            corr, _ = pointbiserialr(label_codes[usable], values[usable])
            # The sign depends on the arbitrary 0/1 encoding, so report magnitude.
            corr_dict[col] = abs(corr)
        corr_series = pd.Series(corr_dict, dtype=float).sort_values(ascending=False)
        print("Feature correlation with label (absolute):")
        print(corr_series)

In [None]:
# Horizontal bar chart of each numeric feature's correlation with the label.
# Drawn only when the label itself is numeric; otherwise the cell does nothing.
if 'label' in numeric_cols:
    corr_matrix = df[numeric_cols].corr()
    corr_with_label = corr_matrix['label'].drop('label').sort_values()

    fig, ax = plt.subplots(figsize=(10, 6))
    corr_with_label.plot.barh(ax=ax, color='steelblue')
    ax.set(title='Feature Correlation with Label', xlabel='Correlation Coefficient')
    ax.grid(axis='x')
    fig.tight_layout()
    plt.show()

## Clean Data

In [None]:
# Apply the project's cleaning step and report how many rows it removed.
df_clean = clean_data(df)

rows_before = len(df)
rows_removed = rows_before - len(df_clean)
# Guard the percentage against an empty input frame, which would otherwise
# raise ZeroDivisionError.
removed_pct = (rows_removed / rows_before * 100) if rows_before else 0.0

print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_clean.shape}")
print(f"Rows removed: {rows_removed}")
print(f"Rows removed %: {removed_pct:.2f}%")

## Prepare for Model Training

In [None]:
# Run the project's model-preparation helper on the cleaned frame. The result
# is indexed below by X_*/y_* split keys, 'feature_columns' and 'scaler'.
prep = preprocess_for_model(df_clean, label_col='label')

print("Preprocessing complete!")
print("\nData split information:")
# The titles are padded so the " - X:" columns line up in the output.
for split_title, split_key in (("Training  ", "train"), ("Validation", "val"), ("Test      ", "test")):
    print(f"{split_title} - X: {prep['X_' + split_key].shape}, y: {prep['y_' + split_key].shape}")
print(f"\nFeatures used: {prep['feature_columns']}")
print(f"Scaler: {type(prep['scaler']).__name__}")

In [None]:
# Sanity-check the splits: sizes should add up to the cleaned frame, and the
# per-split counts of labels 0 and 1 are printed for a quick visual check.
X_train, X_val, X_test = (prep[key] for key in ('X_train', 'X_val', 'X_test'))
y_train, y_val, y_test = (prep[key] for key in ('y_train', 'y_val', 'y_test'))

y_splits = (("Train", y_train), ("Val", y_val), ("Test", y_test))

total_samples = sum(len(split_y) for _, split_y in y_splits)
print(f"Total samples accounted for: {total_samples} (original: {len(df_clean)})")

print(f"\nTrain/Val/Test split:")
for split_name, split_y in y_splits:
    split_pct = len(split_y) / len(df_clean) * 100
    # Pad "Name:" to 6 characters so the percentages line up.
    print(f"  {(split_name + ':'):<6} {split_pct:.1f}%")

print(f"\nLabel distribution in splits:")
for split_name, split_y in y_splits:
    print(f"  {split_name:<5} - Class 0: {(split_y == 0).sum()}, Class 1: {(split_y == 1).sum()}")

In [None]:
# Per-feature statistics of the training matrix, shown for the first 5
# features only, to eyeball the effect of the scaling step.
print(f"Training data statistics (after scaling):")
for stat_label, method_name in (("Mean:", "mean"), ("Std: ", "std"), ("Min: ", "min"), ("Max: ", "max")):
    stat_values = getattr(X_train, method_name)(axis=0)[:5]
    print(f"  {stat_label} {stat_values}... (first 5 features)")

## Save Preprocessing Artifacts

In [None]:
import joblib

# Where the fitted scaler is persisted so it can be reloaded at inference time.
scaler_path = 'models/scaler_sample.pkl'

# Create the target folder (derived from the path) if it does not exist yet.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(prep['scaler'], scaler_path)

print(
    f"Scaler saved to: {scaler_path}",
    f"Feature columns: {prep['feature_columns']}",
    sep="\n",
)