# Workshop 4: ML Simulation

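This notebook demonstrates the Data-driven ML Simulation (Scenario 1). It loads the experiment configuration, ingests and preprocesses the census data, builds features, trains a random-forest model, and evaluates it on a held-out 20% test split.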

In [None]:
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.ingestion import load_data
from src.preprocessing import preprocess_data
from src.features import create_features
from src.models import train_model, evaluate_model

%matplotlib inline

## Load Configuration

In [None]:
# Load the experiment settings from the YAML config. The encoding is pinned to
# UTF-8 so the file is decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open('../config/ml_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
# Override the configured data path with one that resolves from this notebook.
config['data_path'] = '../src/Data/census_starter.csv' # Adjust path for notebook

## Data Ingestion, Preprocessing & Feature Engineering

In [None]:
# Stage 1: read the dataset from the path set in the configuration.
census_raw = load_data(config['data_path'])

# Stage 2: apply the project's preprocessing, driven by the same config.
census_clean = preprocess_data(census_raw, config)

# Stage 3: build the feature table; later cells refer to it as `df`.
df = create_features(census_clean)

# Preview the first rows of the prepared frame.
df.head()

## Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Preferred target column. When it is absent, the last column is used as a
# best-effort fallback; the substitution is printed so the model is never
# silently trained on an unexpected column.
target_col = 'microbusiness_density'
if target_col not in df.columns:
    fallback_col = df.columns[-1]
    print(f"Target column '{target_col}' not found; using last column '{fallback_col}' as the target.")
    target_col = fallback_col

# Every column other than the target is passed to the model as a feature.
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for evaluation; the seed comes from the config so
# the split is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=config['random_seed']
)

# Fit the model through the project's train_model helper, requesting the
# 'random_forest' model type.
model = train_model(X_train, y_train, config, model_type='random_forest')

## Evaluation

In [None]:
# Score the fitted model on the held-out split; the helper's result is
# unpacked into the metrics and the test-set predictions.
metrics, preds = evaluate_model(model, X_test, y_test)
print(metrics)

# Actual-vs-predicted scatter drawn on an explicit Axes, with axis labels so
# the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=preds, ax=ax)

# Dashed red y = x reference line across the observed target range; points on
# this line are predictions that match the actual value exactly.
y_min, y_max = y_test.min(), y_test.max()
ax.plot([y_min, y_max], [y_min, y_max], 'r--')

ax.set(title="Actual vs Predicted", xlabel="Actual", ylabel="Predicted")
plt.show()