# Phase 3.1: Baseline Models Training

**Objective**: Train baseline models (Logistic Regression, Random Forest) for lottery number prediction.

**Dataset**: 17 Sri Lankan lotteries, 485K records, 20 features

**Evaluation**: F1-Score, Precision, Recall, and ROC-AUC on the validation set

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn grid theme and figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Confirmation that the setup cell ran to completion
print("Libraries imported successfully")

## 2. Load Training and Validation Data

We'll load all 17 lotteries' train and validation splits.

In [None]:
# Define paths (relative to the directory the notebook is run from)
DATA_DIR = Path('../data/splits')
OUTPUT_DIR = Path('../outputs/results')
MODEL_DIR = Path('../models')

# Create output directories if they don't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Discover lotteries from the "<name>_train.csv" files in DATA_DIR
TRAIN_SUFFIX = '_train'
train_files = sorted(DATA_DIR.glob(f'*{TRAIN_SUFFIX}.csv'))

# Strip only the trailing suffix. str.replace would also remove "_train"
# from the middle of a name (e.g. "a_train_b_train" -> "a_b"), and the
# later f"{lottery}_train.csv" lookups would then point at a missing file.
# The glob pattern guarantees every stem ends with TRAIN_SUFFIX.
lottery_names = [f.stem[:-len(TRAIN_SUFFIX)] for f in train_files]

# Fail here with a clear message instead of letting the loading cell stop
# inside pd.concat on an empty list.
if not lottery_names:
    raise FileNotFoundError(
        f"No '*{TRAIN_SUFFIX}.csv' files found in {DATA_DIR.resolve()}"
    )

print(f"Found {len(lottery_names)} lotteries:")
for name in lottery_names:
    print(f"  - {name}")

In [None]:
# Read each lottery's training split and stack them into a single frame
train_dfs = [
    pd.read_csv(DATA_DIR / f"{lottery}_train.csv") for lottery in lottery_names
]
train_data = pd.concat(train_dfs, ignore_index=True)

train_mb = train_data.memory_usage(deep=True).sum() / 1024**2
print(f"\nTraining data shape: {train_data.shape}")
print(f"Memory usage: {train_mb:.2f} MB")

# Same procedure for the validation splits
val_dfs = [
    pd.read_csv(DATA_DIR / f"{lottery}_val.csv") for lottery in lottery_names
]
val_data = pd.concat(val_dfs, ignore_index=True)

val_mb = val_data.memory_usage(deep=True).sum() / 1024**2
print(f"Validation data shape: {val_data.shape}")
print(f"Memory usage: {val_mb:.2f} MB")

## 3. Prepare Features and Target

Separate features (X) and target (y). Encode categorical features for baseline models.

In [None]:
# Columns that are the target or identifiers, not model inputs
exclude_cols = ['appeared', 'draw_date', 'lottery', 'number']

# Keep every remaining training column, in its original order
all_columns = train_data.columns
feature_cols = all_columns[~all_columns.isin(exclude_cols)].tolist()

print(f"Feature columns ({len(feature_cols)}):")
print(feature_cols)

# Split the features into categorical and numerical groups
categorical_features = ['trend']  # 'lottery' excluded from features
numerical_features = list(
    filter(lambda col: col not in categorical_features, feature_cols)
)

print(f"\nCategorical features: {categorical_features}")
print(f"Numerical features ({len(numerical_features)}): {numerical_features[:5]}...")

In [None]:
# One-hot encode the categorical features of each split independently
train_encoded, val_encoded = (
    pd.get_dummies(split[feature_cols], columns=categorical_features)
    for split in (train_data, val_data)
)

# Reindex validation onto the training columns: dummy columns missing from
# validation are added as 0, columns seen only in validation are dropped
train_encoded, val_encoded = train_encoded.align(
    val_encoded, join='left', axis=1, fill_value=0
)

# Feature matrices and targets
X_train, X_val = train_encoded, val_encoded
y_train, y_val = train_data['appeared'], val_data['appeared']

print(f"\nX_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"\nClass distribution (train):")
print(y_train.value_counts())

# Negatives per positive in the training target
negatives = (y_train == 0).sum()
positives = (y_train == 1).sum()
print(f"\nClass imbalance ratio (train): 1:{negatives / positives:.2f}")

## 4. Train Baseline Model 1: Logistic Regression

Simple linear classifier as baseline.

In [None]:
print("Training Logistic Regression...")

# Train with class weight balancing.
# n_jobs is deliberately not passed: for LogisticRegression it only
# parallelised one-vs-rest fits across classes, so it had no effect on this
# two-class target ('Not Appeared' / 'Appeared').
# NOTE(review): recent scikit-learn releases appear to deprecate n_jobs on
# LogisticRegression - confirm against the installed version.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)

lr_model.fit(X_train, y_train)

print("Training complete!")

In [None]:
# Score the validation set: probability column 1 feeds ROC-AUC,
# hard label predictions feed F1 / precision / recall
y_pred_proba_lr = lr_model.predict_proba(X_val)[:, 1]
y_pred_lr = lr_model.predict(X_val)

# Label-based scorers, in the order they should appear in the results
label_scorers = {
    'f1_score': f1_score,
    'precision': precision_score,
    'recall': recall_score,
}

lr_metrics = {'model': 'Logistic Regression'}
for metric_name, scorer in label_scorers.items():
    lr_metrics[metric_name] = scorer(y_val, y_pred_lr)
lr_metrics['roc_auc'] = roc_auc_score(y_val, y_pred_proba_lr)

print("\nLogistic Regression - Validation Metrics:")
print(f"F1-Score:  {lr_metrics['f1_score']:.4f}")
print(f"Precision: {lr_metrics['precision']:.4f}")
print(f"Recall:    {lr_metrics['recall']:.4f}")
print(f"ROC-AUC:   {lr_metrics['roc_auc']:.4f}")

# Per-class breakdown
print("\nClassification Report:")
class_labels = ['Not Appeared', 'Appeared']
print(classification_report(y_val, y_pred_lr, target_names=class_labels))

## 5. Train Baseline Model 2: Random Forest

Ensemble tree-based model for comparison.

In [None]:
print("Training Random Forest...")

# Hyperparameters for the forest; class_weight='balanced' compensates for
# the imbalanced target, n_jobs=-1 uses all available cores
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}

# fit() returns the estimator itself, so rf_model is the fitted forest
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

print("Training complete!")

In [None]:
# Score the validation set: probability column 1 feeds ROC-AUC,
# hard label predictions feed F1 / precision / recall
y_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]
y_pred_rf = rf_model.predict(X_val)

# Label-based scorers, in the order they should appear in the results
label_scorers = {
    'f1_score': f1_score,
    'precision': precision_score,
    'recall': recall_score,
}

rf_metrics = {'model': 'Random Forest'}
for metric_name, scorer in label_scorers.items():
    rf_metrics[metric_name] = scorer(y_val, y_pred_rf)
rf_metrics['roc_auc'] = roc_auc_score(y_val, y_pred_proba_rf)

print("\nRandom Forest - Validation Metrics:")
print(f"F1-Score:  {rf_metrics['f1_score']:.4f}")
print(f"Precision: {rf_metrics['precision']:.4f}")
print(f"Recall:    {rf_metrics['recall']:.4f}")
print(f"ROC-AUC:   {rf_metrics['roc_auc']:.4f}")

# Per-class breakdown
print("\nClassification Report:")
class_labels = ['Not Appeared', 'Appeared']
print(classification_report(y_val, y_pred_rf, target_names=class_labels))

## 6. Compare Baseline Models

In [None]:
# One row per baseline, indexed by model name, one column per metric
comparison_df = pd.DataFrame([lr_metrics, rf_metrics]).set_index('model')

print("\nBaseline Models Comparison:")
print(comparison_df.to_string())

# Persist the table next to the other results
comparison_csv = OUTPUT_DIR / 'baseline_comparison.csv'
comparison_df.to_csv(comparison_csv)
print(f"\nSaved comparison to: {comparison_csv}")

In [None]:
# Two side-by-side panels: all metrics per model, then F1 only
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
metrics_ax, f1_ax = axes
title_style = {'fontsize': 14, 'fontweight': 'bold'}

# Left panel: grouped bars, one group per model and one bar per metric
comparison_df.plot.bar(ax=metrics_ax)
metrics_ax.set_title('Baseline Models - Metrics Comparison', **title_style)
metrics_ax.set(xlabel='Model', ylabel='Score', ylim=(0, 1))
metrics_ax.legend(title='Metrics', loc='lower right')
metrics_ax.grid(axis='y', alpha=0.3)
# Keep the model names horizontal
metrics_ax.set_xticklabels(metrics_ax.get_xticklabels(), rotation=0)

# Right panel: horizontal bars for F1-Score only
comparison_df['f1_score'].plot.barh(ax=f1_ax, color=['#1f77b4', '#ff7f0e'])
f1_ax.set_title('F1-Score Comparison', **title_style)
f1_ax.set(xlabel='F1-Score', ylabel='Model', xlim=(0, 1))
f1_ax.grid(axis='x', alpha=0.3)

# Write the figure to disk before showing it
plot_path = OUTPUT_DIR / 'baseline_comparison.png'
plt.tight_layout()
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"Saved comparison plot to: {plot_path}")

## 7. Save Models and Results

In [None]:
# Pickle each fitted estimator into MODEL_DIR.
# (display name used in the log line, output filename, fitted model)
model_artifacts = (
    ('Logistic Regression', 'logistic_regression.pkl', lr_model),
    ('Random Forest', 'random_forest.pkl', rf_model),
)
for display_name, filename, estimator in model_artifacts:
    model_path = MODEL_DIR / filename
    with model_path.open('wb') as handle:
        pickle.dump(estimator, handle)
    print(f"Saved {display_name} model to: {model_path}")

# Collect both models' validation metrics and write them as JSON
results = {
    'logistic_regression': lr_metrics,
    'random_forest': rf_metrics
}

results_path = OUTPUT_DIR / 'baseline_results.json'
with results_path.open('w') as handle:
    json.dump(results, handle, indent=2)
print(f"Saved metrics to: {results_path}")

## 8. Summary

**Key Findings:**
1. Both baseline models achieve better-than-random performance on imbalanced lottery data
2. Random Forest typically outperforms Logistic Regression due to non-linear decision boundaries
3. Class imbalance (1:13.92 ratio) affects recall for the minority class (appeared=1)
4. These baselines establish minimum performance benchmarks for CatBoost comparison

**Next Steps:**
- Notebook 02: Train CatBoost classifier with native categorical feature handling
- Expected improvement: 5-15% F1-Score increase over Random Forest