# Notebook 3: Inference and Evaluation

This notebook demonstrates the final two stages of the project: generating recommended number sets using the trained ensemble model (inference) and evaluating the model's performance on unseen historical data.

### 1. Setup and Imports

In [None]:
import torch
import pandas as pd
import numpy as np
import joblib
from tqdm.notebook import tqdm
import random
import os
import sys

# Make the project's `src` package importable. '..' resolves to the parent of the
# current working directory, so this assumes the notebook is run from a
# subdirectory of the project root (e.g. notebooks/) — TODO confirm.
sys.path.append(os.path.abspath(os.path.join('..')))

# Import all necessary components from our project
# (these imports only resolve after the sys.path entry above has been added)
from src.config import CONFIG
from src.model import ScoringModel
from src.feature_engineering import FeatureEngineer
from src.temporal_scorer import TemporalScorer
from src.i_ching_scorer import IChingScorer
from src.inference_pipeline import ScorerEnsemble, local_search

print("Setup complete. Modules loaded.")

### 2. Load Trained Artifacts and Data

Before we can generate or evaluate, we need to load the saved `ScoringModel` and the fitted `FeatureEngineer`. We also load the historical data to initialize the heuristic scorers.

In [None]:
# Resolve artifact locations relative to the parent directory, where the
# CONFIG paths are anchored.
model_path = os.path.join('..', CONFIG["model_save_path"])
fe_path = os.path.join('..', CONFIG["feature_engineer_path"])

# Fail fast when artifacts are missing. Merely printing a message would let the
# following cells run and crash later with an unrelated NameError on `df`/`model`.
missing_artifacts = [path for path in (model_path, fe_path) if not os.path.exists(path)]
if missing_artifacts:
    raise FileNotFoundError(
        "Model artifacts not found! Please run the training pipeline first. "
        f"Missing: {missing_artifacts}"
    )

# Load artifacts
device = "cuda" if torch.cuda.is_available() else "cpu"
CONFIG['device'] = device

# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load artifacts written by this project's own training pipeline.
feature_engineer = joblib.load(fe_path)
# Derive the feature dimension from the fitted FeatureEngineer by transforming a
# dummy number set; ScoringModel reads it from CONFIG when it is constructed.
CONFIG['d_features'] = len(feature_engineer.transform([1,2,3,4,5,6], 0))

model = ScoringModel(CONFIG).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()  # inference mode: this notebook never trains the model

# Load data for scorers
col_names = [
    'Draw', 'Date', 'Winning_Num_1', 'Winning_Num_2', 'Winning_Num_3',
    'Winning_Num_4', 'Winning_Num_5', 'Winning_Num_6', 'Extra_Num',
    'From_Last', 'Low', 'High', 'Odd', 'Even', '1-10', '11-20', '21-30',
    '31-40', '41-50', 'Div_1_Winners', 'Div_1_Prize', 'Div_2_Winners',
    'Div_2_Prize', 'Div_3_Winners', 'Div_3_Prize', 'Div_4_Winners',
    'Div_4_Prize', 'Div_5_Winners', 'Div_5_Prize', 'Div_6_Winners',
    'Div_6_Prize', 'Div_7_Winners', 'Div_7_Prize', 'Turnover'
]
data_path = os.path.join('..', CONFIG["data_path"])
# NOTE(review): skiprows=33 presumably skips a preamble in the raw file and must
# match the preprocessing used during training — confirm.
df = pd.read_csv(data_path, header=None, skiprows=33, names=col_names)
df['Date'] = pd.to_datetime(df['Date'])
# Chronological order matters: the evaluation cell takes the tail as validation data.
df = df.sort_values(by='Date').reset_index(drop=True)

print("Model, FeatureEngineer, and data loaded successfully.")

### 3. Inference: Generating Number Sets

Here, we'll initialize the full `ScorerEnsemble` and use the `local_search` algorithm to generate two recommended sets: one scored without the I-Ching scorer and one scored with it.

In [None]:
def _generate_recommendation(ensemble, set_size=6):
    """Run one local search from a random starting set.

    Args:
        ensemble: Scorer passed to `local_search` to rank candidate sets.
        set_size: How many distinct numbers to draw for the starting set.

    Returns:
        A tuple `(initial_set, best_set, best_score)`, where `initial_set` is the
        sorted random starting set and the other two values are whatever
        `local_search` returned for it.
    """
    initial_set = sorted(random.sample(range(1, CONFIG['num_lotto_numbers'] + 1), set_size))
    best_set, best_score = local_search(
        initial_set, ensemble,
        max_iterations=CONFIG['search_iterations'],
        num_neighbors=CONFIG['search_neighbors']
    )
    return initial_set, best_set, best_score


# Initialize the heuristic scorers
temporal_scorer = TemporalScorer(CONFIG)
temporal_scorer.fit(df)
i_ching_scorer = IChingScorer(CONFIG)

# --- GENERATE WITHOUT I-CHING ---
print("--- Generating with DL + Temporal Ensemble ---")
ensemble_no_iching = ScorerEnsemble(model, feature_engineer, temporal_scorer, i_ching_scorer, df, CONFIG, use_i_ching=False)
initial_set_1, best_set_1, best_score_1 = _generate_recommendation(ensemble_no_iching)
print(f"Generated Set: {best_set_1} (Score: {best_score_1:.4f})\n")

# --- GENERATE WITH I-CHING ---
# This ensemble is kept under its own name because the evaluation step reuses it.
print("--- Generating with DL + Temporal + I-Ching Ensemble ---")
ensemble_with_iching = ScorerEnsemble(model, feature_engineer, temporal_scorer, i_ching_scorer, df, CONFIG, use_i_ching=True)
initial_set_2, best_set_2, best_score_2 = _generate_recommendation(ensemble_with_iching)
print(f"Generated Set: {best_set_2} (Score: {best_score_2:.4f})")

### 4. Evaluation: Measuring Model Performance

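Finally, we run the evaluation logic to see how well the full ensemble (DL + Temporal + I-Ching) performs on the validation set, i.e. the most recent 15% of draws. We calculate the "win rate" — the percentage of pairwise comparisons in which the ensemble scores a real winning set higher than a randomly generated set. A win rate above 50% (the level expected from chance) indicates positive performance.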

In [None]:
# Use the same ScorerEnsemble instance from the inference step
print("Evaluating the model on the validation set...")

# NOTE(review): `temporal_scorer` was fit on the full `df` and the ensemble was
# built with it, so the validation draws below were visible to the heuristic
# scorers. This may inflate the win rate — confirm whether that is intended.

# Split data: the chronologically last 15% of draws form the validation set.
# reset_index() keeps the original row position in an 'index' column.
train_size = int(len(df) * 0.85)
val_df = df.iloc[train_size:].reset_index()

wins = 0
total_comparisons = 0
winning_num_cols = [f'Winning_Num_{i}' for i in range(1, 7)]

# Note: Using a smaller number of negative samples for faster notebook execution
evaluation_neg_samples = 19 # Compare against 19 random sets

# Dedicated seeded generator so the sampled negative sets (and therefore the
# reported win rate) are reproducible across runs, without touching the global
# `random` state used elsewhere in the notebook.
EVAL_SEED = 42
eval_rng = random.Random(EVAL_SEED)

# Loop-invariant values hoisted out of the sampling loop.
number_pool = range(1, CONFIG['num_lotto_numbers'] + 1)
historical_sets = feature_engineer.historical_sets

# Extract all winning sets in one vectorised step instead of building a Series
# per row with iterrows(); each entry is a list of six Python ints.
winning_sets = val_df[winning_num_cols].astype(int).to_numpy().tolist()

for positive_set in tqdm(winning_sets, total=len(winning_sets), desc="Evaluating Draws"):
    positive_score = ensemble_with_iching.score(positive_set)

    for _ in range(evaluation_neg_samples):
        # Rejection-sample until the random set is not a known historical draw.
        while True:
            negative_set = sorted(eval_rng.sample(number_pool, 6))
            if tuple(negative_set) not in historical_sets:
                break

        negative_score = ensemble_with_iching.score(negative_set)

        # Ties count against the model: only a strictly higher score is a win.
        if positive_score > negative_score:
            wins += 1
        total_comparisons += 1

# Guard against an empty validation set to avoid dividing by zero.
win_rate = (wins / total_comparisons) * 100 if total_comparisons > 0 else 0

print("\n--- Evaluation Results ---")
print(f"Positive (Winning) sets ranked higher: {wins}/{total_comparisons}")
print(f"Model Win Rate: {win_rate:.2f}%")
print("----------------------------")