In [1]:
# Import cell
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
import random

# Configuration cell
INPUT_CONFIG = {
    'transition_matrix_path': Path('data/matrix/transition_matrix.pkl'),
    'features_path': Path('data/features/binned_features.pkl'),
    'output_path': Path('data/predictions'),
    'output_filename': 'predictions.pkl',
    'n_steps_ahead': 3,
    'display_count': 5
}

# Class Definitions
class TransitionMatrix:
    def __init__(self):
        self.state_map = {}
        self.reverse_map = {}
        self.next_state_id = 0
        self.counts = None
        self.probability_matrix = None
    
    def get_state_id(self, features: Tuple) -> int:
        if features not in self.state_map:
            self.state_map[features] = self.next_state_id
            self.reverse_map[self.next_state_id] = features
            self.next_state_id += 1
        return self.state_map[features]

class BayesianPredictor:
    def __init__(self, transition_matrix):
        self.tm = transition_matrix
        
    def predict_next_n_states(self, current_state: int, n_steps: int = 1) -> np.ndarray:
        current_dist = np.zeros(len(self.tm.state_map))
        current_dist[current_state] = 1.0
        
        for _ in range(n_steps):
            current_dist = current_dist @ self.tm.probability_matrix
            
        return current_dist
    
    def get_top_k_states(self, probabilities: np.ndarray, k: int = 3) -> List[Tuple[int, float]]:
        top_indices = np.argsort(probabilities)[-k:][::-1]
        return [(idx, probabilities[idx]) for idx in top_indices]

def predict_linear_region(current_state, pattern_analysis, predictor):
    current_features = predictor.tm.reverse_map[current_state]
    current_pattern = (current_features[1], current_features[2])  # duration, slope
    
    predicted_probs = predictor.predict_next_n_states(current_state)
    initial_predictions = predictor.get_top_k_states(predicted_probs)
    
    weighted_predictions = []
    for state_id, prob in initial_predictions:
        weight = 1.0
        
        if current_pattern in pattern_analysis['success_rate']:
            pattern_stats = pattern_analysis['success_rate'][current_pattern]
            success_rate = pattern_stats['success'] / pattern_stats['total']
            weight = weight * (1 + success_rate)
            
        weighted_predictions.append((state_id, prob * weight))
    
    total_weight = sum(w for _, w in weighted_predictions)
    return [(s, w/total_weight) for s, w in weighted_predictions]

def predict_transition_region(current_state, pattern_analysis, predictor):
    predicted_probs = predictor.predict_next_n_states(current_state)
    predictions = predictor.get_top_k_states(predicted_probs)
    
    # Focus on transitions to linear regions
    weighted_predictions = []
    for state_id, prob in predictions:
        next_features = predictor.tm.reverse_map[state_id]
        # Favor transitions back to linear regions
        weight = 1.5 if next_features[3] == 0 else 1.0
        weighted_predictions.append((state_id, prob * weight))
    
    total_weight = sum(w for _, w in weighted_predictions)
    return [(s, w/total_weight) for s, w in weighted_predictions]

def predict_with_region_awareness(current_state, pattern_analysis, predictor):
    current_features = predictor.tm.reverse_map[current_state]
    is_transition = current_features[3] > 0  # Check curvature
    
    if is_transition:
        return predict_transition_region(current_state, pattern_analysis, predictor)
    else:
        return predict_linear_region(current_state, pattern_analysis, predictor)

def make_predictions(predictor: BayesianPredictor, binned_features: Dict, n_steps: int) -> Dict:
    initial_predictions = make_initial_predictions(predictor, binned_features, n_steps)
    pattern_analysis = analyze_duration_slope_patterns(initial_predictions, predictor)
    
    predictions = {}
    
    for series_id, features in binned_features.items():
        series_predictions = []
        
        for i in range(len(features) - n_steps):
            current_state = predictor.tm.get_state_id((
                features[i]['eodb_bin'],
                features[i]['duration_bin'],
                features[i]['slope_bin'],
                features[i]['curvature_bin']
            ))
            
            actual_future_state = predictor.tm.get_state_id((
                features[i + n_steps]['eodb_bin'],
                features[i + n_steps]['duration_bin'],
                features[i + n_steps]['slope_bin'],
                features[i + n_steps]['curvature_bin']
            ))
            
            top_predictions = predict_with_region_awareness(
                current_state, 
                pattern_analysis,
                predictor
            )
            
            series_predictions.append({
                'current_state': current_state,
                'actual_future_state': actual_future_state,
                'top_predictions': top_predictions
            })
            
        predictions[series_id] = series_predictions
    
    return predictions

def analyze_duration_slope_patterns(predictions, predictor):
    patterns = {
        'transitions': [],
        'success_rate': {},
        'common_errors': [],
        'linear_patterns': {},
        'transition_patterns': {}
    }
    
    for series_id, series_preds in predictions.items():
        for pred in series_preds:
            current = predictor.tm.reverse_map[pred['current_state']]
            actual = predictor.tm.reverse_map[pred['actual_future_state']]
            predicted = predictor.tm.reverse_map[pred['top_predictions'][0][0]]
            
            is_transition = current[3] > 0
            pattern_dict = patterns['transition_patterns'] if is_transition else patterns['linear_patterns']
            
            transition = {
                'from_duration': current[1],
                'from_slope': current[2],
                'to_duration': actual[1],
                'to_slope': actual[2],
                'predicted_duration': predicted[1],
                'predicted_slope': predicted[2],
                'success': (actual[1] == predicted[1] and actual[2] == predicted[2])
            }
            patterns['transitions'].append(transition)
            
            pattern_key = (current[1], current[2])
            if pattern_key not in pattern_dict:
                pattern_dict[pattern_key] = {'total': 0, 'success': 0}
            pattern_dict[pattern_key]['total'] += 1
            if transition['success']:
                pattern_dict[pattern_key]['success'] += 1
            
            if not transition['success']:
                error = {
                    'actual': (actual[1], actual[2]),
                    'predicted': (predicted[1], predicted[2]),
                    'confidence': pred['top_predictions'][0][1]
                }
                patterns['common_errors'].append(error)
    
    return patterns

def make_initial_predictions(predictor: BayesianPredictor, binned_features: Dict, n_steps: int) -> Dict:
    predictions = {}
    
    for series_id, features in binned_features.items():
        series_predictions = []
        
        for i in range(len(features) - n_steps):
            current_state = predictor.tm.get_state_id((
                features[i]['eodb_bin'],
                features[i]['duration_bin'],
                features[i]['slope_bin'],
                features[i]['curvature_bin']
            ))
            
            actual_future_state = predictor.tm.get_state_id((
                features[i + n_steps]['eodb_bin'],
                features[i + n_steps]['duration_bin'],
                features[i + n_steps]['slope_bin'],
                features[i + n_steps]['curvature_bin']
            ))
            
            predicted_probs = predictor.predict_next_n_states(current_state, n_steps)
            top_predictions = predictor.get_top_k_states(predicted_probs)
            
            series_predictions.append({
                'current_state': current_state,
                'actual_future_state': actual_future_state,
                'predicted_probabilities': predicted_probs,
                'top_predictions': top_predictions
            })
            
        predictions[series_id] = series_predictions
    
    return predictions


def analyze_feature_importance(predictions, predictor):
    feature_accuracy = {
        'eodb': {'linear': [], 'transition': []},
        'duration': {'linear': [], 'transition': []},
        'slope': {'linear': [], 'transition': []},
        'curvature': {'linear': [], 'transition': []}
    }
    
    for series_id, series_preds in predictions.items():
        for pred in series_preds:
            actual = pred['actual_future_state']
            predicted = pred['top_predictions'][0][0]
            
            actual_features = predictor.tm.reverse_map[actual]
            predicted_features = predictor.tm.reverse_map[predicted]
            
            region_type = 'transition' if actual_features[3] > 0 else 'linear'
            
            feature_accuracy['eodb'][region_type].append(actual_features[0] == predicted_features[0])
            feature_accuracy['duration'][region_type].append(actual_features[1] == predicted_features[1])
            feature_accuracy['slope'][region_type].append(actual_features[2] == predicted_features[2])
            feature_accuracy['curvature'][region_type].append(actual_features[3] == predicted_features[3])
    
    return {k: {region: np.mean(vals) for region, vals in v.items()} 
            for k, v in feature_accuracy.items()}

# Execution Block
tm_data = pd.read_pickle(INPUT_CONFIG['transition_matrix_path'])
binned_features = pd.read_pickle(INPUT_CONFIG['features_path'])
predictor = BayesianPredictor(tm_data['transition_matrix'])
predictions = make_predictions(predictor, binned_features, INPUT_CONFIG['n_steps_ahead'])

# Analysis and Reporting
print("\nDiagnostic Information:")
series_id = list(predictions.keys())[0]
series_predictions = predictions[series_id]
y_points = [feature['raw_features']['eodb_level'] for feature in binned_features[series_id]]
print("Sample Original EODB values:", y_points[:5])

sample_pred = series_predictions[0]
print("\nSample Prediction:")
print("Current State EODB:", predictor.tm.reverse_map[sample_pred['current_state']][0])
print("Actual Future EODB:", predictor.tm.reverse_map[sample_pred['actual_future_state']][0])
print("Top Predicted States EODB values:", [predictor.tm.reverse_map[state_id][0] for state_id, prob in sample_pred['top_predictions']])

# Feature Importance Analysis
feature_importance = analyze_feature_importance(predictions, predictor)
print("\nFeature Importance Analysis by Region Type:")
for feature, region_accuracy in feature_importance.items():
    print(f"\n{feature.capitalize()}:")
    for region, accuracy in region_accuracy.items():
        print(f"  {region.capitalize()} region accuracy: {accuracy:.2%}")

# Pattern Analysis
pattern_analysis = analyze_duration_slope_patterns(predictions, predictor)
print("\nDuration-Slope Pattern Analysis:")
print("\nLinear Region Patterns:")
for pattern, stats in sorted(pattern_analysis['linear_patterns'].items(), 
                           key=lambda x: x[1]['success']/x[1]['total'] if x[1]['total'] > 0 else 0, 
                           reverse=True)[:5]:
    success_rate = stats['success']/stats['total'] if stats['total'] > 0 else 0
    print(f"Duration: {pattern[0]}, Slope: {pattern[1]}")
    print(f"Success Rate: {success_rate:.2%} ({stats['success']}/{stats['total']} predictions)")

print("\nTransition Region Patterns:")
for pattern, stats in sorted(pattern_analysis['transition_patterns'].items(), 
                           key=lambda x: x[1]['success']/x[1]['total'] if x[1]['total'] > 0 else 0, 
                           reverse=True)[:5]:
    success_rate = stats['success']/stats['total'] if stats['total'] > 0 else 0
    print(f"Duration: {pattern[0]}, Slope: {pattern[1]}")
    print(f"Success Rate: {success_rate:.2%} ({stats['success']}/{stats['total']} predictions)")

# Save predictions
INPUT_CONFIG['output_path'].mkdir(exist_ok=True, parents=True)
output_file = INPUT_CONFIG['output_path'] / INPUT_CONFIG['output_filename']
pd.to_pickle(predictions, output_file)

print("\nPredictions completed and saved!")




Diagnostic Information:
Sample Original EODB values: [185.21708367541348, 170.32868592710147, 192.69222167317545, 298.3302093157721, 363.09006849715695]

Sample Prediction:
Current State EODB: 47
Actual Future EODB: 48
Top Predicted States EODB values: [46, 46, 46]

Feature Importance Analysis by Region Type:

Eodb:
  Linear region accuracy: 32.23%
  Transition region accuracy: 41.67%

Duration:
  Linear region accuracy: 10.26%
  Transition region accuracy: 0.00%

Slope:
  Linear region accuracy: 6.96%
  Transition region accuracy: 0.00%

Curvature:
  Linear region accuracy: 98.90%
  Transition region accuracy: 0.00%

Duration-Slope Pattern Analysis:

Linear Region Patterns:
Duration: 39, Slope: 32
Success Rate: 25.00% (1/4 predictions)
Duration: 39, Slope: 30
Success Rate: 0.00% (0/3 predictions)
Duration: 38, Slope: 31
Success Rate: 0.00% (0/3 predictions)
Duration: 41, Slope: 35
Success Rate: 0.00% (0/7 predictions)
Duration: 42, Slope: 35
Success Rate: 0.00% (0/5 predictions)

Tra