# base

# In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Global matplotlib defaults applied to every figure created below.
plt.rcParams.update({
    'font.size': 12,
    'figure.figsize': (12, 8),
})

def _split_by_company(df, train_frac=0.7, val_frac=0.15):
    """Split ``df`` into train/validation/test frames, chronologically per company.

    Each company's rows are sorted by ``Date``; the first ``train_frac`` of
    them go to the training frame, the next ``val_frac`` to validation and the
    remainder to test.

    The combined training frame is additionally sorted by ``Date`` so that an
    index-based ``TimeSeriesSplit`` over it produces folds that are ordered in
    time rather than ordered by company. Validation and test frames keep the
    company-by-company order.

    NOTE(review): assumes ``Date`` values sort chronologically (datetimes or
    ISO-formatted strings) -- confirm against the data file.

    Raises:
        ValueError: if ``df`` has no companies to split.
    """
    train_parts, val_parts, test_parts = [], [], []

    for company in df['Company'].unique():
        company_rows = df[df['Company'] == company].sort_values('Date')

        n_rows = len(company_rows)
        train_end = int(train_frac * n_rows)
        val_end = train_end + int(val_frac * n_rows)

        train_parts.append(company_rows.iloc[:train_end])
        val_parts.append(company_rows.iloc[train_end:val_end])
        test_parts.append(company_rows.iloc[val_end:])

    if not train_parts:
        raise ValueError("df contains no rows to split into train/validation/test sets")

    # Stable sort: rows sharing a date keep their company order, so the result
    # is deterministic.
    train_data = (pd.concat(train_parts, ignore_index=True)
                  .sort_values('Date', kind='stable')
                  .reset_index(drop=True))
    val_data = pd.concat(val_parts, ignore_index=True)
    test_data = pd.concat(test_parts, ignore_index=True)
    return train_data, val_data, test_data


def _preprocess_data(train_df, val_df, test_df, target_col, continuous_features, binary_features):
    """Drop incomplete rows and build feature matrices for the three splits.

    Rows with a missing value in any feature or in ``target_col`` are dropped.
    Continuous features are standardized with a ``StandardScaler`` fitted on
    the training split only; binary features are appended unscaled, so the
    column order of every matrix is ``continuous_features + binary_features``.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler)``.
    """
    required_columns = continuous_features + binary_features + [target_col]
    scaler = StandardScaler()

    prepared = []
    for position, frame in enumerate((train_df, val_df, test_df)):
        frame = frame.dropna(subset=required_columns)
        if position == 0:
            # Scaling statistics come from the training split only.
            continuous_scaled = scaler.fit_transform(frame[continuous_features])
        else:
            continuous_scaled = scaler.transform(frame[continuous_features])
        features = np.hstack([continuous_scaled, frame[binary_features].values])
        prepared.append((features, frame[target_col]))

    (X_train, y_train), (X_val, y_val), (X_test, y_test) = prepared
    return X_train, y_train, X_val, y_val, X_test, y_test, scaler


def _evaluate_model(y_true, y_pred, y_prob, model_name, title_suffix):
    """Compute classification metrics and show them as a heatmap and a table.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        y_prob: predicted score for class 1, used for the AUC.
        model_name: label used at the start of both plot titles.
        title_suffix: label inserted after "XGBoost" in both plot titles.

    Returns:
        dict with the confusion matrix, macro F1, accuracy, AUC and the
        per-class precision/recall/F1 for classes 0 and 1.
    """
    cm = confusion_matrix(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)
    precision_per_class = precision_score(y_true, y_pred, average=None)
    recall_per_class = recall_score(y_true, y_pred, average=None)
    f1_per_class = f1_score(y_true, y_pred, average=None)

    # Confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.title(f'{model_name} XGBoost {title_suffix} - Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # Performance metrics table
    plt.figure(figsize=(10, 8))
    plt.axis('tight')
    plt.axis('off')

    table_data = [
        ['Metric', 'Value'],
        ['Macro F1 Score', f'{f1_macro:.4f}'],
        ['Overall Accuracy', f'{accuracy:.4f}'],
        ['AUC', f'{auc:.4f}'],
        ['', ''],
        ['Class 1 Precision', f'{precision_per_class[1]:.4f}'],
        ['Class 1 Recall', f'{recall_per_class[1]:.4f}'],
        ['Class 1 F1', f'{f1_per_class[1]:.4f}'],
        ['', ''],
        ['Class 0 Precision', f'{precision_per_class[0]:.4f}'],
        ['Class 0 Recall', f'{recall_per_class[0]:.4f}'],
        ['Class 0 F1', f'{f1_per_class[0]:.4f}']
    ]

    table = plt.table(cellText=table_data[1:], colLabels=table_data[0],
                      cellLoc='center', loc='center', colWidths=[0.4, 0.3])
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.5, 2)

    # Table row 0 is the header row, so table row i lines up with table_data[i].
    for row_idx, row in enumerate(table_data):
        for col_idx in range(len(row)):
            cell = table[(row_idx, col_idx)]
            if row_idx == 0:
                cell.set_facecolor('#4CAF50')
                cell.set_text_props(weight='bold', color='white')
            elif row[0] == '':
                # Spacer rows are drawn white-on-white.
                cell.set_facecolor('#ffffff')
                cell.set_text_props(color='white')
            else:
                cell.set_facecolor('#f0f0f0')

    plt.title(f'{model_name} XGBoost {title_suffix} - Performance Metrics',
              fontsize=16, fontweight='bold', pad=20)
    plt.show()

    return {'confusion_matrix': cm, 'f1_macro': f1_macro, 'accuracy': accuracy, 'auc': auc,
            'class_0_precision': precision_per_class[0], 'class_0_recall': recall_per_class[0],
            'class_0_f1': f1_per_class[0],
            'class_1_precision': precision_per_class[1], 'class_1_recall': recall_per_class[1],
            'class_1_f1': f1_per_class[1]}


def _generate_shap_analysis(model, X_test, feature_names, model_name):
    """Plot and print a SHAP analysis of ``model`` on ``X_test``.

    Produces a SHAP summary plot, a waterfall plot for the first row of
    ``X_test``, a printed feature ranking and a horizontal bar chart. The
    ranking and chart are ordered by mean absolute SHAP value but display the
    signed mean SHAP value.

    Returns:
        The SHAP values returned by ``TreeExplainer.shap_values(X_test)``.
    """
    from matplotlib.patches import Patch

    print(f"\nGenerating SHAP analysis for {model_name}...")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # SHAP summary plot
    plt.figure(figsize=(12, 8))
    shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
    plt.title(f'SHAP Summary Plot - {model_name}', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    # SHAP waterfall plot for the first test sample
    plt.figure(figsize=(12, 8))
    explanation = shap.Explanation(
        values=shap_values[0],
        base_values=explainer.expected_value,
        data=X_test[0],
        feature_names=feature_names
    )
    shap.waterfall_plot(explanation, show=False)
    plt.title(f'SHAP Waterfall Plot - First Test Sample - {model_name}',
              fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    # Signed mean keeps the direction; mean absolute value drives the ordering.
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Mean_SHAP': np.mean(shap_values, axis=0),
        'Mean_Abs_SHAP': np.mean(np.abs(shap_values), axis=0)
    }).sort_values('Mean_Abs_SHAP', ascending=False)

    print(f"\nFeature Importance Ranking for {model_name} (sorted by absolute value, preserving direction):")
    print("=" * 75)
    print(f"{'Rank':<4} {'Feature':<25} {'Mean SHAP':<12} {'|Mean SHAP|':<12} {'Direction':<12}")
    print("-" * 75)

    for rank, (_, row) in enumerate(importance_df.iterrows(), 1):
        is_positive = row['Mean_SHAP'] > 0
        direction = "Positive" if is_positive else "Negative"
        direction_symbol = "↑" if is_positive else "↓"

        print(f"{rank:2d}.  {row['Feature']:<25} {row['Mean_SHAP']:>+10.6f} {row['Mean_Abs_SHAP']:>10.6f}   {direction_symbol} {direction}")

    # Directional importance chart
    plt.figure(figsize=(12, 10))

    sorted_features = importance_df['Feature'].values
    sorted_mean_shap = importance_df['Mean_SHAP'].values

    # Green for positive mean SHAP, red otherwise.
    colors = ['green' if value > 0 else 'red' for value in sorted_mean_shap]

    y_pos = np.arange(len(sorted_features))
    bars = plt.barh(y_pos, sorted_mean_shap, color=colors, alpha=0.7)

    plt.yticks(y_pos, sorted_features)
    plt.xlabel('Mean SHAP Value (with direction)')
    plt.title(f'Feature Importance with Direction - {model_name}\n(Sorted by absolute value, preserving positive/negative direction)',
              fontsize=14, fontweight='bold')

    plt.axvline(x=0, color='black', linestyle='-', alpha=0.8, linewidth=1)

    legend_elements = [Patch(facecolor='green', alpha=0.7, label='Positive Impact (promotes upward movement)'),
                       Patch(facecolor='red', alpha=0.7, label='Negative Impact (promotes downward movement)')]
    plt.legend(handles=legend_elements, loc='lower right')

    # Value labels sit just outside the end of each bar.
    for bar, value in zip(bars, sorted_mean_shap):
        label_y = bar.get_y() + bar.get_height() / 2
        if value > 0:
            plt.text(value + 0.0001, label_y, f'{value:.4f}', ha='left', va='center', fontsize=9)
        else:
            plt.text(value - 0.0001, label_y, f'{value:.4f}', ha='right', va='center', fontsize=9)

    plt.tight_layout()
    plt.show()

    return shap_values


def _train_horizon_model(train_data, val_data, test_data, target_col, horizon,
                         continuous_features, binary_features, param_grid, base_params,
                         cv, title_suffix, tuning_prefix=''):
    """Tune, fit, evaluate and explain one XGBoost classifier for ``target_col``.

    Runs a grid search over ``param_grid`` on the training split using ``cv``,
    refits with the best parameters and early stopping on the validation
    split, then evaluates and runs the SHAP analysis on the test split.

    Args:
        horizon: short label used in progress messages (e.g. ``'1D'``).
        tuning_prefix: text printed before the "Hyperparameter tuning" message.

    Returns:
        ``(results, model, best_params)``.
    """
    from sklearn.model_selection import GridSearchCV

    X_train, y_train, X_val, y_val, X_test, y_test, _ = _preprocess_data(
        train_data, val_data, test_data, target_col, continuous_features, binary_features)

    print(f"{tuning_prefix}Hyperparameter tuning for {horizon} XGBoost model...")

    # GridSearchCV already fans candidates out across all cores (n_jobs=-1);
    # keeping each candidate single-threaded avoids nested parallelism.
    search_params = {**base_params, 'n_jobs': 1}
    grid_search = GridSearchCV(
        xgb.XGBClassifier(**search_params),
        param_grid,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    print(f"Best parameters for {horizon} model: {best_params}")
    print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")

    # Final model: best grid parameters, early stopping on the validation split.
    final_params = {**base_params, **best_params}
    model = xgb.XGBClassifier(**final_params, early_stopping_rounds=50)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    print(f"{horizon} Model stopped at iteration: {model.best_iteration}")
    print(f"{horizon} Model best validation AUC: {model.best_score:.4f}")

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    display_name = f"{target_col} Prediction"
    results = _evaluate_model(y_test, y_pred, y_prob, display_name, title_suffix)

    feature_names = continuous_features + binary_features
    _generate_shap_analysis(model, X_test, feature_names, display_name)

    return results, model, best_params


def run_xgboost_model(df, continuous_features, binary_features, model_name, title_suffix):
    """
    Universal XGBoost model training and evaluation function with early stopping and SHAP analysis

    Trains one classifier for the '1D_Up' target and one for the '20D_Up'
    target. For each: a 70/15/15 chronological split per company, a grid
    search with 5-fold TimeSeriesSplit on the training split, a final fit with
    early stopping on the validation split, then metrics and SHAP plots on the
    test split.

    Args:
        df: DataFrame containing 'Company', 'Date', '1D_Up', '20D_Up' and
            every column named in the two feature lists.
        continuous_features: feature columns that are standardized.
        binary_features: feature columns that are used unscaled.
        model_name: label printed in the run banner.
        title_suffix: label used in the evaluation plot titles.

    Returns:
        ``(results_1d, results_20d, model_1d, model_20d)`` where each results
        item is the metrics dict for that target and each model is the fitted
        ``XGBClassifier``.

    Raises:
        ValueError: if ``df`` has no rows to split.
    """
    train_data, val_data, test_data = _split_by_company(df)

    print(f"Training set size: {len(train_data)}")
    print(f"Validation set size: {len(val_data)}")
    print(f"Test set size: {len(test_data)}")

    # Model training
    print("=" * 60)
    print(f"XGBoost {model_name}")
    print("=" * 60)

    # XGBoost hyperparameter grid for optimization
    param_grid = {
        'max_depth': [4, 5, 6],
        'learning_rate': [0.1, 0.15, 0.2],
        'n_estimators': [200, 300, 500],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [0.5, 1, 2]
    }

    # Base XGBoost parameters
    base_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'min_child_weight': 1,
        'random_state': 42,
        'n_jobs': -1
    }

    # Folds are index-based; the training frame is date-ordered by the split helper.
    tscv = TimeSeriesSplit(n_splits=5)

    # 1D Model
    results_1d, model_1d, best_params_1d = _train_horizon_model(
        train_data, val_data, test_data, '1D_Up', '1D',
        continuous_features, binary_features, param_grid, base_params,
        tscv, title_suffix)

    # 20D Model
    results_20d, model_20d, best_params_20d = _train_horizon_model(
        train_data, val_data, test_data, '20D_Up', '20D',
        continuous_features, binary_features, param_grid, base_params,
        tscv, title_suffix, tuning_prefix="\n")

    # Model performance summary (F1 here is the macro-averaged test F1)
    print(f"\nModel Performance Summary:")
    print(f"1D Model - AUC: {results_1d['auc']:.4f}, F1: {results_1d['f1_macro']:.4f}")
    print(f"20D Model - AUC: {results_20d['auc']:.4f}, F1: {results_20d['f1_macro']:.4f}")
    print(f"1D Best Params: {best_params_1d}")
    print(f"20D Best Params: {best_params_20d}")

    return results_1d, results_20d, model_1d, model_20d

# Load data
df = pd.read_csv('DATA.csv')

# Version 1: Baseline Model
# Technical-indicator features shared by every experiment below.
continuous_features_base = ['1D_PastChangePct', '5D_PastChangePct', '20D_PastChangePct', 'J',
                            'mfi', 'MACD', 'MACD_diff', 'BB_rel_pos', 'Vol_Change']
binary_features_base = ['MA5_GT_MA20']

print("\nVERSION 1: Baseline Model (Technical Indicators Only)")
print("-" * 60)

results_baseline = run_xgboost_model(df, continuous_features_base, binary_features_base,
                                     "Baseline", "Baseline")

# Version 2: Interest Rate Continuous
# Each variant adds exactly one extra continuous feature to the baseline set.
continuous_features_ir = continuous_features_base + ['Interest_Rate']

print("\nVERSION 2: Interest Rate Continuous")
print("-" * 60)

results_ir = run_xgboost_model(df, continuous_features_ir, binary_features_base,
                               "with Interest Rate", "with Interest Rate")

# Version 3: BTC
continuous_features_btc = continuous_features_base + ['Bitcoin_Close']

print("\nVERSION 3: BTC")
print("-" * 60)

results_btc = run_xgboost_model(df, continuous_features_btc, binary_features_base,
                                "with BTC", "with BTC")

# Version 4: Gold
continuous_features_gold = continuous_features_base + ['Gold_Close']

print("\nVERSION 4: Gold")
print("-" * 60)

results_gold = run_xgboost_model(df, continuous_features_gold, binary_features_base,
                                 "with Gold", "with Gold")
# Backward-compatible alias for the original, misspelled variable name.
results__gold = results_gold

