## base

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap
warnings.filterwarnings('ignore')

# Global matplotlib defaults used by every figure created below
plt.rcParams.update({'font.size': 12, 'figure.figsize': (12, 8)})

# Read the full dataset from disk
df = pd.read_csv('DATA.csv')

# Model inputs - Baseline model (only technical indicators).
# Continuous columns are standardized later; binary columns are passed through unscaled.
continuous_features = [
    '1D_PastChangePct',
    '5D_PastChangePct',
    '20D_PastChangePct',
    'J',
    'mfi',
    'MACD',
    'MACD_diff',
    'BB_rel_pos',
    'Vol_Change',
]
binary_features = ['MA5_GT_MA20']
features = [*continuous_features, *binary_features]

# Split data by company in chronological order
train_data_list = []
val_data_list = []
test_data_list = []

for company in df['Company'].unique():
    # Sort this company's rows by Date so the positional slices below follow time.
    # NOTE(review): 'Date' is sorted exactly as loaded from the CSV; confirm its
    # format sorts chronologically (e.g. ISO 8601) or parse it to datetime first.
    company_data = df[df['Company'] == company].sort_values('Date')

    # First 70% of rows -> train, next 15% -> validation, remainder -> test.
    n_company = len(company_data)
    train_size = int(0.7 * n_company)
    val_size = int(0.15 * n_company)

    train_data_list.append(company_data.iloc[:train_size])
    val_data_list.append(company_data.iloc[train_size:train_size+val_size])
    test_data_list.append(company_data.iloc[train_size+val_size:])

# Concatenate all companies' data
train_data = pd.concat(train_data_list, ignore_index=True)
val_data = pd.concat(val_data_list, ignore_index=True)
test_data = pd.concat(test_data_list, ignore_index=True)

# pd.concat stacks the training rows company by company. TimeSeriesSplit (used
# for hyperparameter tuning below) builds its folds from row position, so the
# pooled training rows are re-ordered by Date; otherwise each fold would split
# across companies instead of across time. mergesort is stable, so rows sharing
# a Date keep their company order.
train_data = train_data.sort_values('Date', kind='mergesort').reset_index(drop=True)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")
print(f"Training set company distribution:\n{train_data['Company'].value_counts().sort_index()}")
print(f"Test set company distribution:\n{test_data['Company'].value_counts().sort_index()}")

# Modified preprocessing function
def preprocess_data(train_df, val_df, test_df, target_col, continuous_features, binary_features):
    """Build model-ready feature matrices and targets for the three splits.

    Rows with a missing value in any feature column or in ``target_col`` are
    dropped from each split. Continuous features are standardized with a
    ``StandardScaler`` fitted on the training split only; binary features are
    appended, unscaled, after the continuous columns.

    Args:
        train_df, val_df, test_df: DataFrames holding the feature and target columns.
        target_col: Name of the target column.
        continuous_features: List of column names to standardize.
        binary_features: List of column names to pass through unchanged.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler)``.
        Each ``X_*`` is a NumPy array with the continuous columns first and the
        binary columns last; each ``y_*`` is the target column of the cleaned
        split; ``scaler`` is the fitted ``StandardScaler``.
    """
    required_columns = continuous_features + binary_features + [target_col]
    cleaned_splits = [
        split.dropna(subset=required_columns)
        for split in (train_df, val_df, test_df)
    ]

    # Fit on the training rows only, then reuse the same statistics for every split.
    scaler = StandardScaler()
    scaler.fit(cleaned_splits[0][continuous_features])

    outputs = []
    for split in cleaned_splits:
        scaled_part = scaler.transform(split[continuous_features])
        binary_part = split[binary_features].values
        outputs.append(np.hstack([scaled_part, binary_part]))
        outputs.append(split[target_col])

    return (*outputs, scaler)

# Evaluation function
def evaluate_model(y_true, y_pred, y_prob, model_name):
    """Compute classification metrics and display them as two figures.

    Shows a confusion-matrix heatmap and a table of summary metrics. Both
    figures are titled with ``model_name`` so that output from different
    models can be told apart.

    Args:
        y_true: True labels. The hard-coded tick labels and the ``[0]`` / ``[1]``
            indexing of the per-class scores assume exactly two classes
            labelled 0 and 1.
        y_pred: Predicted labels.
        y_prob: Scores passed to ``roc_auc_score`` together with ``y_true``.
        model_name: Text appended to the figure titles.

    Returns:
        dict with keys ``confusion_matrix``, ``f1_macro``, ``accuracy``,
        ``auc`` and ``class_{0,1}_{precision,recall,f1}``.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Overall metrics
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    # Class-specific metrics (average=None returns one score per class)
    precision_per_class = precision_score(y_true, y_pred, average=None)
    recall_per_class = recall_score(y_true, y_pred, average=None)
    f1_per_class = f1_score(y_true, y_pred, average=None)

    # Confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Performance metrics table
    plt.figure(figsize=(10, 8))
    plt.axis('tight')
    plt.axis('off')
    plt.title(f'Performance Metrics - {model_name}')

    table_data = [
        ['Metric', 'Value'],
        ['Macro F1 Score', f'{f1_macro:.4f}'],
        ['Overall Accuracy', f'{accuracy:.4f}'],
        ['AUC', f'{auc:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 1 Precision', f'{precision_per_class[1]:.4f}'],
        ['Class 1 Recall', f'{recall_per_class[1]:.4f}'],
        ['Class 1 F1', f'{f1_per_class[1]:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 0 Precision', f'{precision_per_class[0]:.4f}'],
        ['Class 0 Recall', f'{recall_per_class[0]:.4f}'],
        ['Class 0 F1', f'{f1_per_class[0]:.4f}']
    ]

    table = plt.table(cellText=table_data[1:], colLabels=table_data[0],
                      cellLoc='center', loc='center',
                      colWidths=[0.4, 0.3])
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.5, 2)

    # Set table style. Table row 0 is the header (colLabels), so table row i
    # lines up with table_data[i].
    for i, row in enumerate(table_data):
        for j in range(len(table_data[0])):
            cell = table[(i, j)]
            if i == 0:  # header
                cell.set_facecolor('#4CAF50')
                cell.set_text_props(weight='bold', color='white')
            elif len(row) > 0 and row[0] == '':  # empty separator rows
                cell.set_facecolor('#ffffff')
                cell.set_text_props(color='white')
            else:
                cell.set_facecolor('#f0f0f0')

    plt.show()

    return {'confusion_matrix': cm, 'f1_macro': f1_macro, 'accuracy': accuracy, 'auc': auc,
            'class_0_precision': precision_per_class[0], 'class_0_recall': recall_per_class[0], 'class_0_f1': f1_per_class[0],
            'class_1_precision': precision_per_class[1], 'class_1_recall': recall_per_class[1], 'class_1_f1': f1_per_class[1]}

# 1D_Up prediction
print("=" * 60)
print("1D_Up Prediction Model - Random Forest Baseline")
print("=" * 60)

X_train_1d, y_train_1d, X_val_1d, y_val_1d, X_test_1d, y_test_1d, scaler_1d = preprocess_data(
    train_data, val_data, test_data, '1D_Up', continuous_features, binary_features)

# TimeSeriesSplit builds folds from row position: each fold validates on rows
# that come after its training rows.
# NOTE(review): X_train_1d keeps the row order of train_data; confirm that
# order is chronological, otherwise the folds are not time-based.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [6, 8, 10, None],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

print("Starting hyperparameter tuning for 1D model...")
grid_search_1d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_1d.fit(X_train_1d, y_train_1d)
best_params_1d = grid_search_1d.best_params_
print(f"Best parameters for 1D model: {best_params_1d}")
print(f"Best cross-validation F1 score: {grid_search_1d.best_score_:.4f}")

# Final model with best parameters. GridSearchCV (refit=True by default) has
# already refit the best configuration on the whole training set with the same
# random_state, so that estimator is reused instead of training it a second time.
final_model_1d = grid_search_1d.best_estimator_

# Evaluate on test set
y_pred_1d = final_model_1d.predict(X_test_1d)
y_prob_1d = final_model_1d.predict_proba(X_test_1d)[:, 1]  # column 1 = second class in classes_

print("\n1D Model Test Set Results:")
results_1d = evaluate_model(y_test_1d, y_pred_1d, y_prob_1d, "1D_Up Prediction")

# 20D_Up prediction
print("\n" + "=" * 60)
print("20D_Up Prediction Model - Random Forest Baseline")
print("=" * 60)

X_train_20d, y_train_20d, X_val_20d, y_val_20d, X_test_20d, y_test_20d, scaler_20d = preprocess_data(
    train_data, val_data, test_data, '20D_Up', continuous_features, binary_features)

# Same search space (param_grid) and CV splitter (tscv) as the 1D model.
print("Starting hyperparameter tuning for 20D model...")
grid_search_20d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_20d.fit(X_train_20d, y_train_20d)
best_params_20d = grid_search_20d.best_params_
print(f"Best parameters for 20D model: {best_params_20d}")
print(f"Best cross-validation F1 score: {grid_search_20d.best_score_:.4f}")

# Final model with best parameters. GridSearchCV (refit=True by default) has
# already refit the best configuration on the whole training set with the same
# random_state, so that estimator is reused instead of training it a second time.
final_model_20d = grid_search_20d.best_estimator_

# Evaluate on test set
y_pred_20d = final_model_20d.predict(X_test_20d)
y_prob_20d = final_model_20d.predict_proba(X_test_20d)[:, 1]  # column 1 = second class in classes_

print("\n20D Model Test Set Results:")
results_20d = evaluate_model(y_test_20d, y_pred_20d, y_prob_20d, "20D_Up Prediction")

# SHAP Feature Importance Analysis
# Builds one SHAP explainer per fitted model and computes SHAP values for the
# corresponding test matrix.
print("\n" + "=" * 60)
print("SHAP Feature Importance Analysis")
print("=" * 60)

# Get feature names
# Order is continuous then binary, matching the column order produced by
# preprocess_data (scaled continuous columns followed by binary columns).
feature_names = continuous_features + binary_features
feature_names_array = np.array(feature_names)

# Create SHAP explainers for both models
# The training matrix is passed as the second argument (SHAP's masker /
# background data).
print("Creating SHAP explainers...")
explainer_1d = shap.Explainer(final_model_1d, X_train_1d)
explainer_20d = shap.Explainer(final_model_20d, X_train_20d)

# Calculate SHAP values for test sets
# NOTE(review): the other cells in this notebook call the explainers with
# check_additivity=False; confirm whether this cell should match.
print("Calculating SHAP values for 1D model...")
shap_values_1d = explainer_1d(X_test_1d)

print("Calculating SHAP values for 20D model...")
shap_values_20d = explainer_20d(X_test_20d)

# 1D Model SHAP Analysis
print("\n" + "=" * 40)
print("1D_Up Model SHAP Analysis")
print("=" * 40)

# A 3-D values array carries a trailing per-class axis; keep only index 1 (the
# positive class). A 2-D array is already one value per sample and feature and
# is used as-is. The slice is computed once and reused by every step below.
has_class_axis_1d = shap_values_1d.values.ndim == 3
shap_vals_1d = shap_values_1d.values[:, :, 1] if has_class_axis_1d else shap_values_1d.values

# Summary plot for 1D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_1d, X_test_1d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 1D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (1D model): the first test row.
# The Explanation is always rebuilt so the plot carries the real feature names,
# whichever shape the SHAP values have.
plt.figure(figsize=(10, 6))
base_values_1d = shap_values_1d.base_values
if hasattr(base_values_1d, 'shape') and len(base_values_1d.shape) > 1:
    # One base value per class: take the positive class for the first sample.
    sample_base_value_1d = base_values_1d[0, 1]
else:
    sample_base_value_1d = base_values_1d[0]

sample_explanation = shap.Explanation(
    values=shap_vals_1d[0],
    base_values=sample_base_value_1d,
    data=shap_values_1d.data[0],
    feature_names=feature_names_array
)

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (1D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 1D model.
# Mean_SHAP is the signed average (direction); Abs_Mean_SHAP is the average of
# the absolute values (magnitude) and is the sort key.
mean_shap_1d = shap_vals_1d.mean(axis=0)
abs_mean_shap_1d = np.abs(shap_vals_1d).mean(axis=0)

importance_df_1d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_1d,
    'Abs_Mean_SHAP': abs_mean_shap_1d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 1D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# The fourth column is the mean of |SHAP|, not |mean SHAP|, so it is labelled accordingly.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for i, row in enumerate(importance_df_1d.itertuples(index=False), 1):
    direction = "↑ Positive" if row.Mean_SHAP > 0 else "↓ Negative"
    print(f"{i:>2}. {row.Feature:<20} {row.Mean_SHAP:>+9.6f} {row.Abs_Mean_SHAP:>11.6f} {direction}")

# 20D Model SHAP Analysis
print("\n" + "=" * 40)
print("20D_Up Model SHAP Analysis")
print("=" * 40)

# A 3-D values array carries a trailing per-class axis; keep only index 1 (the
# positive class). A 2-D array is already one value per sample and feature and
# is used as-is. The slice is computed once and reused by every step below.
has_class_axis_20d = shap_values_20d.values.ndim == 3
shap_vals_20d = shap_values_20d.values[:, :, 1] if has_class_axis_20d else shap_values_20d.values

# Summary plot for 20D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_20d, X_test_20d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 20D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (20D model): the first test row.
# The Explanation is always rebuilt so the plot carries the real feature names,
# whichever shape the SHAP values have.
plt.figure(figsize=(10, 6))
base_values_20d = shap_values_20d.base_values
if hasattr(base_values_20d, 'shape') and len(base_values_20d.shape) > 1:
    # One base value per class: take the positive class for the first sample.
    sample_base_value_20d = base_values_20d[0, 1]
else:
    sample_base_value_20d = base_values_20d[0]

sample_explanation = shap.Explanation(
    values=shap_vals_20d[0],
    base_values=sample_base_value_20d,
    data=shap_values_20d.data[0],
    feature_names=feature_names_array
)

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (20D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 20D model.
# Mean_SHAP is the signed average (direction); Abs_Mean_SHAP is the average of
# the absolute values (magnitude) and is the sort key.
mean_shap_20d = shap_vals_20d.mean(axis=0)
abs_mean_shap_20d = np.abs(shap_vals_20d).mean(axis=0)

importance_df_20d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_20d,
    'Abs_Mean_SHAP': abs_mean_shap_20d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 20D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# The fourth column is the mean of |SHAP|, not |mean SHAP|, so it is labelled accordingly.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for i, row in enumerate(importance_df_20d.itertuples(index=False), 1):
    direction = "↑ Positive" if row.Mean_SHAP > 0 else "↓ Negative"
    print(f"{i:>2}. {row.Feature:<20} {row.Mean_SHAP:>+9.6f} {row.Abs_Mean_SHAP:>11.6f} {direction}")

## interest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap
warnings.filterwarnings('ignore')

# Global matplotlib defaults used by every figure created below
plt.rcParams.update({'font.size': 12, 'figure.figsize': (12, 8)})

# Read the full dataset from disk
df = pd.read_csv('DATA.csv')

# Model inputs: the technical indicators plus Interest_Rate.
# Continuous columns are standardized later; binary columns are passed through unscaled.
continuous_features = [
    '1D_PastChangePct',
    '5D_PastChangePct',
    '20D_PastChangePct',
    'J',
    'mfi',
    'MACD',
    'MACD_diff',
    'BB_rel_pos',
    'Vol_Change',
    "Interest_Rate",
]
binary_features = ['MA5_GT_MA20']
features = [*continuous_features, *binary_features]

# Split data by company in chronological order
train_data_list = []
val_data_list = []
test_data_list = []

for company in df['Company'].unique():
    # Sort this company's rows by Date so the positional slices below follow time.
    # NOTE(review): 'Date' is sorted exactly as loaded from the CSV; confirm its
    # format sorts chronologically (e.g. ISO 8601) or parse it to datetime first.
    company_data = df[df['Company'] == company].sort_values('Date')

    # First 70% of rows -> train, next 15% -> validation, remainder -> test.
    n_company = len(company_data)
    train_size = int(0.7 * n_company)
    val_size = int(0.15 * n_company)

    train_data_list.append(company_data.iloc[:train_size])
    val_data_list.append(company_data.iloc[train_size:train_size+val_size])
    test_data_list.append(company_data.iloc[train_size+val_size:])

# Concatenate all companies' data
train_data = pd.concat(train_data_list, ignore_index=True)
val_data = pd.concat(val_data_list, ignore_index=True)
test_data = pd.concat(test_data_list, ignore_index=True)

# pd.concat stacks the training rows company by company. TimeSeriesSplit (used
# for hyperparameter tuning below) builds its folds from row position, so the
# pooled training rows are re-ordered by Date; otherwise each fold would split
# across companies instead of across time. mergesort is stable, so rows sharing
# a Date keep their company order.
train_data = train_data.sort_values('Date', kind='mergesort').reset_index(drop=True)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")
print(f"Training set company distribution:\n{train_data['Company'].value_counts().sort_index()}")
print(f"Test set company distribution:\n{test_data['Company'].value_counts().sort_index()}")

# Modified preprocessing function
def preprocess_data(train_df, val_df, test_df, target_col, continuous_features, binary_features):
    """Build model-ready feature matrices and targets for the three splits.

    Rows with a missing value in any feature column or in ``target_col`` are
    dropped from each split. Continuous features are standardized with a
    ``StandardScaler`` fitted on the training split only; binary features are
    appended, unscaled, after the continuous columns.

    Args:
        train_df, val_df, test_df: DataFrames holding the feature and target columns.
        target_col: Name of the target column.
        continuous_features: List of column names to standardize.
        binary_features: List of column names to pass through unchanged.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler)``.
        Each ``X_*`` is a NumPy array with the continuous columns first and the
        binary columns last; each ``y_*`` is the target column of the cleaned
        split; ``scaler`` is the fitted ``StandardScaler``.
    """
    required_columns = continuous_features + binary_features + [target_col]
    cleaned_splits = [
        split.dropna(subset=required_columns)
        for split in (train_df, val_df, test_df)
    ]

    # Fit on the training rows only, then reuse the same statistics for every split.
    scaler = StandardScaler()
    scaler.fit(cleaned_splits[0][continuous_features])

    outputs = []
    for split in cleaned_splits:
        scaled_part = scaler.transform(split[continuous_features])
        binary_part = split[binary_features].values
        outputs.append(np.hstack([scaled_part, binary_part]))
        outputs.append(split[target_col])

    return (*outputs, scaler)

# Evaluation function
def evaluate_model(y_true, y_pred, y_prob, model_name):
    """Compute classification metrics and display them as two figures.

    Shows a confusion-matrix heatmap and a table of summary metrics. Both
    figures are titled with ``model_name`` so that output from different
    models can be told apart.

    Args:
        y_true: True labels. The hard-coded tick labels and the ``[0]`` / ``[1]``
            indexing of the per-class scores assume exactly two classes
            labelled 0 and 1.
        y_pred: Predicted labels.
        y_prob: Scores passed to ``roc_auc_score`` together with ``y_true``.
        model_name: Text appended to the figure titles.

    Returns:
        dict with keys ``confusion_matrix``, ``f1_macro``, ``accuracy``,
        ``auc`` and ``class_{0,1}_{precision,recall,f1}``.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Overall metrics
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    # Class-specific metrics (average=None returns one score per class)
    precision_per_class = precision_score(y_true, y_pred, average=None)
    recall_per_class = recall_score(y_true, y_pred, average=None)
    f1_per_class = f1_score(y_true, y_pred, average=None)

    # Confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Performance metrics table
    plt.figure(figsize=(10, 8))
    plt.axis('tight')
    plt.axis('off')
    plt.title(f'Performance Metrics - {model_name}')

    table_data = [
        ['Metric', 'Value'],
        ['Macro F1 Score', f'{f1_macro:.4f}'],
        ['Overall Accuracy', f'{accuracy:.4f}'],
        ['AUC', f'{auc:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 1 Precision', f'{precision_per_class[1]:.4f}'],
        ['Class 1 Recall', f'{recall_per_class[1]:.4f}'],
        ['Class 1 F1', f'{f1_per_class[1]:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 0 Precision', f'{precision_per_class[0]:.4f}'],
        ['Class 0 Recall', f'{recall_per_class[0]:.4f}'],
        ['Class 0 F1', f'{f1_per_class[0]:.4f}']
    ]

    table = plt.table(cellText=table_data[1:], colLabels=table_data[0],
                      cellLoc='center', loc='center',
                      colWidths=[0.4, 0.3])
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.5, 2)

    # Set table style. Table row 0 is the header (colLabels), so table row i
    # lines up with table_data[i].
    for i, row in enumerate(table_data):
        for j in range(len(table_data[0])):
            cell = table[(i, j)]
            if i == 0:  # header
                cell.set_facecolor('#4CAF50')
                cell.set_text_props(weight='bold', color='white')
            elif len(row) > 0 and row[0] == '':  # empty separator rows
                cell.set_facecolor('#ffffff')
                cell.set_text_props(color='white')
            else:
                cell.set_facecolor('#f0f0f0')

    plt.show()

    return {'confusion_matrix': cm, 'f1_macro': f1_macro, 'accuracy': accuracy, 'auc': auc,
            'class_0_precision': precision_per_class[0], 'class_0_recall': recall_per_class[0], 'class_0_f1': f1_per_class[0],
            'class_1_precision': precision_per_class[1], 'class_1_recall': recall_per_class[1], 'class_1_f1': f1_per_class[1]}

# 1D_Up prediction
# This cell adds Interest_Rate to the feature set, so the banner names that
# variant instead of repeating the copy-pasted "Baseline" label.
print("=" * 60)
print("1D_Up Prediction Model - Random Forest + Interest_Rate")
print("=" * 60)

X_train_1d, y_train_1d, X_val_1d, y_val_1d, X_test_1d, y_test_1d, scaler_1d = preprocess_data(
    train_data, val_data, test_data, '1D_Up', continuous_features, binary_features)

# TimeSeriesSplit builds folds from row position: each fold validates on rows
# that come after its training rows.
# NOTE(review): X_train_1d keeps the row order of train_data; confirm that
# order is chronological, otherwise the folds are not time-based.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [6, 8, 10, None],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

print("Starting hyperparameter tuning for 1D model...")
grid_search_1d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_1d.fit(X_train_1d, y_train_1d)
best_params_1d = grid_search_1d.best_params_
print(f"Best parameters for 1D model: {best_params_1d}")
print(f"Best cross-validation F1 score: {grid_search_1d.best_score_:.4f}")

# Final model with best parameters. GridSearchCV (refit=True by default) has
# already refit the best configuration on the whole training set with the same
# random_state, so that estimator is reused instead of training it a second time.
final_model_1d = grid_search_1d.best_estimator_

# Evaluate on test set
y_pred_1d = final_model_1d.predict(X_test_1d)
y_prob_1d = final_model_1d.predict_proba(X_test_1d)[:, 1]  # column 1 = second class in classes_

print("\n1D Model Test Set Results:")
results_1d = evaluate_model(y_test_1d, y_pred_1d, y_prob_1d, "1D_Up Prediction")

# 20D_Up prediction
# This cell adds Interest_Rate to the feature set, so the banner names that
# variant instead of repeating the copy-pasted "Baseline" label.
print("\n" + "=" * 60)
print("20D_Up Prediction Model - Random Forest + Interest_Rate")
print("=" * 60)

X_train_20d, y_train_20d, X_val_20d, y_val_20d, X_test_20d, y_test_20d, scaler_20d = preprocess_data(
    train_data, val_data, test_data, '20D_Up', continuous_features, binary_features)

# Same search space (param_grid) and CV splitter (tscv) as the 1D model.
print("Starting hyperparameter tuning for 20D model...")
grid_search_20d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_20d.fit(X_train_20d, y_train_20d)
best_params_20d = grid_search_20d.best_params_
print(f"Best parameters for 20D model: {best_params_20d}")
print(f"Best cross-validation F1 score: {grid_search_20d.best_score_:.4f}")

# Final model with best parameters. GridSearchCV (refit=True by default) has
# already refit the best configuration on the whole training set with the same
# random_state, so that estimator is reused instead of training it a second time.
final_model_20d = grid_search_20d.best_estimator_

# Evaluate on test set
y_pred_20d = final_model_20d.predict(X_test_20d)
y_prob_20d = final_model_20d.predict_proba(X_test_20d)[:, 1]  # column 1 = second class in classes_

print("\n20D Model Test Set Results:")
results_20d = evaluate_model(y_test_20d, y_pred_20d, y_prob_20d, "20D_Up Prediction")

# SHAP Feature Importance Analysis
# Builds one SHAP explainer per fitted model and computes SHAP values for the
# corresponding test matrix.
print("\n" + "=" * 60)
print("SHAP Feature Importance Analysis")
print("=" * 60)

# Get feature names
# Order is continuous then binary, matching the column order produced by
# preprocess_data (scaled continuous columns followed by binary columns).
feature_names = continuous_features + binary_features
feature_names_array = np.array(feature_names)

# Create SHAP explainers for both models
# The training matrix is passed as the second argument (SHAP's masker /
# background data).
print("Creating SHAP explainers...")
explainer_1d = shap.Explainer(final_model_1d, X_train_1d)
explainer_20d = shap.Explainer(final_model_20d, X_train_20d)

# Calculate SHAP values for test sets
# check_additivity=False is passed to both explainer calls below.
# NOTE(review): the first (base) cell of this notebook does not pass this flag;
# confirm the difference is intentional.
print("Calculating SHAP values for 1D model...")
shap_values_1d = explainer_1d(X_test_1d,check_additivity=False)

print("Calculating SHAP values for 20D model...")
shap_values_20d = explainer_20d(X_test_20d,check_additivity=False)

# 1D Model SHAP Analysis
print("\n" + "=" * 40)
print("1D_Up Model SHAP Analysis")
print("=" * 40)

# A 3-D values array carries a trailing per-class axis; keep only index 1 (the
# positive class). A 2-D array is already one value per sample and feature and
# is used as-is. The slice is computed once and reused by every step below.
has_class_axis_1d = shap_values_1d.values.ndim == 3
shap_vals_1d = shap_values_1d.values[:, :, 1] if has_class_axis_1d else shap_values_1d.values

# Summary plot for 1D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_1d, X_test_1d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 1D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (1D model): the first test row.
# The Explanation is always rebuilt so the plot carries the real feature names,
# whichever shape the SHAP values have.
plt.figure(figsize=(10, 6))
base_values_1d = shap_values_1d.base_values
if hasattr(base_values_1d, 'shape') and len(base_values_1d.shape) > 1:
    # One base value per class: take the positive class for the first sample.
    sample_base_value_1d = base_values_1d[0, 1]
else:
    sample_base_value_1d = base_values_1d[0]

sample_explanation = shap.Explanation(
    values=shap_vals_1d[0],
    base_values=sample_base_value_1d,
    data=shap_values_1d.data[0],
    feature_names=feature_names_array
)

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (1D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 1D model.
# Mean_SHAP is the signed average (direction); Abs_Mean_SHAP is the average of
# the absolute values (magnitude) and is the sort key.
mean_shap_1d = shap_vals_1d.mean(axis=0)
abs_mean_shap_1d = np.abs(shap_vals_1d).mean(axis=0)

importance_df_1d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_1d,
    'Abs_Mean_SHAP': abs_mean_shap_1d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 1D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# The fourth column is the mean of |SHAP|, not |mean SHAP|, so it is labelled accordingly.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for i, row in enumerate(importance_df_1d.itertuples(index=False), 1):
    direction = "↑ Positive" if row.Mean_SHAP > 0 else "↓ Negative"
    print(f"{i:>2}. {row.Feature:<20} {row.Mean_SHAP:>+9.6f} {row.Abs_Mean_SHAP:>11.6f} {direction}")

# 20D Model SHAP Analysis
print("\n" + "=" * 40)
print("20D_Up Model SHAP Analysis")
print("=" * 40)

# A 3-D values array carries a trailing per-class axis; keep only index 1 (the
# positive class). A 2-D array is already one value per sample and feature and
# is used as-is. The slice is computed once and reused by every step below.
has_class_axis_20d = shap_values_20d.values.ndim == 3
shap_vals_20d = shap_values_20d.values[:, :, 1] if has_class_axis_20d else shap_values_20d.values

# Summary plot for 20D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_20d, X_test_20d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 20D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (20D model): the first test row.
# The Explanation is always rebuilt so the plot carries the real feature names,
# whichever shape the SHAP values have.
plt.figure(figsize=(10, 6))
base_values_20d = shap_values_20d.base_values
if hasattr(base_values_20d, 'shape') and len(base_values_20d.shape) > 1:
    # One base value per class: take the positive class for the first sample.
    sample_base_value_20d = base_values_20d[0, 1]
else:
    sample_base_value_20d = base_values_20d[0]

sample_explanation = shap.Explanation(
    values=shap_vals_20d[0],
    base_values=sample_base_value_20d,
    data=shap_values_20d.data[0],
    feature_names=feature_names_array
)

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (20D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 20D model.
# Mean_SHAP is the signed average (direction); Abs_Mean_SHAP is the average of
# the absolute values (magnitude) and is the sort key.
mean_shap_20d = shap_vals_20d.mean(axis=0)
abs_mean_shap_20d = np.abs(shap_vals_20d).mean(axis=0)

importance_df_20d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_20d,
    'Abs_Mean_SHAP': abs_mean_shap_20d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 20D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# The fourth column is the mean of |SHAP|, not |mean SHAP|, so it is labelled accordingly.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for i, row in enumerate(importance_df_20d.itertuples(index=False), 1):
    direction = "↑ Positive" if row.Mean_SHAP > 0 else "↓ Negative"
    print(f"{i:>2}. {row.Feature:<20} {row.Mean_SHAP:>+9.6f} {row.Abs_Mean_SHAP:>11.6f} {direction}")

## BTC

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap
warnings.filterwarnings('ignore')

# Global matplotlib defaults used by every figure created below
plt.rcParams.update({'font.size': 12, 'figure.figsize': (12, 8)})

# Read the full dataset from disk
df = pd.read_csv('DATA.csv')

# Model inputs: the technical indicators plus Bitcoin_Close.
# Continuous columns are standardized later; binary columns are passed through unscaled.
continuous_features = [
    '1D_PastChangePct',
    '5D_PastChangePct',
    '20D_PastChangePct',
    'J',
    'mfi',
    'MACD',
    'MACD_diff',
    'BB_rel_pos',
    'Vol_Change',
    "Bitcoin_Close",
]
binary_features = ['MA5_GT_MA20']
features = [*continuous_features, *binary_features]

# Split data by company in chronological order
train_data_list = []
val_data_list = []
test_data_list = []

for company in df['Company'].unique():
    # Sort this company's rows by Date so the positional slices below follow time.
    # NOTE(review): 'Date' is sorted exactly as loaded from the CSV; confirm its
    # format sorts chronologically (e.g. ISO 8601) or parse it to datetime first.
    company_data = df[df['Company'] == company].sort_values('Date')

    # First 70% of rows -> train, next 15% -> validation, remainder -> test.
    n_company = len(company_data)
    train_size = int(0.7 * n_company)
    val_size = int(0.15 * n_company)

    train_data_list.append(company_data.iloc[:train_size])
    val_data_list.append(company_data.iloc[train_size:train_size+val_size])
    test_data_list.append(company_data.iloc[train_size+val_size:])

# Concatenate all companies' data
train_data = pd.concat(train_data_list, ignore_index=True)
val_data = pd.concat(val_data_list, ignore_index=True)
test_data = pd.concat(test_data_list, ignore_index=True)

# pd.concat stacks the training rows company by company. TimeSeriesSplit
# (imported in this cell) builds its folds from row position, so the pooled
# training rows are re-ordered by Date; otherwise each fold would split across
# companies instead of across time. mergesort is stable, so rows sharing a
# Date keep their company order.
train_data = train_data.sort_values('Date', kind='mergesort').reset_index(drop=True)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")
print(f"Training set company distribution:\n{train_data['Company'].value_counts().sort_index()}")
print(f"Test set company distribution:\n{test_data['Company'].value_counts().sort_index()}")

# Modified preprocessing function
def preprocess_data(train_df, val_df, test_df, target_col, continuous_features, binary_features):
    """Build model-ready feature matrices and targets for the three splits.

    Rows with a missing value in any feature column or in ``target_col`` are
    dropped from each split. Continuous features are standardized with a
    ``StandardScaler`` fitted on the training split only; binary features are
    appended, unscaled, after the continuous columns.

    Args:
        train_df, val_df, test_df: DataFrames holding the feature and target columns.
        target_col: Name of the target column.
        continuous_features: List of column names to standardize.
        binary_features: List of column names to pass through unchanged.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler)``.
        Each ``X_*`` is a NumPy array with the continuous columns first and the
        binary columns last; each ``y_*`` is the target column of the cleaned
        split; ``scaler`` is the fitted ``StandardScaler``.
    """
    required_columns = continuous_features + binary_features + [target_col]
    cleaned_splits = [
        split.dropna(subset=required_columns)
        for split in (train_df, val_df, test_df)
    ]

    # Fit on the training rows only, then reuse the same statistics for every split.
    scaler = StandardScaler()
    scaler.fit(cleaned_splits[0][continuous_features])

    outputs = []
    for split in cleaned_splits:
        scaled_part = scaler.transform(split[continuous_features])
        binary_part = split[binary_features].values
        outputs.append(np.hstack([scaled_part, binary_part]))
        outputs.append(split[target_col])

    return (*outputs, scaler)

# Evaluation function
def evaluate_model(y_true, y_pred, y_prob, model_name):
    """Score predictions, plot the results, and return the metrics.

    Two matplotlib figures are shown: a confusion-matrix heatmap and a table
    of the computed metrics. Both are titled with ``model_name`` so that
    figures from different models can be told apart.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Scores passed to ``roc_auc_score``.
        model_name: Label used in the figure titles.

    Returns:
        dict with keys ``confusion_matrix``, ``f1_macro``, ``accuracy``,
        ``auc`` and ``class_{0,1}_{precision,recall,f1}``.

    Note:
        The per-class arrays are read at positions 0 and 1 and the heatmap
        tick labels are fixed to "0"/"1", so this assumes exactly two classes
        that sort as 0 then 1 -- TODO confirm against the target columns.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Overall metrics
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    # Class-specific metrics (average=None yields one value per class)
    precision_per_class = precision_score(y_true, y_pred, average=None)
    recall_per_class = recall_score(y_true, y_pred, average=None)
    f1_per_class = f1_score(y_true, y_pred, average=None)

    # Confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Performance metrics table
    plt.figure(figsize=(10, 8))
    plt.axis('tight')
    plt.axis('off')
    plt.title(f'Performance Metrics - {model_name}')

    table_data = [
        ['Metric', 'Value'],
        ['Macro F1 Score', f'{f1_macro:.4f}'],
        ['Overall Accuracy', f'{accuracy:.4f}'],
        ['AUC', f'{auc:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 1 Precision', f'{precision_per_class[1]:.4f}'],
        ['Class 1 Recall', f'{recall_per_class[1]:.4f}'],
        ['Class 1 F1', f'{f1_per_class[1]:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 0 Precision', f'{precision_per_class[0]:.4f}'],
        ['Class 0 Recall', f'{recall_per_class[0]:.4f}'],
        ['Class 0 F1', f'{f1_per_class[0]:.4f}']
    ]

    table = plt.table(cellText=table_data[1:], colLabels=table_data[0],
                     cellLoc='center', loc='center',
                     colWidths=[0.4, 0.3])
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.5, 2)

    # Set table style. Table row 0 is the header (colLabels), so table row i
    # lines up with table_data[i].
    for i, row_values in enumerate(table_data):
        for j in range(len(table_data[0])):
            cell = table[(i, j)]
            if i == 0:  # header
                cell.set_facecolor('#4CAF50')
                cell.set_text_props(weight='bold', color='white')
            elif row_values[0] == '':  # separator rows: blend into the background
                cell.set_facecolor('#ffffff')
                cell.set_text_props(color='white')
            else:
                cell.set_facecolor('#f0f0f0')

    plt.show()

    return {'confusion_matrix': cm, 'f1_macro': f1_macro, 'accuracy': accuracy, 'auc': auc,
            'class_0_precision': precision_per_class[0], 'class_0_recall': recall_per_class[0], 'class_0_f1': f1_per_class[0],
            'class_1_precision': precision_per_class[1], 'class_1_recall': recall_per_class[1], 'class_1_f1': f1_per_class[1]}

# 1D_Up prediction
print("=" * 60)
print("1D_Up Prediction Model - Random Forest Baseline")
print("=" * 60)

# train_data was built by stacking one company after another, so its rows are
# ordered "company, then date". TimeSeriesSplit takes folds by row position and
# expects rows in time order; without re-sorting, each fold would validate on a
# different company rather than on later dates. Sort the pooled training rows
# by Date (stable sort keeps the per-company order within a date).
# NOTE(review): relies on 'Date' sorting chronologically, the same assumption
# the per-company split already makes.
train_data_chrono_1d = train_data.sort_values('Date', kind='mergesort')

# The validation split is returned by preprocess_data but is not used below.
X_train_1d, y_train_1d, X_val_1d, y_val_1d, X_test_1d, y_test_1d, scaler_1d = preprocess_data(
    train_data_chrono_1d, val_data, test_data, '1D_Up', continuous_features, binary_features)


tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [6, 8, 10, None],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

print("Starting hyperparameter tuning for 1D model...")
grid_search_1d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_1d.fit(X_train_1d, y_train_1d)
best_params_1d = grid_search_1d.best_params_
print(f"Best parameters for 1D model: {best_params_1d}")
print(f"Best cross-validation F1 score: {grid_search_1d.best_score_:.4f}")

# Train final model with best parameters
final_model_1d = RandomForestClassifier(**best_params_1d, random_state=42, n_jobs=-1, class_weight='balanced')
final_model_1d.fit(X_train_1d, y_train_1d)

# Evaluate on test set (column 1 of predict_proba is used as the score for AUC)
y_pred_1d = final_model_1d.predict(X_test_1d)
y_prob_1d = final_model_1d.predict_proba(X_test_1d)[:, 1]

print("\n1D Model Test Set Results:")
results_1d = evaluate_model(y_test_1d, y_pred_1d, y_prob_1d, "1D_Up Prediction")

# 20D_Up prediction
print("\n" + "=" * 60)
print("20D_Up Prediction Model - Random Forest Baseline")
print("=" * 60)

# train_data was built by stacking one company after another, so its rows are
# ordered "company, then date". TimeSeriesSplit takes folds by row position and
# expects rows in time order; without re-sorting, each fold would validate on a
# different company rather than on later dates. Sort the pooled training rows
# by Date (stable sort keeps the per-company order within a date).
# NOTE(review): relies on 'Date' sorting chronologically, the same assumption
# the per-company split already makes.
train_data_chrono_20d = train_data.sort_values('Date', kind='mergesort')

# The validation split is returned by preprocess_data but is not used below.
X_train_20d, y_train_20d, X_val_20d, y_val_20d, X_test_20d, y_test_20d, scaler_20d = preprocess_data(
    train_data_chrono_20d, val_data, test_data, '20D_Up', continuous_features, binary_features)

# Reuses the tscv splitter and param_grid defined for the 1D model.
print("Starting hyperparameter tuning for 20D model...")
grid_search_20d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_20d.fit(X_train_20d, y_train_20d)
best_params_20d = grid_search_20d.best_params_
print(f"Best parameters for 20D model: {best_params_20d}")
print(f"Best cross-validation F1 score: {grid_search_20d.best_score_:.4f}")

# Train final model with best parameters
final_model_20d = RandomForestClassifier(**best_params_20d, random_state=42, n_jobs=-1, class_weight='balanced')
final_model_20d.fit(X_train_20d, y_train_20d)

# Evaluate on test set (column 1 of predict_proba is used as the score for AUC)
y_pred_20d = final_model_20d.predict(X_test_20d)
y_prob_20d = final_model_20d.predict_proba(X_test_20d)[:, 1]

print("\n20D Model Test Set Results:")
results_20d = evaluate_model(y_test_20d, y_pred_20d, y_prob_20d, "20D_Up Prediction")

# SHAP Feature Importance Analysis
section_rule = "=" * 60
print(f"\n{section_rule}")
print("SHAP Feature Importance Analysis")
print(section_rule)

# Feature names, continuous columns first and binary columns last.
feature_names = [*continuous_features, *binary_features]
feature_names_array = np.asarray(feature_names)

# One explainer per fitted model, each given that model's training matrix.
print("Creating SHAP explainers...")
explainer_1d = shap.Explainer(final_model_1d, X_train_1d)
explainer_20d = shap.Explainer(final_model_20d, X_train_20d)

# Explain the held-out test rows; the additivity check is switched off.
print("Calculating SHAP values for 1D model...")
shap_values_1d = explainer_1d(X_test_1d, check_additivity=False)

print("Calculating SHAP values for 20D model...")
shap_values_20d = explainer_20d(X_test_20d, check_additivity=False)

# 1D Model SHAP Analysis
print("\n" + "=" * 40)
print("1D_Up Model SHAP Analysis")
print("=" * 40)

# The explanation values are either 3-D (samples, features, classes) or 2-D
# (samples, features). When a class axis is present, keep only class index 1
# (the positive class); this slice is reused for every plot and table below.
has_class_axis_1d = shap_values_1d.values.ndim == 3
if has_class_axis_1d:
    shap_vals_1d = shap_values_1d.values[:, :, 1]
else:
    shap_vals_1d = shap_values_1d.values

# Summary plot for 1D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_1d, X_test_1d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 1D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (1D model): the first test row
plt.figure(figsize=(10, 6))
if has_class_axis_1d:
    # Build a single-class Explanation for the first row. base_values may be
    # per-class (2-D) or per-sample (1-D), so pick the matching element.
    base_values_1d = shap_values_1d.base_values
    base_is_per_class_1d = hasattr(base_values_1d, 'shape') and len(base_values_1d.shape) > 1
    sample_explanation = shap.Explanation(
        values=shap_vals_1d[0],
        base_values=base_values_1d[0, 1] if base_is_per_class_1d else base_values_1d[0],
        data=shap_values_1d.data[0],
        feature_names=feature_names_array
    )
else:
    sample_explanation = shap_values_1d[0]

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (1D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 1D model.
# Mean_SHAP is the signed average contribution; Abs_Mean_SHAP is the average
# of the absolute contributions (mean |SHAP|) and is the ranking key.
mean_shap_1d = np.mean(shap_vals_1d, axis=0)
abs_mean_shap_1d = np.mean(np.abs(shap_vals_1d), axis=0)

importance_df_1d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_1d,
    'Abs_Mean_SHAP': abs_mean_shap_1d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 1D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# Column is the mean of |SHAP|, not the absolute value of the mean.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for rank, entry in enumerate(importance_df_1d.itertuples(index=False), 1):
    # A mean of exactly zero has no direction; do not report it as negative.
    if entry.Mean_SHAP > 0:
        direction = "↑ Positive"
    elif entry.Mean_SHAP < 0:
        direction = "↓ Negative"
    else:
        direction = "– Neutral"
    print(f"{rank:>2}. {entry.Feature:<20} {entry.Mean_SHAP:>+9.6f} {entry.Abs_Mean_SHAP:>11.6f} {direction}")

# 20D Model SHAP Analysis
print("\n" + "=" * 40)
print("20D_Up Model SHAP Analysis")
print("=" * 40)

# The explanation values are either 3-D (samples, features, classes) or 2-D
# (samples, features). When a class axis is present, keep only class index 1
# (the positive class); this slice is reused for every plot and table below.
has_class_axis_20d = shap_values_20d.values.ndim == 3
if has_class_axis_20d:
    shap_vals_20d = shap_values_20d.values[:, :, 1]
else:
    shap_vals_20d = shap_values_20d.values

# Summary plot for 20D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_20d, X_test_20d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 20D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (20D model): the first test row
plt.figure(figsize=(10, 6))
if has_class_axis_20d:
    # Build a single-class Explanation for the first row. base_values may be
    # per-class (2-D) or per-sample (1-D), so pick the matching element.
    base_values_20d = shap_values_20d.base_values
    base_is_per_class_20d = hasattr(base_values_20d, 'shape') and len(base_values_20d.shape) > 1
    sample_explanation = shap.Explanation(
        values=shap_vals_20d[0],
        base_values=base_values_20d[0, 1] if base_is_per_class_20d else base_values_20d[0],
        data=shap_values_20d.data[0],
        feature_names=feature_names_array
    )
else:
    sample_explanation = shap_values_20d[0]

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (20D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 20D model.
# Mean_SHAP is the signed average contribution; Abs_Mean_SHAP is the average
# of the absolute contributions (mean |SHAP|) and is the ranking key.
mean_shap_20d = np.mean(shap_vals_20d, axis=0)
abs_mean_shap_20d = np.mean(np.abs(shap_vals_20d), axis=0)

importance_df_20d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_20d,
    'Abs_Mean_SHAP': abs_mean_shap_20d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 20D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# Column is the mean of |SHAP|, not the absolute value of the mean.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for rank, entry in enumerate(importance_df_20d.itertuples(index=False), 1):
    # A mean of exactly zero has no direction; do not report it as negative.
    if entry.Mean_SHAP > 0:
        direction = "↑ Positive"
    elif entry.Mean_SHAP < 0:
        direction = "↓ Negative"
    else:
        direction = "– Neutral"
    print(f"{rank:>2}. {entry.Feature:<20} {entry.Mean_SHAP:>+9.6f} {entry.Abs_Mean_SHAP:>11.6f} {direction}")

## Gold

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Global matplotlib defaults for all figures in this notebook.
plt.rcParams.update({
    'font.size': 12,
    'figure.figsize': (12, 8),
})

# Load data
df = pd.read_csv('DATA.csv')

# Define feature columns: the continuous columns are the ones standardized
# during preprocessing, the binary column is passed through unchanged.
continuous_features = [
    '1D_PastChangePct', '5D_PastChangePct', '20D_PastChangePct',
    'J', 'mfi', 'MACD', 'MACD_diff', 'BB_rel_pos', 'Vol_Change',
    'Gold_Close',
]
binary_features = ['MA5_GT_MA20']
features = [*continuous_features, *binary_features]

# Split data by company in chronological order: for each company the first
# 70% of its date-sorted rows go to training, the next 15% to validation and
# the remainder to test.
train_data_list, val_data_list, test_data_list = [], [], []

for company in df['Company'].unique():
    # Ensure chronological order within the company before slicing.
    company_data = df.loc[df['Company'] == company].sort_values('Date')

    n_company = len(company_data)
    train_size = int(0.7 * n_company)
    val_size = int(0.15 * n_company)
    val_end = train_size + val_size

    train_data_list.append(company_data.iloc[:train_size])
    val_data_list.append(company_data.iloc[train_size:val_end])
    test_data_list.append(company_data.iloc[val_end:])

# Concatenate all companies' data
train_data = pd.concat(train_data_list, ignore_index=True)
val_data = pd.concat(val_data_list, ignore_index=True)
test_data = pd.concat(test_data_list, ignore_index=True)

# Report how many rows ended up in each split.
for split_label, split_frame in (
    ("Training", train_data),
    ("Validation", val_data),
    ("Test", test_data),
):
    print(f"{split_label} set size: {len(split_frame)}")

# Per-company row counts for the training and test splits, ordered by company label.
for split_label, split_frame in (("Training", train_data), ("Test", test_data)):
    company_counts = split_frame['Company'].value_counts().sort_index()
    print(f"{split_label} set company distribution:\n{company_counts}")

# Modified preprocessing function
def preprocess_data(train_df, val_df, test_df, target_col, continuous_features, binary_features):
    """Build model-ready feature matrices and targets for the three splits.

    Rows with a missing value in any feature column or in ``target_col`` are
    dropped from each split. Continuous features are standardized with a
    ``StandardScaler`` fitted on the training split only; binary features are
    appended unscaled, so the column order of every returned matrix is
    ``continuous_features`` followed by ``binary_features``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler)``
        where the ``X_*`` values are NumPy arrays, the ``y_*`` values are the
        target columns of the cleaned frames, and ``scaler`` is the fitted
        ``StandardScaler``.
    """
    required_cols = continuous_features + binary_features + [target_col]

    # Remove incomplete rows from every split before any fitting happens.
    train_clean, val_clean, test_clean = (
        frame.dropna(subset=required_cols) for frame in (train_df, val_df, test_df)
    )

    def _assemble(frame, scaled_continuous):
        # Scaled continuous columns first, raw binary columns last.
        return np.hstack([scaled_continuous, frame[binary_features].values])

    # Fit the scaler on training rows only, then reuse it for the other splits.
    scaler = StandardScaler()
    X_train_scaled = _assemble(
        train_clean, scaler.fit_transform(train_clean[continuous_features]))
    X_val_scaled = _assemble(
        val_clean, scaler.transform(val_clean[continuous_features]))
    X_test_scaled = _assemble(
        test_clean, scaler.transform(test_clean[continuous_features]))

    return (
        X_train_scaled, train_clean[target_col],
        X_val_scaled, val_clean[target_col],
        X_test_scaled, test_clean[target_col],
        scaler,
    )

# Evaluation function
def evaluate_model(y_true, y_pred, y_prob, model_name):
    """Score predictions, plot the results, and return the metrics.

    Two matplotlib figures are shown: a confusion-matrix heatmap and a table
    of the computed metrics. Both are titled with ``model_name`` so that
    figures from different models can be told apart.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Scores passed to ``roc_auc_score``.
        model_name: Label used in the figure titles.

    Returns:
        dict with keys ``confusion_matrix``, ``f1_macro``, ``accuracy``,
        ``auc`` and ``class_{0,1}_{precision,recall,f1}``.

    Note:
        The per-class arrays are read at positions 0 and 1 and the heatmap
        tick labels are fixed to "0"/"1", so this assumes exactly two classes
        that sort as 0 then 1 -- TODO confirm against the target columns.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Overall metrics
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)

    # Class-specific metrics (average=None yields one value per class)
    precision_per_class = precision_score(y_true, y_pred, average=None)
    recall_per_class = recall_score(y_true, y_pred, average=None)
    f1_per_class = f1_score(y_true, y_pred, average=None)

    # Confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Performance metrics table
    plt.figure(figsize=(10, 8))
    plt.axis('tight')
    plt.axis('off')
    plt.title(f'Performance Metrics - {model_name}')

    table_data = [
        ['Metric', 'Value'],
        ['Macro F1 Score', f'{f1_macro:.4f}'],
        ['Overall Accuracy', f'{accuracy:.4f}'],
        ['AUC', f'{auc:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 1 Precision', f'{precision_per_class[1]:.4f}'],
        ['Class 1 Recall', f'{recall_per_class[1]:.4f}'],
        ['Class 1 F1', f'{f1_per_class[1]:.4f}'],
        ['', ''],  # Empty row for separation
        ['Class 0 Precision', f'{precision_per_class[0]:.4f}'],
        ['Class 0 Recall', f'{recall_per_class[0]:.4f}'],
        ['Class 0 F1', f'{f1_per_class[0]:.4f}']
    ]

    table = plt.table(cellText=table_data[1:], colLabels=table_data[0],
                     cellLoc='center', loc='center',
                     colWidths=[0.4, 0.3])
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1.5, 2)

    # Set table style. Table row 0 is the header (colLabels), so table row i
    # lines up with table_data[i].
    for i, row_values in enumerate(table_data):
        for j in range(len(table_data[0])):
            cell = table[(i, j)]
            if i == 0:  # header
                cell.set_facecolor('#4CAF50')
                cell.set_text_props(weight='bold', color='white')
            elif row_values[0] == '':  # separator rows: blend into the background
                cell.set_facecolor('#ffffff')
                cell.set_text_props(color='white')
            else:
                cell.set_facecolor('#f0f0f0')

    plt.show()

    return {'confusion_matrix': cm, 'f1_macro': f1_macro, 'accuracy': accuracy, 'auc': auc,
            'class_0_precision': precision_per_class[0], 'class_0_recall': recall_per_class[0], 'class_0_f1': f1_per_class[0],
            'class_1_precision': precision_per_class[1], 'class_1_recall': recall_per_class[1], 'class_1_f1': f1_per_class[1]}

# 1D_Up prediction
print("=" * 60)
print("1D_Up Prediction Model - Random Forest Baseline")
print("=" * 60)

# train_data was built by stacking one company after another, so its rows are
# ordered "company, then date". TimeSeriesSplit takes folds by row position and
# expects rows in time order; without re-sorting, each fold would validate on a
# different company rather than on later dates. Sort the pooled training rows
# by Date (stable sort keeps the per-company order within a date).
# NOTE(review): relies on 'Date' sorting chronologically, the same assumption
# the per-company split already makes.
train_data_chrono_1d = train_data.sort_values('Date', kind='mergesort')

# The validation split is returned by preprocess_data but is not used below.
X_train_1d, y_train_1d, X_val_1d, y_val_1d, X_test_1d, y_test_1d, scaler_1d = preprocess_data(
    train_data_chrono_1d, val_data, test_data, '1D_Up', continuous_features, binary_features)


tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [6, 8, 10, None],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

print("Starting hyperparameter tuning for 1D model...")
grid_search_1d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_1d.fit(X_train_1d, y_train_1d)
best_params_1d = grid_search_1d.best_params_
print(f"Best parameters for 1D model: {best_params_1d}")
print(f"Best cross-validation F1 score: {grid_search_1d.best_score_:.4f}")

# Train final model with best parameters
final_model_1d = RandomForestClassifier(**best_params_1d, random_state=42, n_jobs=-1, class_weight='balanced')
final_model_1d.fit(X_train_1d, y_train_1d)

# Evaluate on test set (column 1 of predict_proba is used as the score for AUC)
y_pred_1d = final_model_1d.predict(X_test_1d)
y_prob_1d = final_model_1d.predict_proba(X_test_1d)[:, 1]

print("\n1D Model Test Set Results:")
results_1d = evaluate_model(y_test_1d, y_pred_1d, y_prob_1d, "1D_Up Prediction")

# 20D_Up prediction
print("\n" + "=" * 60)
print("20D_Up Prediction Model - Random Forest Baseline")
print("=" * 60)

# train_data was built by stacking one company after another, so its rows are
# ordered "company, then date". TimeSeriesSplit takes folds by row position and
# expects rows in time order; without re-sorting, each fold would validate on a
# different company rather than on later dates. Sort the pooled training rows
# by Date (stable sort keeps the per-company order within a date).
# NOTE(review): relies on 'Date' sorting chronologically, the same assumption
# the per-company split already makes.
train_data_chrono_20d = train_data.sort_values('Date', kind='mergesort')

# The validation split is returned by preprocess_data but is not used below.
X_train_20d, y_train_20d, X_val_20d, y_val_20d, X_test_20d, y_test_20d, scaler_20d = preprocess_data(
    train_data_chrono_20d, val_data, test_data, '20D_Up', continuous_features, binary_features)

# Reuses the tscv splitter and param_grid defined for the 1D model.
print("Starting hyperparameter tuning for 20D model...")
grid_search_20d = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_grid, cv=tscv, scoring='f1', n_jobs=-1, verbose=1
)

grid_search_20d.fit(X_train_20d, y_train_20d)
best_params_20d = grid_search_20d.best_params_
print(f"Best parameters for 20D model: {best_params_20d}")
print(f"Best cross-validation F1 score: {grid_search_20d.best_score_:.4f}")

# Train final model with best parameters
final_model_20d = RandomForestClassifier(**best_params_20d, random_state=42, n_jobs=-1, class_weight='balanced')
final_model_20d.fit(X_train_20d, y_train_20d)

# Evaluate on test set (column 1 of predict_proba is used as the score for AUC)
y_pred_20d = final_model_20d.predict(X_test_20d)
y_prob_20d = final_model_20d.predict_proba(X_test_20d)[:, 1]

print("\n20D Model Test Set Results:")
results_20d = evaluate_model(y_test_20d, y_pred_20d, y_prob_20d, "20D_Up Prediction")

# SHAP Feature Importance Analysis
section_rule = "=" * 60
print(f"\n{section_rule}")
print("SHAP Feature Importance Analysis")
print(section_rule)

# Feature names, continuous columns first and binary columns last.
feature_names = [*continuous_features, *binary_features]
feature_names_array = np.asarray(feature_names)

# One explainer per fitted model, each given that model's training matrix.
print("Creating SHAP explainers...")
explainer_1d = shap.Explainer(final_model_1d, X_train_1d)
explainer_20d = shap.Explainer(final_model_20d, X_train_20d)

# Explain the held-out test rows; the additivity check is switched off.
print("Calculating SHAP values for 1D model...")
shap_values_1d = explainer_1d(X_test_1d, check_additivity=False)

print("Calculating SHAP values for 20D model...")
shap_values_20d = explainer_20d(X_test_20d, check_additivity=False)

# 1D Model SHAP Analysis
print("\n" + "=" * 40)
print("1D_Up Model SHAP Analysis")
print("=" * 40)

# The explanation values are either 3-D (samples, features, classes) or 2-D
# (samples, features). When a class axis is present, keep only class index 1
# (the positive class); this slice is reused for every plot and table below.
has_class_axis_1d = shap_values_1d.values.ndim == 3
if has_class_axis_1d:
    shap_vals_1d = shap_values_1d.values[:, :, 1]
else:
    shap_vals_1d = shap_values_1d.values

# Summary plot for 1D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_1d, X_test_1d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 1D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (1D model): the first test row
plt.figure(figsize=(10, 6))
if has_class_axis_1d:
    # Build a single-class Explanation for the first row. base_values may be
    # per-class (2-D) or per-sample (1-D), so pick the matching element.
    base_values_1d = shap_values_1d.base_values
    base_is_per_class_1d = hasattr(base_values_1d, 'shape') and len(base_values_1d.shape) > 1
    sample_explanation = shap.Explanation(
        values=shap_vals_1d[0],
        base_values=base_values_1d[0, 1] if base_is_per_class_1d else base_values_1d[0],
        data=shap_values_1d.data[0],
        feature_names=feature_names_array
    )
else:
    sample_explanation = shap_values_1d[0]

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (1D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 1D model.
# Mean_SHAP is the signed average contribution; Abs_Mean_SHAP is the average
# of the absolute contributions (mean |SHAP|) and is the ranking key.
mean_shap_1d = np.mean(shap_vals_1d, axis=0)
abs_mean_shap_1d = np.mean(np.abs(shap_vals_1d), axis=0)

importance_df_1d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_1d,
    'Abs_Mean_SHAP': abs_mean_shap_1d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 1D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# Column is the mean of |SHAP|, not the absolute value of the mean.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for rank, entry in enumerate(importance_df_1d.itertuples(index=False), 1):
    # A mean of exactly zero has no direction; do not report it as negative.
    if entry.Mean_SHAP > 0:
        direction = "↑ Positive"
    elif entry.Mean_SHAP < 0:
        direction = "↓ Negative"
    else:
        direction = "– Neutral"
    print(f"{rank:>2}. {entry.Feature:<20} {entry.Mean_SHAP:>+9.6f} {entry.Abs_Mean_SHAP:>11.6f} {direction}")

# 20D Model SHAP Analysis
print("\n" + "=" * 40)
print("20D_Up Model SHAP Analysis")
print("=" * 40)

# The explanation values are either 3-D (samples, features, classes) or 2-D
# (samples, features). When a class axis is present, keep only class index 1
# (the positive class); this slice is reused for every plot and table below.
has_class_axis_20d = shap_values_20d.values.ndim == 3
if has_class_axis_20d:
    shap_vals_20d = shap_values_20d.values[:, :, 1]
else:
    shap_vals_20d = shap_values_20d.values

# Summary plot for 20D model
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_vals_20d, X_test_20d, feature_names=feature_names_array, show=False)
plt.title('SHAP Summary Plot - 20D_Up Prediction')
plt.tight_layout()
plt.show()

# Waterfall plot for a single sample (20D model): the first test row
plt.figure(figsize=(10, 6))
if has_class_axis_20d:
    # Build a single-class Explanation for the first row. base_values may be
    # per-class (2-D) or per-sample (1-D), so pick the matching element.
    base_values_20d = shap_values_20d.base_values
    base_is_per_class_20d = hasattr(base_values_20d, 'shape') and len(base_values_20d.shape) > 1
    sample_explanation = shap.Explanation(
        values=shap_vals_20d[0],
        base_values=base_values_20d[0, 1] if base_is_per_class_20d else base_values_20d[0],
        data=shap_values_20d.data[0],
        feature_names=feature_names_array
    )
else:
    sample_explanation = shap_values_20d[0]

shap.waterfall_plot(sample_explanation, show=False)
plt.title('SHAP Waterfall Plot - Single Sample (20D_Up Prediction)')
plt.tight_layout()
plt.show()

# Feature importance ranking table for 20D model.
# Mean_SHAP is the signed average contribution; Abs_Mean_SHAP is the average
# of the absolute contributions (mean |SHAP|) and is the ranking key.
mean_shap_20d = np.mean(shap_vals_20d, axis=0)
abs_mean_shap_20d = np.mean(np.abs(shap_vals_20d), axis=0)

importance_df_20d = pd.DataFrame({
    'Feature': feature_names,
    'Mean_SHAP': mean_shap_20d,
    'Abs_Mean_SHAP': abs_mean_shap_20d
}).sort_values('Abs_Mean_SHAP', ascending=False)

print("\nFeature Importance Ranking for 20D_Up Prediction (sorted by absolute value, preserving direction):")
print("=" * 75)
# Column is the mean of |SHAP|, not the absolute value of the mean.
print(f"{'Rank':<4} {'Feature':<20} {'Mean SHAP':<12} {'Mean |SHAP|':<12} {'Direction':<12}")
print("-" * 75)

for rank, entry in enumerate(importance_df_20d.itertuples(index=False), 1):
    # A mean of exactly zero has no direction; do not report it as negative.
    if entry.Mean_SHAP > 0:
        direction = "↑ Positive"
    elif entry.Mean_SHAP < 0:
        direction = "↓ Negative"
    else:
        direction = "– Neutral"
    print(f"{rank:>2}. {entry.Feature:<20} {entry.Mean_SHAP:>+9.6f} {entry.Abs_Mean_SHAP:>11.6f} {direction}")