### 1. Import libraries and define coin paths

In [35]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

# Maps each coin name to the CSV holding its merged price/sentiment data.
# Paths are relative to the notebook's working directory.
coin_info = dict(
    bitcoin="outputs/bitcoin_merged_sentiment.csv",
    ethereum="outputs/ethereum_merged_sentiment.csv",
    solana="outputs/solana_merged_sentiment.csv",
    cardano="outputs/cardano_merged_sentiment.csv",
)

### 2. Define function to create lag features

In [36]:
def create_lag_features(df, lags=3):
    """Return a copy of ``df`` with lagged return and sentiment columns added.

    For every ``lag`` in ``1..lags`` two columns are appended:
    ``returns_lag{lag}`` (``df['returns']`` shifted by ``lag`` rows) and
    ``sentiment_lag{lag}`` (``df['fear_greed_score']`` shifted by ``lag`` rows).
    Rows containing a NaN in *any* column are then dropped, which removes at
    least the first ``lags`` rows.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'returns'`` and ``'fear_greed_score'`` columns.
        Shifting is positional, so rows are assumed to already be in
        chronological order (TODO confirm against the CSVs).
    lags : int, default 3
        Number of lagged copies to create for each of the two columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns added and NaN rows removed; the
        original index labels of the surviving rows are preserved.

    Raises
    ------
    KeyError
        If ``'returns'`` or ``'fear_greed_score'`` is missing from ``df``.
    """
    # Work on a private copy so the caller's frame is never mutated.
    out = df.copy()
    for lag in range(1, lags + 1):
        out[f'returns_lag{lag}'] = out['returns'].shift(lag)
        out[f'sentiment_lag{lag}'] = out['fear_greed_score'].shift(lag)
    # In-place is safe here: `out` is local, and the returned frame stays a
    # plain owner of its data, so callers can add columns without
    # SettingWithCopyWarning.
    out.dropna(inplace=True)
    return out

In [37]:
# Baseline data for the single-coin models below: Bitcoin, with 'returns' as the target.
# The path comes from coin_info so this cell and the per-coin loop read the same file.
df = pd.read_csv(coin_info["bitcoin"])
df['date'] = pd.to_datetime(df['date'])

# Features = every column except the target and the two non-numeric columns.
X = df.drop(columns=['returns', 'date', 'sentiment_label'])  # also 'volatility' if that's the target
y = df['returns']

# Optional debug: confirm that every remaining feature column is numeric.
print(X.dtypes)

# 80/20 split without shuffling, so the test rows are the last 20% of the file
# (assumes the CSV is sorted by date — TODO confirm).
# random_state is kept for parity with the other cells; it has no effect when shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)

fear_greed_score    int64
dtype: object


### 3. Train Random Forest model

In [38]:
# Baseline Random Forest: fit on the training split, predict the held-out split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Report the three regression metrics in a fixed order.
print("\nRandom Forest:")
for metric_label, metric_fn in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R2:", r2_score),
):
    print(metric_label, metric_fn(y_test, rf_preds))


Random Forest:
MSE: 0.0006658987342820316
MAE: 0.018706336494837134
R2: -0.2743745633235064


### 4. Train XGBoost model

In [39]:
# Baseline XGBoost regressor on the same split as the Random Forest cell.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

# Compute all metrics first, then print them in insertion order.
xgb_scores = {
    "MSE:": mean_squared_error(y_test, xgb_preds),
    "MAE:": mean_absolute_error(y_test, xgb_preds),
    "R2:": r2_score(y_test, xgb_preds),
}

print("\nXGBoost:")
for score_label, score_value in xgb_scores.items():
    print(score_label, score_value)



XGBoost:
MSE: 0.0006577488942794492
MAE: 0.018769514198713946
R2: -0.25877767409733066


### 5. Core Loop to Train Models and Save Plots

In [51]:
os.makedirs("../models", exist_ok=True)
os.makedirs("../results", exist_ok=True)


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the training split and score it on the held-out split.

    Returns ``(model, predictions, r2)`` where ``model`` is the same (now
    fitted) estimator object, ``predictions`` is ``model.predict(X_eval)`` and
    ``r2`` is ``r2_score(y_eval, predictions)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return model, predictions, r2_score(y_eval, predictions)


def _save_prediction_plot(actual, predicted_series, title, out_path, n_points=50):
    """Plot the first ``n_points`` actual vs. predicted values and save the figure.

    ``actual`` is a pandas Series; ``predicted_series`` is an iterable of
    ``(predictions, legend_label, color)`` tuples drawn as dashed lines.
    The figure is written to ``out_path`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(actual.values[:n_points], label='Actual', color='black')
    for predictions, legend_label, color in predicted_series:
        ax.plot(predictions[:n_points], label=legend_label, linestyle='--', color=color)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(out_path)
    # Close this specific figure (not just "the current one") so figures
    # cannot accumulate across loop iterations.
    plt.close(fig)


# For every coin: build lag features, train RF and XGBoost models for both
# volatility and returns, persist the four models, and save two diagnostic plots.
for coin, path in coin_info.items():
    print(f"\n\n=== {coin.upper()} ===")

    if not os.path.exists(path):
        print(f"❌ File not found for {coin}: {path}")
        continue

    df = pd.read_csv(path)
    df['date'] = pd.to_datetime(df['date'])
    df = create_lag_features(df)

    # Create volatility if not present: rolling 7-row standard deviation of returns.
    if 'volatility' not in df.columns:
        df['volatility'] = df['returns'].rolling(window=7).std()

    # Drop the warm-up rows of the rolling window (and any other NaN rows).
    df = df.dropna()

    # train_test_split needs at least one row on each side of the split.
    if len(df) < 2:
        print(f"⚠️ Not enough rows for {coin} after dropping NaNs: {len(df)}")
        continue

    # Features & targets: every lag column is a feature.
    features = [col for col in df.columns if 'lag' in col]
    X = df[features]
    y_vol = df['volatility']
    y_ret = df['returns']

    # One unshuffled 80/20 split shared by both targets, so the feature rows
    # and both target series are cut at exactly the same position.
    (X_train, X_test,
     y_vol_train, y_vol_test,
     y_ret_train, y_ret_test) = train_test_split(
        X, y_vol, y_ret, test_size=0.2, shuffle=False
    )

    # Train Volatility Models
    rf_vol_model, rf_vol_preds, rf_vol_r2 = _fit_and_score(
        RandomForestRegressor(n_estimators=100, random_state=42),
        X_train, y_vol_train, X_test, y_vol_test,
    )
    xgb_vol_model, xgb_vol_preds, xgb_vol_r2 = _fit_and_score(
        XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        X_train, y_vol_train, X_test, y_vol_test,
    )

    # Save Volatility Models
    joblib.dump(rf_vol_model, f"../models/{coin}_rf_vol_model.pkl")
    joblib.dump(xgb_vol_model, f"../models/{coin}_xgb_vol_model.pkl")

    # Train Return Models
    rf_ret_model, rf_ret_preds, rf_ret_r2 = _fit_and_score(
        RandomForestRegressor(n_estimators=100, random_state=42),
        X_train, y_ret_train, X_test, y_ret_test,
    )
    xgb_ret_model, xgb_ret_preds, xgb_ret_r2 = _fit_and_score(
        XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        X_train, y_ret_train, X_test, y_ret_test,
    )

    # Save Return Models
    joblib.dump(rf_ret_model, f"../models/{coin}_rf_ret_model.pkl")
    joblib.dump(xgb_ret_model, f"../models/{coin}_xgb_ret_model.pkl")

    # Plot Volatility
    _save_prediction_plot(
        y_vol_test,
        [
            (rf_vol_preds, f'RF (R²={rf_vol_r2:.2f})', 'blue'),
            (xgb_vol_preds, f'XGB (R²={xgb_vol_r2:.2f})', 'green'),
        ],
        f"{coin.upper()} - Volatility Prediction",
        f"../results/{coin}_volatility_pred.png",
    )

    # Plot Returns
    _save_prediction_plot(
        y_ret_test,
        [
            (rf_ret_preds, f'RF Returns (R²={rf_ret_r2:.2f})', 'orange'),
            (xgb_ret_preds, f'XGB Returns (R²={xgb_ret_r2:.2f})', 'purple'),
        ],
        f"{coin.upper()} - Returns Prediction",
        f"../results/{coin}_returns_pred.png",
    )

    print(f"✅ Models trained and plots saved for {coin.upper()}")



=== BITCOIN ===
✅ Models trained and plots saved for BITCOIN


=== ETHEREUM ===
✅ Models trained and plots saved for ETHEREUM


=== SOLANA ===
✅ Models trained and plots saved for SOLANA


=== CARDANO ===
✅ Models trained and plots saved for CARDANO


In [53]:
# Sanity check: reload one persisted model and verify it deserializes to the expected estimator.
# joblib and RandomForestRegressor are already imported in the first cell, so no re-import here.
# NOTE: joblib.load unpickles arbitrary objects — only load files produced by this notebook.
model = joblib.load("../models/bitcoin_rf_ret_model.pkl")
assert isinstance(model, RandomForestRegressor), f"unexpected model type: {type(model)!r}"
print(type(model))  # should be RandomForestRegressor


<class 'sklearn.ensemble._forest.RandomForestRegressor'>


### 6. Save models

In [50]:
# Save models
# `coin` and the four model variables are whatever the per-coin training loop
# left behind, i.e. they belong to the LAST coin processed. This cell therefore
# re-saves that coin's models under the same file names the loop used.
# The volatility files must hold the volatility models (rf_vol_model /
# xgb_vol_model); dumping the standalone returns baselines (rf_model /
# xgb_model) here would overwrite them with estimators trained on a different
# target and feature set.
os.makedirs("../models", exist_ok=True)
joblib.dump(rf_vol_model, f"../models/{coin}_rf_vol_model.pkl")
joblib.dump(xgb_vol_model, f"../models/{coin}_xgb_vol_model.pkl")
joblib.dump(rf_ret_model, f"../models/{coin}_rf_ret_model.pkl")
joblib.dump(xgb_ret_model, f"../models/{coin}_xgb_ret_model.pkl")

['../models/cardano_xgb_ret_model.pkl']