# Model Training Pipeline - 15-Step Workflow
## Coins: ADA, BNB, BTC, DOGE, ETH

This notebook implements the requested 15-step workflow (Step 0, initialization, plus the 14 steps listed below):
1.  **Load Data** (Local CSVs)
2.  **EDA** (Info, Describe)
3.  **Null Checks**
4.  **IQR Outlier Cleaning**
5.  **Datetime Conversion**
6.  **Feature Extraction**
7.  **Drop NaNs**
8.  **Split X/y**
9.  **Train/Test Split**
10. **Normalization**
11. **Cross-Validation & Tuning**
12. **Train Models** (LR, RF, XGB, LSTM)
13. **Final Score**
14. **Save Models**

In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

warnings.filterwarnings('ignore')

# Configuration
# Coin symbols processed by every step of the pipeline.
COINS = ['ADA', 'BNB', 'BTC', 'DOGE', 'ETH']
# Directory searched for the per-coin "<COIN>_ML_ready.csv" input files.
DATA_DIR = 'data'
# Root directory for saved artifacts (scalers, models); created up front
# if it does not exist yet.
MODELS_DIR = 'models'
os.makedirs(MODELS_DIR, exist_ok=True)

### Helper Functions

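### Steps 4 & 6: Outlier, Feature, and Metric Helpers
Define `remove_outliers_iqr` (Step 4), `process_features` (Step 6: calculates the technical indicators SMA, RSI, Volatility, Lags and the target variable), and `evaluate_model` (R2, RMSE, MAE).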

In [2]:
def remove_outliers_iqr(df, columns):
    """Step 4: Drop rows that fall outside the IQR fences of the given columns.

    For each column in turn, the 25th and 75th percentiles are computed on
    the rows that are still present, and only rows whose value lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (both ends inclusive) are kept.
    Because the filters are applied one after another, the quartiles of a
    later column are computed on rows that survived the earlier columns.

    Args:
        df: Input DataFrame; it is copied, never modified.
        columns: Iterable of column names to filter on, in order.

    Returns:
        A filtered DataFrame that keeps the original index labels.
    """
    filtered = df.copy()
    for column in columns:
        first_quartile, third_quartile = (
            filtered[column].quantile(q) for q in (0.25, 0.75)
        )
        spread = third_quartile - first_quartile
        # between() is inclusive on both ends by default, matching >= / <=.
        in_range = filtered[column].between(
            first_quartile - 1.5 * spread,
            third_quartile + 1.5 * spread,
        )
        filtered = filtered[in_range]
    return filtered

def process_features(df):
    """Step 6: Add technical-indicator features and the prediction target.

    Works on a copy of ``df`` and appends, in this order:

    * ``SMA_20``: 20-row rolling mean of ``close``.
    * ``RSI_14``: ``100 - 100 / (1 + avg_gain / avg_loss)``, where both
      averages are 14-row rolling means of the positive / negative parts of
      the row-to-row change in ``close``.
    * ``close_lag_1``, ``close_lag_2``, ``close_lag_3``, ``close_lag_7``:
      ``close`` shifted back by that many rows.
    * ``volatility_20``: 20-row rolling standard deviation of ``close``.
    * ``target``: ``close`` of the following row (NaN on the last row).

    Rows without enough history are left as NaN; nothing is dropped here.

    Args:
        df: DataFrame with a ``close`` column convertible to float. Rows are
            presumably already in chronological order -- the function does
            not sort them.

    Returns:
        A new DataFrame with the extra columns; ``df`` itself is untouched.
    """
    out = df.copy()
    # Make sure prices are numeric before any rolling arithmetic.
    out['close'] = out['close'].astype(float)
    price = out['close']

    # Simple moving average.
    out['SMA_20'] = price.rolling(20).mean()

    # Relative strength index built from rolling average gains and losses.
    change = price.diff()
    avg_gain = (change.where(change > 0, 0)).rolling(14).mean()
    avg_loss = (-change.where(change < 0, 0)).rolling(14).mean()
    relative_strength = avg_gain / avg_loss
    out['RSI_14'] = 100 - (100 / (1 + relative_strength))

    # Lagged closing prices.
    for offset in (1, 2, 3, 7):
        out[f'close_lag_{offset}'] = price.shift(offset)

    # Rolling volatility.
    out['volatility_20'] = price.rolling(20).std()

    # Target: the close of the next row.
    out['target'] = price.shift(-1)

    return out

def evaluate_model(y_true, y_pred, model_name, horizon):
    """Score predictions and return the metrics as a single result row.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values aligned with ``y_true``.
        model_name: Label stored under the ``'Model'`` key.
        horizon: Label stored under the ``'Horizon'`` key.

    Returns:
        A dict with keys ``'Model'``, ``'Horizon'``, ``'R2'``, ``'RMSE'``
        (square root of the mean squared error) and ``'MAE'``.
    """
    return {
        'Model': model_name,
        'Horizon': horizon,
        'R2': r2_score(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'MAE': mean_absolute_error(y_true, y_pred),
    }

### Main Processing Loop

### Step 0: Initialization
Initialize the storage dictionary to hold data for all coins.

In [None]:
# Per-coin storage shared by the later steps, keyed by coin symbol. Each
# value is a dict that the steps fill in (for example 'df', 'X', 'y',
# 'model').
pipeline_data = {}
print("Initialized pipeline_data storage.")

### Step 1: Load Data
Load the pre-processed CSV files for each coin.

In [None]:
# Step 1: read each coin's CSV into pipeline_data[coin]['df'].
print(f"\n{'='*50}\nStep 1: Load Data\n{'='*50}")
for coin in COINS:
    print(f"Processing {coin}...")
    file_path = f"{DATA_DIR}/{coin}_ML_ready.csv"
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} rows.")
        # Start a fresh per-coin record; later steps add more keys to it.
        pipeline_data[coin] = {'df': df}
    else:
        # A missing file is reported and the coin is left out of
        # pipeline_data, so every later step skips it.
        print(f"File not found: {file_path}")

### Step 2: Exploratory Data Analysis (EDA)
Display basic information and statistics for the loaded data.

In [None]:
# Step 2: print structure and summary statistics for every loaded coin.
print(f"\n{'='*50}\nStep 2: EDA\n{'='*50}")
for coin, data in pipeline_data.items():
    print(f"\n--- {coin} EDA ---")
    df = data['df']
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line to the output.
    df.info()
    # Transpose so each column is a row, then keep the four headline stats.
    print(df.describe().T[['mean', 'std', 'min', 'max']])

### Step 3: Null Value Checks
Check for any missing values in the datasets.

In [None]:
# Step 3: report, per coin, the columns that contain missing values.
print(f"\n{'='*50}\nStep 3: Null Checks\n{'='*50}")
for coin, data in pipeline_data.items():
    print(f"\n--- {coin} Null Checks ---")
    df = data['df']
    # Count nulls per column once, then show only the columns that have any.
    null_counts = df.isnull().sum()
    print(null_counts[null_counts > 0])

### Step 5: Datetime Conversion
Convert the `open_time` column to datetime objects and sort by time. (This cell runs before Step 4, so outlier cleaning operates on time-sorted data.)

In [None]:
# Step 5: parse 'open_time' as datetimes and order each frame by it.
print(f"\n{'='*50}\nStep 5: Datetime Conversion\n{'='*50}")
for coin, data in pipeline_data.items():
    df = data['df']
    has_timestamp = 'open_time' in df.columns
    if has_timestamp:
        # The conversion is written into the existing frame; the sort then
        # produces a new, time-ordered frame that keeps the index labels.
        df['open_time'] = pd.to_datetime(df['open_time'])
        df = df.sort_values(by='open_time')
    # Frames without an 'open_time' column are stored back unchanged.
    data['df'] = df

### Step 4: Outlier Cleaning (IQR)
Remove outliers from `close` and `volume` columns using the Interquartile Range method.

In [None]:
# Step 4: drop rows whose 'close' or 'volume' lies outside the IQR fences.
print(f"\n{'='*50}\nStep 4: IQR Cleaning\n{'='*50}")
for coin, data in pipeline_data.items():
    print(f"\n--- {coin} IQR Cleaning ---")
    initial_len = len(data['df'])
    df = remove_outliers_iqr(data['df'], ['close', 'volume'])
    # Report how many rows the two filters removed in total.
    print(f"Removed {initial_len - len(df)} outliers.")
    data['df'] = df

### Step 6: Feature Extraction
Calculate technical indicators (SMA, RSI, Volatility, Lags) and the target variable.

In [None]:
# Step 6: replace each coin's frame with its feature-enriched copy.
print(f"\n{'='*50}\nStep 6: Feature Extraction\n{'='*50}")
for coin, data in pipeline_data.items():
    df = process_features(data['df'])
    data['df'] = df

### Step 7: Drop NaN Values
Remove rows with missing values created during feature extraction (e.g., rolling windows).

In [None]:
# Step 7: discard every row that has a NaN in any column.
print(f"\n{'='*50}\nStep 7: Drop NaNs\n{'='*50}")
for coin, data in pipeline_data.items():
    df = data['df'].dropna()
    print(f"{coin}: Rows after dropping NaNs: {len(df)}")
    data['df'] = df

### Step 8: Split Features (X) and Target (y)
Separate the feature columns from the target variable.

In [None]:
# Step 8: store the feature matrix (X) and target series (y) per coin.
print(f"\n{'='*50}\nStep 8: Split X/y\n{'='*50}")

# Model inputs; all of these columns are created by process_features.
feature_cols = [
    'SMA_20',
    'RSI_14',
    'volatility_20',
    'close_lag_1',
    'close_lag_2',
    'close_lag_3',
    'close_lag_7',
]

for coin, data in pipeline_data.items():
    df = data['df']
    data.update(X=df[feature_cols], y=df['target'])


### Step 9: Train/Test Split
Split the data into training and testing sets chronologically: the first 80% of rows are used for training and the last 20% for testing (no shuffling).

In [None]:
# Step 9: positional 80/20 split -- leading rows train, trailing rows test.
print(f"\n{'='*50}\nStep 9: Train/Test Split\n{'='*50}")

for coin, data in pipeline_data.items():
    X = data['X']
    y = data['y']
    # Index of the first test row.
    split_idx = int(len(X) * 0.8)
    data.update({
        'X_train_raw': X.iloc[:split_idx],
        'X_test_raw': X.iloc[split_idx:],
        'y_train': y.iloc[:split_idx],
        'y_test': y.iloc[split_idx:],
    })


### Step 10: Normalization (MinMaxScaler)
Scale the features to a range of [0, 1] using MinMaxScaler.

In [None]:
# Step 10: scale features with a per-coin MinMaxScaler and persist it.
print(f"\n{'='*50}\nStep 10: Normalization\n{'='*50}")
for coin, data in pipeline_data.items():
    scaler = MinMaxScaler()
    # Fit on the training rows only; the test rows reuse the fitted scaler.
    X_train = scaler.fit_transform(data['X_train_raw'])
    X_test = scaler.transform(data['X_test_raw'])
    data['X_train'], data['X_test'] = X_train, X_test

    # Save the fitted scaler under the coin's own model directory.
    coin_dir = f'{MODELS_DIR}/{coin}'
    os.makedirs(coin_dir, exist_ok=True)
    joblib.dump(scaler, f'{coin_dir}/scaler.pkl')

### Step 11 & 12: Cross-Validation & Model Training
Train a Linear Regression model for each coin. A `TimeSeriesSplit` is created for cross-validation but is not used in this cell, since plain Linear Regression has no hyperparameters to tune.

In [None]:
# Steps 11 & 12: fit one LinearRegression per coin on the scaled features.
print(f"\n{'='*50}\nStep 11 & 12: Tuning & Training\n{'='*50}")

# Time-ordered CV splitter; created here but not used by the loop below.
tscv = TimeSeriesSplit(n_splits=5)

for coin, data in pipeline_data.items():
    print(f"Training Linear Regression for {coin}...")
    # fit() returns the estimator itself, so it can be chained.
    lr = LinearRegression().fit(data['X_train'], data['y_train'])
    data['model'] = lr


### Step 13: Final Evaluation & Scoring
Evaluate each trained model, print its train and test R2 together with the test-set MAE and MSE, and save the model to disk (Step 14).

In [None]:
# Step 13: score each coin's model, save it, and print one table row.
print(f"\n{'='*50}\nStep 13: Final Score & Evaluation\n{'='*50}")
print(f"\n{'Coin':<10} | {'Model':<20} | {'Train R2':<10} | {'Test R2':<10} | {'MAE':<10} | {'MSE':<10}")
print("-" * 90)

for coin, data in pipeline_data.items():
    model = data['model']
    X_train, X_test = data['X_train'], data['X_test']
    y_train, y_test = data['y_train'], data['y_test']

    # Predictions on both splits so train and test R2 can be compared.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Persist the fitted model in the coin's directory under MODELS_DIR.
    joblib.dump(model, f'{MODELS_DIR}/{coin}/LinearRegression_model.pkl')

    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    # Error metrics are reported for the test split only.
    mae = mean_absolute_error(y_test, test_pred)
    mse = mean_squared_error(y_test, test_pred)

    print(f"{coin:<10} | {'LinearRegression':<20} | {train_r2:<10.4f} | {test_r2:<10.4f} | {mae:<10.4f} | {mse:<10.4f}")
