In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#All the necessary imports
import os, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

#Silence library warnings and set the display / plotting defaults
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 200)
sns.set_style('darkgrid')

#Seed NumPy and TensorFlow with the same value (42)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

#Root directory of the competition data and the train/test folders beneath it
BASE_PATH = "/kaggle/input/gq-implied-volatility-forecasting"
TRAIN_DIR, TEST_DIR = (os.path.join(BASE_PATH, split) for split in ("train", "test"))

#Folder that receives generated submission files (created if missing)
OUT_DIR = "/kaggle/working/submissions"
os.makedirs(OUT_DIR, exist_ok=True)

#Optional explicit path to a label file; None means no explicit file is configured
LABEL_FILE = None

#Sequence length, LSTM epochs, batch size, TSCV splits and holdout seconds
SEQ_LEN = 60
LSTM_EPOCHS = 8
BATCH_SIZE = 128
TSCV_SPLITS = 5
HOLDOUT_SECONDS = 3600

In [None]:
#Report whether the dataset root is present
print("BASE_PATH exists:", Path(BASE_PATH).exists())

#List the CSV files of each split; a missing directory yields an empty list
train_files, test_files = [], []
if Path(TRAIN_DIR).exists():
    train_files = sorted(f for f in os.listdir(TRAIN_DIR) if f.lower().endswith('.csv'))
if Path(TEST_DIR).exists():
    test_files = sorted(f for f in os.listdir(TEST_DIR) if f.lower().endswith('.csv'))

print("train files:", train_files)
print("test files: ", test_files)

#Coin names are the train file names without their extension
coins = [os.path.splitext(f)[0] for f in train_files]
print("coins:", coins)

#An explicitly configured LABEL_FILE wins; otherwise probe a fixed list of file names under BASE_PATH
if LABEL_FILE and Path(LABEL_FILE).exists():
    label_candidates = [LABEL_FILE]
else:
    default_names = ['train_labels.csv','train_iv_labels.csv','labels.csv','submission.csv','train_labels_10s.csv']
    default_paths = [os.path.join(BASE_PATH, name) for name in default_names]
    label_candidates = [p for p in default_paths if Path(p).exists()]

#Load the first candidate that exists, or record that no label file is available
LABEL_PATH = label_candidates[0] if label_candidates else None
if LABEL_PATH is None:
    global_labels = None
    print("No global label file found. Models will expect iv column in train CSVs.")
else:
    print("Using label file:", LABEL_PATH)
    global_labels = pd.read_csv(LABEL_PATH, parse_dates=['timestamp'], low_memory=False)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

#Train and test directories of the competition dataset
TRAIN_DIR = "/kaggle/input/gq-implied-volatility-forecasting/train"
TEST_DIR = "/kaggle/input/gq-implied-volatility-forecasting/test"

#Coin names: train CSV file names with '.csv' removed, in sorted order
coins = sorted(f.replace('.csv','') for f in os.listdir(TRAIN_DIR) if f.endswith('.csv'))

#performing EDA(exploratory data analysis)
def eda_for_coin(coin):
    """Print summary statistics and plot mid_price for one coin.

    Reads `<coin>.csv` from TRAIN_DIR and TEST_DIR, prints shapes, the train
    column names, per-column null counts and `describe()` output for both
    frames, then draws both mid_price series against their timestamps on one
    figure. Each series is thinned with a stride of `len(frame)//1000`
    (at least 1). Returns None.
    """
    print(f"\n--- EDA for coin: {coin} ---")

    #Load both splits of this coin
    df_train = pd.read_csv(os.path.join(TRAIN_DIR, f"{coin}.csv"))
    df_test = pd.read_csv(os.path.join(TEST_DIR, f"{coin}.csv"))

    #Shapes, schema and missing-value counts
    print(f"Train shape: {df_train.shape}, Test shape: {df_test.shape}")
    print("Train columns:", df_train.columns.tolist())
    print("Missing values (train):\n", df_train.isnull().sum())
    print("Missing values (test):\n", df_test.isnull().sum())

    #Summary statistics, taken before the timestamp column is converted
    print("Train describe:\n", df_train.describe())
    print("Test describe:\n", df_test.describe())

    #Parse timestamps; values that cannot be parsed become NaT
    for frame in (df_train, df_test):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'], errors='coerce')

    #Overlay the thinned train and test mid_price series
    plt.figure(figsize=(12,4))
    for frame, legend_label in ((df_train, 'Train mid_price'), (df_test, 'Test mid_price')):
        step = max(1,len(frame)//1000)
        plt.plot(frame['timestamp'].iloc[::step],
                 frame['mid_price'].iloc[::step], label=legend_label)
    plt.title(f"{coin} mid_price over time")
    plt.xlabel("Timestamp")
    plt.ylabel("Mid Price")
    plt.legend()
    plt.show()
    
#Run the EDA report once for every coin found in the train directory
for coin_name in coins:
    eda_for_coin(coin_name)

In [1]:
import pandas as pd

#Path of the CSV that is read as the label table (columns used: timestamp, labels)
SUBMISSION_PATH = "/kaggle/input/gq-implied-volatility-forecasting/submission.csv"

#Read the file, parse 'timestamp' (unparseable values become NaT) and drop rows without a timestamp
#NOTE(review): the recorded output of this cell shows timestamps of 1970-01-01 plus a few
#nanoseconds, which suggests the column holds small integers rather than wall-clock times.
#Confirm these rows can really be matched against 10-second buckets of the train data.
global_labels = (
    pd.read_csv(SUBMISSION_PATH)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], errors='coerce'))
    .dropna(subset=['timestamp'])
    .reset_index(drop=True)
)

#Show the first rows as a sanity check
print("Loaded global_labels sample:")
print(global_labels.head())

Loaded global_labels sample:
                      timestamp    labels
0 1970-01-01 00:00:00.000000001  0.381921
1 1970-01-01 00:00:00.000000002  0.590922
2 1970-01-01 00:00:00.000000003  0.663046
3 1970-01-01 00:00:00.000000004  0.954191
4 1970-01-01 00:00:00.000000005  0.091550


In [None]:
import numpy as np

#Feature engineering step
def feature_engineering_coin(df, global_labels):
    """Aggregate one coin's raw rows into 10-second bars with derived features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows with at least `timestamp` and `mid_price`. The columns
        `bid_volume1..4` and `bid_price1..4` are used when present.
    global_labels : pandas.DataFrame or None
        Table with `timestamp` (datetime) and `labels` columns. When None,
        `label_t10` is filled with NaN instead of raising.

    Returns
    -------
    pandas.DataFrame
        One row per 10-second bucket with the columns `time_10s`, `open`,
        `high`, `low`, `close`, `volume`, `log_return`, `rolling_vol_30`,
        `avg_spread`, `avg_depth_imbalance`, `global_avg_mid_price`
        (NaN placeholder) and `label_t10`.
    """
    #Work on a copy so the caller's frame is left untouched
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    #Drop rows whose timestamp failed to convert, then order by time.
    #A stable sort keeps file order for tied timestamps, so 'first'/'last' below are deterministic.
    df = df.dropna(subset=['timestamp'])
    df = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    #Lower-case 's' is the non-deprecated alias for seconds
    df['time_10s'] = df['timestamp'].dt.floor('10s')

    #Open/high/low/close of mid_price inside each 10-second bucket
    ohlcv = df.groupby('time_10s')['mid_price'].agg(
        open='first', high='max', low='min', close='last').reset_index()

    #Per-row total of the available bid_volume1..4 columns, summed per bucket
    bid_volume_cols = [f'bid_volume{i}' for i in range(1,5) if f'bid_volume{i}' in df.columns]
    df['total_bid_volume'] = df[bid_volume_cols].sum(axis=1)
    vol = df.groupby('time_10s')['total_bid_volume'].sum().reset_index(name='volume')
    ohlcv = ohlcv.merge(vol, on='time_10s', how='left')

    #Log return of consecutive closes; the first bar has no predecessor and gets 0
    ohlcv['log_return'] = np.log(ohlcv['close'] / ohlcv['close'].shift(1)).fillna(0)
    #Rolling std over 3 bars of 10 seconds each (hence the '_30' suffix)
    ohlcv['rolling_vol_30'] = ohlcv['log_return'].rolling(window=3, min_periods=1).std().fillna(0)

    #Per-row distance between the highest and lowest available bid price, averaged per bucket
    bid_price_cols = [f'bid_price{i}' for i in range(1,5) if f'bid_price{i}' in df.columns]
    if bid_price_cols:
        df['spread'] = df[bid_price_cols].max(axis=1) - df[bid_price_cols].min(axis=1)
    else:
        #Without bid price columns there is nothing to measure
        df['spread'] = np.nan
    spread_agg = df.groupby('time_10s')['spread'].mean().reset_index(name='avg_spread')
    ohlcv = ohlcv.merge(spread_agg, on='time_10s', how='left')

    if len(bid_volume_cols) >= 4:
        #Imbalance between the first two and the next two bid volume levels,
        #normalised by their total (1e-9 avoids division by zero)
        top2 = df[bid_volume_cols[:2]].sum(axis=1)
        next2 = df[bid_volume_cols[2:4]].sum(axis=1)
        df['depth_imbalance'] = (top2 - next2) / (top2 + next2 + 1e-9)

        depth_imbalance_agg = df.groupby('time_10s')['depth_imbalance'].mean().reset_index(name='avg_depth_imbalance')
        ohlcv = ohlcv.merge(depth_imbalance_agg, on='time_10s', how='left')
    else:
        ohlcv['avg_depth_imbalance'] = 0.0

    #Placeholder column; this function leaves it as NaN
    ohlcv['global_avg_mid_price'] = np.nan

    if global_labels is None:
        #No label table available: keep the column so the output schema is stable
        ohlcv['label_t10'] = np.nan
    else:
        #Shift label timestamps back by 10 seconds so each bar receives the label stamped 10 seconds later
        global_labels_shifted = global_labels.copy()
        global_labels_shifted['timestamp_shifted'] = global_labels_shifted['timestamp'] - pd.Timedelta(seconds=10)
        label_map = global_labels_shifted.set_index('timestamp_shifted')['labels'].to_dict()
        ohlcv['label_t10'] = ohlcv['time_10s'].map(label_map)

    return ohlcv

In [None]:
import os
#Engineered 10-second features per coin, keyed by coin name
feature_dfs = {}

for coin in coins:
    print(f"Processing features for {coin} ...")

    #Read the coin's train CSV
    train_path = os.path.join(TRAIN_DIR, f"{coin}.csv")
    df_train = pd.read_csv(train_path)

    #Build the 10-second features (with the label table) and store them under the coin name
    ohlcv_features = feature_engineering_coin(df_train, global_labels)
    feature_dfs[coin] = ohlcv_features

#One 'close' column per coin, aligned on the 'time_10s' index
all_coins_ohlcv = pd.concat([df.set_index('time_10s')[['close']] for df in feature_dfs.values()], axis=1)
all_coins_ohlcv.columns = feature_dfs.keys()

#Row-wise mean of all coins' close prices
global_avg_mid_price = all_coins_ohlcv.mean(axis=1).reset_index(name='global_avg_mid_price')

#Attach the cross-coin average to every coin's feature frame
for coin in coins:
    #feature_engineering_coin already returns an all-NaN 'global_avg_mid_price' column. Merging on top
    #of it would yield 'global_avg_mid_price_x' / '_y' instead of one filled column, so drop it first.
    df = feature_dfs[coin].drop(columns=['global_avg_mid_price'], errors='ignore')
    df = df.merge(global_avg_mid_price, on='time_10s', how='left')
    feature_dfs[coin] = df

print("Feature engineering done for all coins.")

In [None]:
#All necessary imports
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import pearsonr

#Input folders, output folder and number of time-series CV splits used by this cell
TRAIN_DIR = "/kaggle/input/gq-implied-volatility-forecasting/train"
TEST_DIR = "/kaggle/input/gq-implied-volatility-forecasting/test"
OUT_DIR = "/kaggle/working/submissions"
TSCV_SPLITS = 5

#Seed NumPy's global random generator with 42
SEED = 42
np.random.seed(SEED)

#Make sure the output folder exists
os.makedirs(OUT_DIR, exist_ok=True)

#Function for calculating pearson correlation
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between y_true and y_pred."""
    #pearsonr yields (coefficient, p-value); only the coefficient is needed
    coefficient, _p_value = pearsonr(y_true, y_pred)
    return coefficient

#Coin names: train CSV file names with ".csv" removed, in sorted order
coins = sorted(f.replace(".csv", "") for f in os.listdir(TRAIN_DIR) if f.endswith(".csv"))

#Builds a volatility proxy from mid_price: rolling std of log returns (default window of 10 rows)
#The target, shifted 10 rows ahead, is derived from this proxy later in the per-coin loop
def create_volatility_proxy(df, window=10):
    """Add log-price, log-return and rolling-volatility columns to a copy of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `mid_price` column.
    window : int, default 10
        Number of rows in the rolling window used for the standard deviation.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with `mid_price` coerced to numeric and forward/back
        filled, plus the new columns `log_price`, `log_ret` and `vol_proxy`.
        Rows where the rolling std is undefined get `vol_proxy` = 0.
    """
    #Call copy(): the bare attribute `df.copy` is a bound method, and indexing it raises TypeError
    df = df.copy()
    df['mid_price'] = pd.to_numeric(df['mid_price'], errors='coerce').ffill().bfill()
    df['log_price'] = np.log(df['mid_price'])
    df['log_ret'] = df['log_price'].diff()
    df['vol_proxy'] = df['log_ret'].rolling(window).std().fillna(0)
    return df

#This is our feature_engineering step    
def feature_engineering(df, n_levels=4):
    """Derive order-book features and rolling volatility columns for one coin.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows containing at least `bid_price1`, `ask_price1`, `bid_volume1`
        and `ask_volume1`. Deeper levels are used when present.
    n_levels : int, default 4
        Number of order-book levels (1..n_levels) for which bid/ask
        imbalance ratios are computed.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with `mid_price`, `weighted_mid`, `spread`,
        `spread_ratio`, `price_imbalance_i` / `volume_imbalance_i` for each
        available level, the columns added by `create_volatility_proxy`, and
        `vol_5`, `vol_10`, `vol_30`.
    """
    df = df.copy()

    #Coerce every price/volume column to numeric and fill gaps forward, then backward
    for col in df.columns:
        if 'price' in col or 'volume' in col:
            df[col] = pd.to_numeric(df[col], errors='coerce').ffill().bfill()

    #Mid price is the average of the best bid and the best ask
    df['mid_price'] = (df['bid_price1'] + df['ask_price1']) / 2

    #Level-1 prices weighted by the volume on their own side (1e-9 avoids division by zero)
    df['weighted_mid'] = (
        df['bid_price1'] * df['bid_volume1'] +
        df['ask_price1'] * df['ask_volume1']
    ) / (df['bid_volume1'] + df['ask_volume1'] + 1e-9)

    #Spread is best ask minus best bid; spread_ratio expresses it relative to the mid price
    df['spread'] = df['ask_price1'] - df['bid_price1']
    df['spread_ratio'] = df['spread'] / df['mid_price']

    #range() excludes its stop value, so n_levels + 1 is required to include the last level.
    #The previous range(1, 4) silently stopped at level 3.
    for i in range(1, n_levels + 1):
        bid_p, ask_p = f'bid_price{i}', f'ask_price{i}'
        bid_v, ask_v = f'bid_volume{i}', f'ask_volume{i}'
        #Each ratio is only computed when both of its input columns exist
        if bid_p in df.columns and ask_p in df.columns:
            df[f'price_imbalance_{i}'] = df[bid_p] / (df[ask_p] + 1e-9)
        if bid_v in df.columns and ask_v in df.columns:
            df[f'volume_imbalance_{i}'] = df[bid_v] / (df[ask_v] + 1e-9)

    #Adds log_price, log_ret and vol_proxy
    df = create_volatility_proxy(df)

    #Rolling std of log returns over 5, 10 and 30 rows
    for w in [5, 10, 30]:
        df[f'vol_{w}'] = df['log_ret'].rolling(w).std().fillna(0)

    return df

#Per-coin prediction frames, concatenated into the final submission at the end
submission_list = []

#Train a LightGBM ensemble for each coin and predict its test set
for coin in coins:
    print(f"\n=== Processing {coin} ===")

    train_df = pd.read_csv(os.path.join(TRAIN_DIR, f"{coin}.csv"))
    test_df = pd.read_csv(os.path.join(TEST_DIR, f"{coin}.csv"))

    #Unparseable timestamps become NaT; those rows are dropped and the rest ordered by time
    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], errors='coerce')
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], errors='coerce')
    train_df = train_df.dropna(subset=['timestamp']).sort_values('timestamp')
    test_df = test_df.dropna(subset=['timestamp']).sort_values('timestamp')

    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    #Target is the volatility proxy 10 rows ahead; the final 10 rows have no target and are dropped
    train_df['target'] = train_df['vol_proxy'].shift(-10)
    train_df = train_df.dropna(subset=['target'])

    #Features: numeric columns present in both frames, excluding the timestamp, the target and its source columns
    exclude_cols = ['timestamp', 'target', 'vol_proxy', 'log_price', 'log_ret']
    feature_cols = [c for c in train_df.columns
                   if c not in exclude_cols
                   and c in test_df.columns
                   and pd.api.types.is_numeric_dtype(train_df[c])]

    #Skip coins that end up with no usable feature column
    if not feature_cols:
        print(f"  No valid features for {coin}, skipping...")
        continue

    X_train = train_df[feature_cols].fillna(0)
    y_train = train_df['target']
    X_test = test_df[feature_cols].fillna(0)

    #Named fold_models rather than `models`, so the `models` module imported from
    #tensorflow.keras in the imports cell is not overwritten by a list
    fold_models = []

    #Time-ordered cross validation: each fold validates on rows that follow its training rows
    tscv = TimeSeriesSplit(n_splits=TSCV_SPLITS)

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
        print(f"  Fold {fold+1}/{TSCV_SPLITS}")
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        #Up to 1000 trees with 31 leaves at learning rate 0.05; each fold gets its own seed
        model = lgb.LGBMRegressor(
            objective='regression',
            num_leaves=31,
            learning_rate=0.05,
            n_estimators=1000,
            random_state=SEED + fold
        )

        #Early stopping after 50 rounds without RMSE improvement on the validation fold; log every 100 rounds
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[
                lgb.early_stopping(50),
                lgb.log_evaluation(100)
            ]
        )

        fold_models.append(model)

        #Report the Pearson correlation on this fold's validation rows
        val_pred = model.predict(X_val)
        print(f"   Pearson: {pearson_corr(y_val, val_pred):.4f}")

    #Test prediction is the mean over all fold models
    test_preds = np.mean([model.predict(X_test) for model in fold_models], axis=0)

    #Keep the coin's predictions for the combined file and also save them on their own
    coin_sub = pd.DataFrame({
        'timestamp': test_df['timestamp'],
        'predicted': test_preds
    })
    submission_list.append(coin_sub)

    coin_path = os.path.join(OUT_DIR, f"{coin}_submission.csv")
    coin_sub.to_csv(coin_path, index=False)
    print(f"Saved {coin} predictions to {coin_path}")

#Combine all coins' predictions, sorted by timestamp, into submission.csv
if submission_list:
    final_submission = pd.concat(submission_list).sort_values('timestamp')
    final_path = os.path.join(OUT_DIR, "submission.csv")
    final_submission.to_csv(final_path, index=False)
    print(f"\nFinal submission saved to {final_path}")
else:
    print("No submissions generated")

In [None]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

#Input folders, output folder and number of time-series CV splits used by this cell
TRAIN_DIR = "/kaggle/input/gq-implied-volatility-forecasting/train"
TEST_DIR = "/kaggle/input/gq-implied-volatility-forecasting/test"
OUT_DIR = "/kaggle/working/submissions"
TSCV_SPLITS = 5

#Seed NumPy's global random generator with 42
SEED = 42
np.random.seed(SEED)

#Make sure the output folder exists
os.makedirs(OUT_DIR, exist_ok=True)

#Calculating pearson correlation
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between y_true and y_pred."""
    #pearsonr yields (coefficient, p-value); only the coefficient is needed
    coefficient, _p_value = pearsonr(y_true, y_pred)
    return coefficient

#Evaluating model using pearson_corr,mse,r2,rmse,mae
def evaluate_model(y_true, y_pred, model_name="Model"):
    """Print regression metrics, draw two diagnostic plots and return the metrics.

    Parameters
    ----------
    y_true : array-like exposing `.min()` / `.max()`
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with `y_true`.
    model_name : str, default "Model"
        Label used in the printed header and the plot titles.

    Returns
    -------
    dict
        Keys `mse`, `rmse`, `mae`, `r2` and `corr`.
    """
    #Compute all metrics up front; RMSE is the square root of MSE
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'corr': pearson_corr(y_true, y_pred),
    }

    print(f"\n{model_name} Evaluation:")
    print(f"MSE: {metrics['mse']:.4f}")
    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"MAE: {metrics['mae']:.4f}")
    print(f"R²: {metrics['r2']:.4f}")
    print(f"Pearson Correlation: {metrics['corr']:.4f}")

    #Scatter of predicted against actual values with a dashed red identity line
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_true, y=y_pred, alpha=0.6)
    lowest, highest = y_true.min(), y_true.max()
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.xlabel('Actual Volatility')
    plt.ylabel('Predicted Volatility')
    plt.title(f'{model_name} - Predicted vs Actual Volatility')
    plt.show()

    #Histogram (30 bins, with KDE) of the errors, defined as actual minus predicted
    residuals = y_true - y_pred
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True, bins=30)
    plt.xlabel('Prediction Error')
    plt.title(f'{model_name} - Error Distribution')
    plt.show()

    return metrics

#Plotting time_series_forecasting
#Plotting predicted vs actual volatility over time
def plot_time_series_forecast(dates, y_true, y_pred, model_name="Model"):
    """Draw actual and predicted values against `dates` on a single figure.

    `model_name` is used as the prefix of the plot title. Returns None.
    """
    plt.figure(figsize=(14, 6))

    #Actual series first, then the predictions with slight transparency on top
    curves = (
        (y_true, {'label': 'Actual Volatility'}),
        (y_pred, {'label': 'Predicted Volatility', 'alpha': 0.7}),
    )
    for values, line_kwargs in curves:
        plt.plot(dates, values, **line_kwargs)

    plt.xlabel('Date')
    plt.ylabel('Implied Volatility')
    plt.title(f'{model_name} - Actual vs Predicted Volatility Over Time')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

#Calculates directional_accuracy(% of time model correctly predicts the correct movement)
def directional_accuracy(y_true, y_pred):
    """Return the percentage of steps where prediction and truth move in the same direction.

    The direction of a step is the sign of the difference between consecutive
    values, so two flat steps count as a match.
    """
    actual_moves = np.sign(np.diff(y_true))
    predicted_moves = np.sign(np.diff(y_pred))
    matches = actual_moves == predicted_moves
    return np.mean(matches) * 100

#Builds a volatility proxy from mid_price: rolling std of log returns (default window of 10 rows)
#The target, shifted 10 rows ahead, is derived from this proxy later in the per-coin loop
def create_volatility_proxy(df, window=10):
    """Return a copy of df with log-price, log-return and rolling-volatility columns.

    `mid_price` is coerced to numeric and forward/back filled. `log_price`
    is its natural log, `log_ret` the row-to-row difference of `log_price`,
    and `vol_proxy` the rolling standard deviation of `log_ret` over `window`
    rows, with undefined values replaced by 0.
    """
    #Clean the price series first, then attach it and its derivatives to a copy
    clean_price = pd.to_numeric(df['mid_price'], errors='coerce').ffill().bfill()
    result = df.copy()
    result['mid_price'] = clean_price

    log_price = np.log(result['mid_price'])
    result['log_price'] = log_price
    result['log_ret'] = log_price.diff()

    rolling_std = result['log_ret'].rolling(window).std()
    result['vol_proxy'] = rolling_std.fillna(0)
    return result

#This is our feature_engineering step
def feature_engineering(df, n_levels=4):
    """Derive order-book features and rolling volatility columns for one coin.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows containing at least `bid_price1`, `ask_price1`, `bid_volume1`
        and `ask_volume1`. Deeper levels are used when present.
    n_levels : int, default 4
        Number of order-book levels (1..n_levels) for which bid/ask
        imbalance ratios are computed.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with `mid_price`, `weighted_mid`, `spread`,
        `spread_ratio`, `price_imbalance_i` / `volume_imbalance_i` for each
        available level, the columns added by `create_volatility_proxy`, and
        `vol_5`, `vol_10`, `vol_30`.
    """
    df = df.copy()

    #Coerce every price/volume column to numeric and fill gaps forward, then backward
    for col in df.columns:
        if 'price' in col or 'volume' in col:
            df[col] = pd.to_numeric(df[col], errors='coerce').ffill().bfill()

    #Mid price is the average of the best bid and the best ask
    df['mid_price'] = (df['bid_price1'] + df['ask_price1']) / 2

    #Level-1 prices weighted by the volume on their own side (1e-9 avoids division by zero)
    df['weighted_mid'] = (
        df['bid_price1'] * df['bid_volume1'] +
        df['ask_price1'] * df['ask_volume1']
    ) / (df['bid_volume1'] + df['ask_volume1'] + 1e-9)

    #Spread is best ask minus best bid; spread_ratio expresses it relative to the mid price
    df['spread'] = df['ask_price1'] - df['bid_price1']
    df['spread_ratio'] = df['spread'] / df['mid_price']

    #range() excludes its stop value, so n_levels + 1 is required to include the last level.
    #The previous range(1, 4) silently stopped at level 3.
    for i in range(1, n_levels + 1):
        bid_p, ask_p = f'bid_price{i}', f'ask_price{i}'
        bid_v, ask_v = f'bid_volume{i}', f'ask_volume{i}'
        #Each ratio is only computed when both of its input columns exist
        if bid_p in df.columns and ask_p in df.columns:
            df[f'price_imbalance_{i}'] = df[bid_p] / (df[ask_p] + 1e-9)
        if bid_v in df.columns and ask_v in df.columns:
            df[f'volume_imbalance_{i}'] = df[bid_v] / (df[ask_v] + 1e-9)

    #Adds log_price, log_ret and vol_proxy
    df = create_volatility_proxy(df)

    #Rolling std of log returns over 5, 10 and 30 rows
    for w in [5, 10, 30]:
        df[f'vol_{w}'] = df['log_ret'].rolling(w).std().fillna(0)

    return df

#Coin names: train CSV file names with ".csv" removed, in sorted order
coins = sorted([f.replace(".csv", "") for f in os.listdir(TRAIN_DIR) if f.endswith(".csv")])
#Per-coin prediction frames and per-coin averaged CV metrics
submission_list = []
all_metrics = []

#Train, evaluate and predict for each coin independently
for coin in coins:
    print(f"\n=== Processing {coin} ===")
    train_df = pd.read_csv(os.path.join(TRAIN_DIR, f"{coin}.csv"))
    test_df = pd.read_csv(os.path.join(TEST_DIR, f"{coin}.csv"))

    #Unparseable timestamps become NaT; those rows are dropped and the rest ordered by time
    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], errors='coerce')
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'], errors='coerce')
    train_df = train_df.dropna(subset=['timestamp']).sort_values('timestamp')
    test_df = test_df.dropna(subset=['timestamp']).sort_values('timestamp')

    #Same feature pipeline for the training and the test frame
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    #Target is the volatility proxy 10 rows ahead; the final 10 rows have no target and are dropped
    train_df['target'] = train_df['vol_proxy'].shift(-10)
    train_df = train_df.dropna(subset=['target'])

    #Features: numeric columns present in both frames, excluding the timestamp, the target and its source columns
    exclude_cols = ['timestamp', 'target', 'vol_proxy', 'log_price', 'log_ret']
    feature_cols = [c for c in train_df.columns
                   if c not in exclude_cols
                   and c in test_df.columns
                   and pd.api.types.is_numeric_dtype(train_df[c])]

    #Skip coins that end up with no usable feature column
    if not feature_cols:
        print(f"  No valid features for {coin}, skipping...")
        continue

    X_train = train_df[feature_cols].fillna(0)
    y_train = train_df['target']
    X_test = test_df[feature_cols].fillna(0)

    #Named fold_models rather than `models`, so the `models` module imported from
    #tensorflow.keras in the imports cell is not overwritten by a list
    fold_models = []
    fold_metrics = []

    #Time-ordered cross validation: each fold validates on rows that follow its training rows
    tscv = TimeSeriesSplit(n_splits=TSCV_SPLITS)

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
        print(f"  Fold {fold+1}/{TSCV_SPLITS}")
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        #Up to 1000 trees with 31 leaves at learning rate 0.05; each fold gets its own seed
        model = lgb.LGBMRegressor(
            objective='regression',
            num_leaves=31,
            learning_rate=0.05,
            n_estimators=1000,
            random_state=SEED + fold,
            n_jobs=-1
        )

        #Early stopping after 50 rounds without RMSE improvement on the validation fold; log every 100 rounds
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[
                lgb.early_stopping(50),
                lgb.log_evaluation(100)
            ]
        )

        fold_models.append(model)

        #Evaluate on this fold's validation rows and remember the metrics with the fold number
        val_pred = model.predict(X_val)
        fold_metric = evaluate_model(y_val, val_pred, f"{coin} - Fold {fold+1}")
        fold_metric['fold'] = fold+1
        fold_metrics.append(fold_metric)

    #Average every metric column over the folds
    fold_df = pd.DataFrame(fold_metrics)
    avg_metrics = fold_df.mean().to_dict()
    avg_metrics['coin'] = coin
    all_metrics.append(avg_metrics)

    print(f"\n{coin} Average CV Performance:")
    print(f"RMSE: {avg_metrics['rmse']:.4f}")
    print(f"MAE: {avg_metrics['mae']:.4f}")
    print(f"R²: {avg_metrics['r2']:.4f}")
    print(f"Pearson: {avg_metrics['corr']:.4f}")

    #Feature importance of the first fold's model. plot_importance creates its own figure when no
    #axes are passed, so the size goes through `figsize`; a preceding plt.figure() would only
    #leave an empty extra figure behind.
    importance_ax = lgb.plot_importance(fold_models[0], max_num_features=20, figsize=(10, 8))
    importance_ax.set_title(f'{coin} - Feature Importance')
    plt.show()

    #Test prediction is the mean over all fold models
    test_preds = np.mean([model.predict(X_test) for model in fold_models], axis=0)

    #X_val, y_val and val_idx still hold the LAST fold's split here, so this block
    #re-evaluates the last model on the last validation window and plots it over time
    last_val_pred = fold_models[-1].predict(X_val)
    evaluate_model(y_val, last_val_pred, f"{coin} - Final Validation")
    plot_time_series_forecast(train_df.iloc[val_idx]['timestamp'], y_val, last_val_pred, f"{coin} - Validation")

    #Keep the coin's predictions for the combined file and also save them on their own
    coin_sub = pd.DataFrame({
        'timestamp': test_df['timestamp'],
        'predicted': test_preds
    })
    submission_list.append(coin_sub)
    coin_path = os.path.join(OUT_DIR, f"{coin}_submission.csv")
    coin_sub.to_csv(coin_path, index=False)
    print(f"Saved {coin} predictions to {coin_path}")

#Write the combined submission (sorted by timestamp) and the per-coin metrics table,
#then print the overall performance of each coin
if submission_list:
    final_submission = pd.concat(submission_list).sort_values('timestamp')
    final_path = os.path.join(OUT_DIR, "submission.csv")
    final_submission.to_csv(final_path, index=False)
    print(f"\nFinal submission saved to {final_path}")

    metrics_df = pd.DataFrame(all_metrics)
    metrics_path = os.path.join(OUT_DIR, "performance_metrics.csv")
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Performance metrics saved to {metrics_path}")

    print("\nOverall Performance Across All Coins:")
    print(metrics_df[['coin', 'rmse', 'mae', 'r2', 'corr']].to_string(index=False))
else:
    print("No submissions generated")