#### Step 5: Train/Test Split**

In [1]:
# Read processed excel
import pandas as pd
df = pd.read_csv(r"D:\DATA SCIENCE\SCHOOL PROJECTS\CRYPTO TRADER\data\processed\crypto_labeled_features.csv")



## MAY ADD FEATURE COLS???**

In [2]:
# Define features and target

target_col = "label"  # column with your Buy/Sell/Hold labels
feature_cols = df.columns.difference([target_col, "future_return", "open_time", "close_time"])

X = df[feature_cols]
y = df[target_col]


# Train/Validation/Test split

train_size = int(len(df) * 0.70)
val_size = int(len(df) * 0.15)
test_start = train_size + val_size

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_val = X.iloc[train_size:test_start]
y_val = y.iloc[train_size:test_start]

X_test = X.iloc[test_start:]
y_test = y.iloc[test_start:]


# Print summary

print(f"Train: {len(X_train)} samples ({len(X_train)/len(df)*100:.1f}%)")
print(f"Validation: {len(X_val)} samples ({len(X_val)/len(df)*100:.1f}%)")
print(f"Test: {len(X_test)} samples ({len(X_test)/len(df)*100:.1f}%)")


Train: 560 samples (69.9%)
Validation: 120 samples (15.0%)
Test: 121 samples (15.1%)


In [3]:
# Extract test-period prices for backtesting
test_prices = df.iloc[test_start:]["close"]


#### Step 6: Model Training

In [4]:
# Random Forest\
# LightGBM\
#  XGBoost\
#   CatBoost\
#   LSTM/GRU (bonus)

#Example:

#``` python
#from xgboost import XGBClassifier

# model = XGBClassifier()
# model.fit(X_train, y_train)

In [5]:
# Backtesting engine
INITIAL_CAPITAL = 10_000

def backtest(prices, signals):
    cash = INITIAL_CAPITAL
    position = 0

    for i in range(len(signals)):
        price = prices.iloc[i]

        if signals[i] == 2 and cash > 0:        # BUY
            position = cash / price
            cash = 0
        elif signals[i] == 0 and position > 0:  # SELL
            cash = position * price
            position = 0

    return cash + position * prices.iloc[-1]


##### Logistic Regression

In [6]:
# ================================
# 1. Imports
# ================================
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix




# ================================
# 4. Train / Validation / Test split (TIME-BASED)
# ================================
train_size = int(len(df) * 0.70)
val_size = int(len(df) * 0.15)
test_start = train_size + val_size

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_test = X.iloc[test_start:]
y_test = y.iloc[test_start:]

# Prices needed for backtesting
test_prices = df.iloc[test_start:]["close"]

print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# ================================
# 5. Backtesting engine
# ================================
INITIAL_CAPITAL = 10_000

def backtest(prices, signals):
    cash = INITIAL_CAPITAL
    position = 0

    for i in range(len(signals)):
        price = prices.iloc[i]

        if signals[i] == 2 and cash > 0:        # BUY
            position = cash / price
            cash = 0
        elif signals[i] == 0 and position > 0:  # SELL
            cash = position * price
            position = 0

    return cash + position * prices.iloc[-1]

# ================================
# 6. Train Logistic Regression
# ================================
log_reg = LogisticRegression(
    max_iter=1000,
    multi_class="multinomial"
)

log_reg.fit(X_train, y_train)

# ================================
# 7. Predict
# ================================
preds_lr = log_reg.predict(X_test)

# ================================
# 8. Evaluation
# ================================
print("\n--- Classification Metrics ---")
print("Accuracy:", accuracy_score(y_test, preds_lr))
print("Macro F1:", f1_score(y_test, preds_lr, average="macro"))
print(classification_report(y_test, preds_lr, target_names=["SELL", "HOLD", "BUY"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds_lr))

# ================================
# 9. Backtesting
# ================================
final_lr = backtest(test_prices, preds_lr)

buy_hold = INITIAL_CAPITAL * (test_prices.iloc[-1] / test_prices.iloc[0])
random_signals = np.random.choice([0, 1, 2], size=len(test_prices))
random_final = backtest(test_prices, random_signals)

print("\n--- Backtest Results ---")
print("Logistic Regression Final Value:", round(final_lr, 2))
print("Buy & Hold Final Value:", round(buy_hold, 2))
print("Random Strategy Final Value:", round(random_final, 2))


Train samples: 560
Test samples: 121





--- Classification Metrics ---
Accuracy: 0.71900826446281
Macro F1: 0.3199838187702265
              precision    recall  f1-score   support

        SELL       0.00      0.00      0.00        20
        HOLD       0.72      0.99      0.83        87
         BUY       0.50      0.07      0.12        14

    accuracy                           0.72       121
   macro avg       0.41      0.35      0.32       121
weighted avg       0.58      0.72      0.61       121

Confusion Matrix:
 [[ 0 20  0]
 [ 0 86  1]
 [ 0 13  1]]

--- Backtest Results ---
Logistic Regression Final Value: 7690.65
Buy & Hold Final Value: 7594.07
Random Strategy Final Value: 8070.81


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Random Forest Model

In [7]:
# ================================
# 1. Imports
# ================================
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# ================================
# 2. Train Random Forest
# ================================
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

# ================================
# 3. Predict (TEST SET ONLY — for fair comparison)
# ================================
rf_preds = rf_model.predict(X_test)

# ================================
# 4. Classification Evaluation
# ================================
print("\n" + "="*60)
print("RANDOM FOREST — TEST SET METRICS")
print("="*60)

print("Accuracy:", accuracy_score(y_test, rf_preds))
print("Macro F1:", f1_score(y_test, rf_preds, average="macro"))
print("\nClassification Report:")
print(classification_report(y_test, rf_preds, target_names=["SELL", "HOLD", "BUY"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_preds))

# ================================
# 5. Backtesting
# ================================
final_rf = backtest(test_prices, rf_preds)

buy_hold = INITIAL_CAPITAL * (test_prices.iloc[-1] / test_prices.iloc[0])
random_signals = np.random.choice([0, 1, 2], size=len(test_prices))
random_final = backtest(test_prices, random_signals)

print("\n" + "="*60)
print("RANDOM FOREST — BACKTEST RESULTS")
print("="*60)
print("Random Forest Final Value:", round(final_rf, 2))
print("Buy & Hold Final Value:", round(buy_hold, 2))
print("Random Strategy Final Value:", round(random_final, 2))



RANDOM FOREST — TEST SET METRICS
Accuracy: 0.4132231404958678
Macro F1: 0.23984468339307052

Classification Report:
              precision    recall  f1-score   support

        SELL       0.10      0.25      0.14        20
        HOLD       0.66      0.52      0.58        87
         BUY       0.00      0.00      0.00        14

    accuracy                           0.41       121
   macro avg       0.25      0.26      0.24       121
weighted avg       0.49      0.41      0.44       121

Confusion Matrix:
 [[ 5 15  0]
 [41 45  1]
 [ 6  8  0]]

RANDOM FOREST — BACKTEST RESULTS
Random Forest Final Value: 10188.12
Buy & Hold Final Value: 7594.07
Random Strategy Final Value: 8456.5


#### XG Boost

In [8]:
# ================================
# 1. Imports
# ================================
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ================================
# 2. Train XGBoost
# ================================
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    objective="multi:softmax",
    num_class=3,
    eval_metric="mlogloss"
)

xgb.fit(X_train, y_train)

# ================================
# 3. Predict on Test Set
# ================================
preds_xgb = xgb.predict(X_test)

# ================================
# 4. Evaluation
# ================================
print("\n" + "="*60)
print("XGBOOST — TEST SET METRICS")
print("="*60)

print("Accuracy:", accuracy_score(y_test, preds_xgb))
print("Macro F1:", f1_score(y_test, preds_xgb, average="macro"))
print("\nClassification Report:")
print(classification_report(y_test, preds_xgb, target_names=["SELL", "HOLD", "BUY"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds_xgb))

# ================================
# 5. Backtesting
# ================================
final_xgb = backtest(test_prices, preds_xgb)

buy_hold = INITIAL_CAPITAL * (test_prices.iloc[-1] / test_prices.iloc[0])
random_signals = np.random.choice([0, 1, 2], size=len(test_prices))
random_final = backtest(test_prices, random_signals)

print("\n" + "="*60)
print("XGBOOST — BACKTEST RESULTS")
print("="*60)
print("XGBoost Final Value:", round(final_xgb, 2))
print("Buy & Hold Final Value:", round(buy_hold, 2))
print("Random Strategy Final Value:", round(random_final, 2))



XGBOOST — TEST SET METRICS
Accuracy: 0.256198347107438
Macro F1: 0.21313388140237874

Classification Report:
              precision    recall  f1-score   support

        SELL       0.12      0.55      0.20        20
        HOLD       0.68      0.22      0.33        87
         BUY       0.20      0.07      0.11        14

    accuracy                           0.26       121
   macro avg       0.33      0.28      0.21       121
weighted avg       0.53      0.26      0.28       121

Confusion Matrix:
 [[11  8  1]
 [65 19  3]
 [12  1  1]]

XGBOOST — BACKTEST RESULTS
XGBoost Final Value: 9405.28
Buy & Hold Final Value: 7594.07
Random Strategy Final Value: 7881.56


#### Catboost

In [9]:
# ================================
# 1. Imports
# ================================
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ================================
# 2. Train CatBoost
# ================================
cat = CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.05,
    loss_function="MultiClass",
    verbose=False
)

cat.fit(X_train, y_train)

# ================================
# 3. Predict on Test Set
# ================================
preds_cat = cat.predict(X_test).flatten()  # flatten needed to match shape

# ================================
# 4. Evaluation
# ================================
print("\n" + "="*60)
print("CATBOOST — TEST SET METRICS")
print("="*60)

print("Accuracy:", accuracy_score(y_test, preds_cat))
print("Macro F1:", f1_score(y_test, preds_cat, average="macro"))
print("\nClassification Report:")
print(classification_report(y_test, preds_cat, target_names=["SELL", "HOLD", "BUY"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds_cat))

# ================================
# 5. Backtesting
# ================================
final_cat = backtest(test_prices, preds_cat)

buy_hold = INITIAL_CAPITAL * (test_prices.iloc[-1] / test_prices.iloc[0])
random_signals = np.random.choice([0, 1, 2], size=len(test_prices))
random_final = backtest(test_prices, random_signals)

print("\n" + "="*60)
print("CATBOOST — BACKTEST RESULTS")
print("="*60)
print("CatBoost Final Value:", round(final_cat, 2))
print("Buy & Hold Final Value:", round(buy_hold, 2))
print("Random Strategy Final Value:", round(random_final, 2))



CATBOOST — TEST SET METRICS
Accuracy: 0.5206611570247934
Macro F1: 0.2883206537052691

Classification Report:
              precision    recall  f1-score   support

        SELL       0.14      0.25      0.18        20
        HOLD       0.71      0.67      0.69        87
         BUY       0.00      0.00      0.00        14

    accuracy                           0.52       121
   macro avg       0.28      0.31      0.29       121
weighted avg       0.53      0.52      0.52       121

Confusion Matrix:
 [[ 5 14  1]
 [27 58  2]
 [ 4 10  0]]

CATBOOST — BACKTEST RESULTS
CatBoost Final Value: 9405.28
Buy & Hold Final Value: 7594.07
Random Strategy Final Value: 8081.06


#### LightGBM Model

In [10]:
# ================================
# 1. Imports
# ================================
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ================================
# 2. Train LightGBM
# ================================
lgbm_model = LGBMClassifier(
    n_estimators=200,
    max_depth=15,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=0
)

lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss'
)

# ================================
# 3. Predict on Test Set
# ================================
preds_lgbm = lgbm_model.predict(X_test)

# ================================
# 4. Evaluation
# ================================
print("\n" + "="*60)
print("LIGHTGBM — TEST SET METRICS")
print("="*60)

print("Accuracy:", accuracy_score(y_test, preds_lgbm))
print("Macro F1:", f1_score(y_test, preds_lgbm, average="macro"))
print("\nClassification Report:")
print(classification_report(y_test, preds_lgbm, target_names=["SELL", "HOLD", "BUY"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds_lgbm))

# ================================
# 5. Backtesting
# ================================
final_lgbm = backtest(test_prices, preds_lgbm)

buy_hold = INITIAL_CAPITAL * (test_prices.iloc[-1] / test_prices.iloc[0])
random_signals = np.random.choice([0, 1, 2], size=len(test_prices))
random_final = backtest(test_prices, random_signals)

print("\n" + "="*60)
print("LIGHTGBM — BACKTEST RESULTS")
print("="*60)
print("LightGBM Final Value:", round(final_lgbm, 2))
print("Buy & Hold Final Value:", round(buy_hold, 2))
print("Random Strategy Final Value:", round(random_final, 2))



LIGHTGBM — TEST SET METRICS
Accuracy: 0.38016528925619836
Macro F1: 0.31730237700386954

Classification Report:
              precision    recall  f1-score   support

        SELL       0.15      0.45      0.22        20
        HOLD       0.72      0.39      0.51        87
         BUY       0.23      0.21      0.22        14

    accuracy                           0.38       121
   macro avg       0.37      0.35      0.32       121
weighted avg       0.57      0.38      0.43       121

Confusion Matrix:
 [[ 9 10  1]
 [44 34  9]
 [ 8  3  3]]

LIGHTGBM — BACKTEST RESULTS
LightGBM Final Value: 9474.5
Buy & Hold Final Value: 7594.07
Random Strategy Final Value: 8556.88


#### LSTM Model

In [11]:
#!pip install torch torchvision torchaudio


#### LSTM MODEL FOR PRICE DIRECTION PREDICTION

In [12]:
# ================================
# PyTorch LSTM for tabular features
# ================================
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np

# ================================
# 1. Convert tabular X_train/X_test to "sequence" format
# ================================
# For tabular data, we can treat each row as a "sequence of length 1"
X_train_t = torch.tensor(X_train.values[:, np.newaxis, :], dtype=torch.float32)  # shape: (samples, seq_len=1, features)
y_train_t = torch.tensor(y_train.values, dtype=torch.long)

X_test_t = torch.tensor(X_test.values[:, np.newaxis, :], dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# ================================
# 2. Define LSTM model
# ================================
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, output_dim)
    
    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # take last time step
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = LSTMClassifier(input_dim=X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ================================
# 3. Train LSTM
# ================================
epochs = 12
for epoch in range(epochs):
    model.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# ================================
# 4. Predict
# ================================
model.eval()
with torch.no_grad():
    logits = model(X_test_t)
    y_pred = torch.argmax(logits, dim=1).numpy()

# ================================
# 5. Evaluation
# ================================
print("\n" + "="*60)
print("PYTORCH LSTM — TEST SET METRICS")
print("="*60)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average='macro'))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["SELL", "HOLD", "BUY"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ================================
# 6. Backtesting
# ================================
final_lstm = backtest(test_prices, y_pred)

buy_hold = INITIAL_CAPITAL * (test_prices.iloc[-1] / test_prices.iloc[0])
random_signals = np.random.choice([0, 1, 2], size=len(test_prices))
random_final = backtest(test_prices, random_signals)

print("\n" + "="*60)
print("PYTORCH LSTM — BACKTEST RESULTS")
print("="*60)
print("LSTM Final Value:", round(final_lstm, 2))
print("Buy & Hold Final Value:", round(buy_hold, 2))
print("Random Strategy Final Value:", round(random_final, 2))


Epoch 1/12, Loss: 1.1483
Epoch 2/12, Loss: 1.0045
Epoch 3/12, Loss: 0.9479
Epoch 4/12, Loss: 0.9375
Epoch 5/12, Loss: 0.8525
Epoch 6/12, Loss: 1.0787
Epoch 7/12, Loss: 1.0045
Epoch 8/12, Loss: 0.9231
Epoch 9/12, Loss: 0.9236
Epoch 10/12, Loss: 1.0963
Epoch 11/12, Loss: 0.8971
Epoch 12/12, Loss: 0.6038

PYTORCH LSTM — TEST SET METRICS
Accuracy: 0.71900826446281
Macro F1: 0.27884615384615385

Classification Report:
              precision    recall  f1-score   support

        SELL       0.00      0.00      0.00        20
        HOLD       0.72      1.00      0.84        87
         BUY       0.00      0.00      0.00        14

    accuracy                           0.72       121
   macro avg       0.24      0.33      0.28       121
weighted avg       0.52      0.72      0.60       121

Confusion Matrix:
 [[ 0 20  0]
 [ 0 87  0]
 [ 0 14  0]]

PYTORCH LSTM — BACKTEST RESULTS
LSTM Final Value: 10000.0
Buy & Hold Final Value: 7594.07
Random Strategy Final Value: 9099.46


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#### Step 8 : Serialise the model

In [37]:
# src/train.py or notebook cell after training
import os
import joblib

# -------------------------------
# 1. Determine project root
# -------------------------------
try:
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
except NameError:
    # for notebooks
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# -------------------------------
# 2. Make sure the models folder exists
# -------------------------------
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)

# -------------------------------
# 3. Path to save the model
# -------------------------------
model_path = os.path.join(models_dir, "buy_sell_classifier.pkl")

# -------------------------------
# 4. Save the trained model
# -------------------------------
# 'model' is your trained scikit-learn model
joblib.dump(model, model_path)
print(f"Model saved successfully at: {model_path}")


Model saved successfully at: d:\DATA SCIENCE\SCHOOL PROJECTS\CRYPTO TRADER\models\buy_sell_classifier.pkl


#### Step 9: Prediction pipeline

In [41]:
def predict(features):
    model = joblib.load("models/buy_sell_classifier.pkl")
    return model.predict(features)