## In this notebook i will create and train the first prediction model - Random Forest

In [2]:
import pandas as pd

df = pd.read_csv(
    r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_with_trends.csv",
    index_col=0
)


In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error

from eli5.sklearn import PermutationImportance
import eli5

In [3]:
df["target"] = df["close"].pct_change().shift(-1)
df = df.dropna(subset=["target"])
df = df.dropna()

feature_cols = [c for c in df.columns if c not in ("timestamp", "target")]
X_all = df[feature_cols]
y = df["target"]

In [5]:
tscv = TimeSeriesSplit(n_splits=5, gap=0)

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

cv_scores = cross_val_score(
    rf, X_all, y,
    cv=tscv,
    scoring="neg_mean_absolute_error"
)

print("MAE per split:", -cv_scores.round(6))
print("Average MAE  :", -cv_scores.mean())

MAE per split: [0.010598 0.004553 0.011453 0.003795 0.003912]
Average MAE  : 0.0068620079288530325


In [4]:
import pkg_resources, sklearn, eli5, numpy, pandas
print("scikit‑learn:", sklearn.__version__)
print("eli5        :", eli5.__version__)
print("numpy       :", numpy.__version__)
print("pandas      :", pandas.__version__)


scikit‑learn: 1.1.3
eli5        : 0.13.0
numpy       : 1.26.4
pandas      : 2.2.2


In [15]:
!python -m pip uninstall -y eli5 scikit-learn scikit-learn-intelex


Found existing installation: eli5 0.13.0
Uninstalling eli5-0.13.0:
  Successfully uninstalled eli5-0.13.0
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2


You can safely remove it manually.


In [16]:
!pip cache purge


Files removed: 1575


In [None]:
!python -m pip install "scikit-learn==1.1.3" "eli5==0.13.0"


In [6]:
from eli5.sklearn import PermutationImportance
import eli5



In [None]:
# Fit the model
rf.fit(X_all, y)

# Permutation importance without `n_jobs`
perm = PermutationImportance(
    rf,
    scoring="neg_mean_absolute_error",
    random_state=42,
    n_iter=5
)
perm.fit(X_all, y)

# Show results
eli5.show_weights(
    perm,
    feature_names=feature_cols,
    top=len(feature_cols)
)


Weight,Feature
0.0021  ± 0.0000,roc_24h
0.0020  ± 0.0000,roc_7days
0.0018  ± 0.0000,roc_4h
0.0018  ± 0.0000,Volume BTC
0.0017  ± 0.0000,roc_30days
0.0009  ± 0.0000,close
0.0009  ± 0.0000,low
0.0009  ± 0.0000,high


In [12]:
from eli5 import explain_weights, format_as_text

# Show results (no colors)
print(format_as_text(
    explain_weights(
        perm,
        feature_names=feature_cols,
        top=len(feature_cols)
    )
))


Explained as: feature importances

Feature importances, computed as a decrease in score when feature
values are permuted (i.e. become noise). This is also known as 
permutation importance.

If feature importances are computed on the same data as used for training, 
they don't reflect importance of features for generalization. Use a held-out
dataset if you want generalization feature importances.

0.0021 ± 0.0000  roc_24h
0.0020 ± 0.0000  roc_7days
0.0018 ± 0.0000  roc_4h
0.0018 ± 0.0000  Volume BTC
0.0017 ± 0.0000  roc_30days
0.0009 ± 0.0000  close
0.0009 ± 0.0000  low
0.0009 ± 0.0000  high


In [None]:

# This code checks if our system can use GPU or CPU 
import importlib, warnings, platform, sys

# Default: assume CPU
GPU_AVAILABLE = False
GPU_MESSAGE   = "🔹  No compatible GPU / cuML detected → using CPU route."

try:
    cudf_spec = importlib.util.find_spec("cudf")
    cuml_spec = importlib.util.find_spec("cuml.ensemble")
    if cudf_spec is not None and cuml_spec is not None:
        import cudf
        from cuml.ensemble import RandomForestRegressor as cuRF
        GPU_AVAILABLE = True
        GPU_MESSAGE   = "🚀  cuML & compatible GPU found → using GPU‑accelerated route."
except Exception as e:
    warnings.warn(f"GPU check failed: {e}\nFalling back to CPU.")

print(GPU_MESSAGE)
print("Python:", sys.version.split()[0], " | Platform:", platform.platform())


🔹  No compatible GPU / cuML detected → using CPU route.
Python: 3.10.6  | Platform: Windows-10-10.0.19045-SP0


In [18]:

import pandas as pd
from pathlib import Path

CSV_PATH = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_with_trends.csv"

df = pd.read_csv(CSV_PATH)

# 1. Create next‑period %‑return target
df["target"] = df["close"].pct_change().shift(-1)

# 2. Handle NaNs
#    • forward‑fill feature NaNs that come from look‑backs
#    • then drop any remaining rows that still have NaNs
df = (
    df.fillna(method="ffill")  # optional; keep if you like forward‑fill
      .dropna()                # removes last-row NaN in target + any leftovers
      .reset_index(drop=True)
)

# 3. Feature list
FEATURE_COLS = [c for c in df.columns if c != "target"]

print(f"Data shape  : {df.shape}")
print(f"# features  : {len(FEATURE_COLS)}")


Data shape  : (81799, 10)
# features  : 9


  df.fillna(method="ffill")  # optional; keep if you like forward‑fill
