## In this notebook i will create and train the first prediction model

In [None]:
#this code reads the dataframe from the saved csv
import pandas as pd
df = pd.read_csv(
    r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version.csv",
    index_col=0,
    parse_dates=True
)
#print(df)


In [4]:
#data quality checker
import pandas as pd
import numpy as np

def check_data_problems(df):
    problems_summary = {}

    # 1) Check for missing values
    missing = df.isnull().sum()
    if missing.any():
        print(">> MISSING VALUES found per column:")
        print(missing[missing > 0])
        problems_summary['missing_values'] = missing[missing > 0].to_dict()
    else:
        print("No missing values detected.")
        problems_summary['missing_values'] = {}
    
    # 2) Check for duplicate rows
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f">> DUPLICATE ROWS found: {duplicate_count}")
        problems_summary['duplicate_rows'] = duplicate_count
    else:
        print("No duplicate rows found.")
        problems_summary['duplicate_rows'] = 0
    
    # 3) Negative or zero price/volume checks
    price_cols = [col for col in ['open', 'high', 'low', 'close'] if col in df.columns]
    volume_cols = [col for col in ['Volume BTC', 'Volume USDT', 'Volume USD', 'volume'] if col in df.columns]

    invalid_prices = {}
    for col in price_cols:
        non_pos = (df[col] <= 0).sum()
        if non_pos > 0:
            invalid_prices[col] = non_pos
    
    invalid_volumes = {}
    for col in volume_cols:
        non_pos = (df[col] <= 0).sum()
        if non_pos > 0:
            invalid_volumes[col] = non_pos

    if invalid_prices:
        print(">> INVALID (≤0) PRICE VALUES found:")
        for c, count in invalid_prices.items():
            print(f"   Column '{c}': {count} rows")
        problems_summary['invalid_prices'] = invalid_prices
    else:
        print("No invalid (zero/negative) price values found.")
        problems_summary['invalid_prices'] = {}
    
    if invalid_volumes:
        print(">> INVALID (≤0) VOLUME VALUES found:")
        for c, count in invalid_volumes.items():
            print(f"   Column '{c}': {count} rows")
        problems_summary['invalid_volumes'] = invalid_volumes
    else:
        print("No invalid (zero/negative) volume values found.")
        problems_summary['invalid_volumes'] = {}

    # 4) Out-of-order date index check (only if index is datetime-like)
    if isinstance(df.index, pd.DatetimeIndex):
        if not df.index.is_monotonic_increasing:
            print(">> The date index is NOT strictly increasing. Some timestamps may be out of order.")
            problems_summary['date_order'] = "Not strictly increasing"
        else:
            print("Date index is in ascending order (strictly increasing).")
            problems_summary['date_order'] = "Ascending"
    else:
        print("Index is not a DatetimeIndex (skipping date-order check).")
        problems_summary['date_order'] = None

    # 5) Data-type checks for numeric columns
    numeric_checks = {}
    for col in price_cols + volume_cols:
        if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
            numeric_checks[col] = "Non-numeric type"
    if numeric_checks:
        print(">> NON-NUMERIC COLUMNS found (expected numeric):")
        for c, msg in numeric_checks.items():
            print(f"   Column '{c}' => {msg}")
    else:
        print("All price/volume columns have numeric types.")
    problems_summary['non_numeric_columns'] = numeric_checks

    print("\n=== DATA QUALITY CHECK COMPLETE ===\n")
    return problems_summary


In [None]:
# 3. Run the data-quality checker using your df_prices DataFrame
problems_report = check_data_problems(df)

# 4. If you want to do something programmatic with the results:
print("Problems Summary (as dict):")
print(problems_report)

# Additionally, display the rows where invalid (≤0) PRICE VALUES are found
if 'open' in df.columns:
    invalid_open = df[df['open'] <= 0]
    print("\nRows with invalid 'open' values (≤0):")
    print(invalid_open)

if 'low' in df.columns:
    invalid_low = df[df['low'] <= 0]
    print("\nRows with invalid 'low' values (≤0):")
    print(invalid_low)
    


In [None]:
#This code adds a column for the next hour price ,
#  in order for us to do the prediction calculation
df["next_hour"] = df["close"].shift(-1)

#When you create df["next_hour"] = df["close"].shift(-1), you are explicitly telling your model
# (or your data pipeline) that your prediction target is the close price of the next row (i.e., the next hour).

In [None]:
df

In [9]:
# This code checks if the price went up from a ceratin hour to its next hour
# and the  result type was converted to int for our ML model.
df["Target"] = (df["next_hour"] > df["close"]).astype(int)

In [None]:
#This code removes all the data that came before a certain date 
#(I used a copy of our dataframe just to check that it works) , the code for our df is 
# df=df.loc["2016-01-30":].copy()
#dfcopy=df.loc["2016-01-30":].copy()
#print(dfcopy)

### Below is a sample code snippet that uses scikit-learn’s metrices to search for the best combination of n_estimators and min_samples_split

In [None]:
'''
n_estimators:
The number of trees in the forest. Increasing this number can improve performance (by reducing variance) up to a certain point, but it also increases computation time.

min_samples_split:
The minimum number of samples required to split an internal node. Higher values make the trees shallower (simpler), which can help prevent overfitting.
'''

import time
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter grid
n_estimators_list = [100, 200, 300]
min_samples_split_list = [2, 5, 10, 20]

print("Starting manual hyperparameter evaluation:\n")

# Loop through each combination one-by-one
for n_est in n_estimators_list:
    for min_split in min_samples_split_list:
        print(f"Evaluating combination: n_estimators = {n_est}, min_samples_split = {min_split}")
        model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)
        
        # Start timing the training and evaluation
        start_time = time.time()
        model.fit(train[predictors], train["Target"])
        elapsed_time = time.time() - start_time
        
        # Evaluate the model on the test set
        test_pred = model.predict(test[predictors])
        
        # Calculate metrics
        acc = accuracy_score(test["Target"], test_pred)
        prec = precision_score(test["Target"], test_pred)
        rec = recall_score(test["Target"], test_pred)
        f1 = f1_score(test["Target"], test_pred)
        
        # For ROC-AUC, we need probability estimates
        test_proba = model.predict_proba(test[predictors])[:, 1]
        roc_auc = roc_auc_score(test["Target"], test_proba)
        
        conf_matrix = confusion_matrix(test["Target"], test_pred)
        class_report = classification_report(test["Target"], test_pred)
        
        # Print out the metrics
        print(f"Accuracy:      {acc:.4f}")
        print(f"Precision:     {prec:.4f}")
        print(f"Recall:        {rec:.4f}")
        print(f"F1 Score:      {f1:.4f}")
        print(f"ROC-AUC Score: {roc_auc:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("Classification Report:")
        print(class_report)
        print(f"Training & evaluation time: {elapsed_time:.2f} seconds")
        print("-" * 50)

'''
 Accuracy:
Overall, how many predictions are correct.

Precision:
Of the instances predicted as positive (price went up), how many were actually positive. This is critical if false positives are costly.

Recall (Sensitivity):
Of all the actual positive instances, how many did the model correctly capture. This is key if missing a positive event is expensive.

F1-Score:
The harmonic mean of precision and recall. It balances the two, which is especially useful when there’s an imbalance between classes.

ROC-AUC Score:
Provides an aggregate measure of performance across all classification thresholds, indicating how well the model distinguishes between classes.

Confusion Matrix:
Gives you a breakdown of true positives, false positives, true negatives, and false negatives. This is useful for understanding the types of errors your model makes. '
'''


In [None]:

import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter grid for this evaluation
n_estimators_list = [100, 200, 300]
min_samples_split_list = [50, 100]

print("Starting manual hyperparameter evaluation:\n")

# Loop through each combination one by one
for n_est in n_estimators_list:
    for min_split in min_samples_split_list:
        print(f"Evaluating combination: n_estimators = {n_est}, min_samples_split = {min_split}")
        model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)
        start_time = time.time()
        
        # Train the model
        model.fit(train[predictors], train["Target"])
        elapsed_time = time.time() - start_time
        
        # Evaluate the model on the test set
        test_pred = model.predict(test[predictors])
        acc = accuracy_score(test["Target"], test_pred)
        prec = precision_score(test["Target"], test_pred)
        rec = recall_score(test["Target"], test_pred)
        f1 = f1_score(test["Target"], test_pred)
        test_proba = model.predict_proba(test[predictors])[:, 1]
        roc_auc = roc_auc_score(test["Target"], test_proba)
        conf_matrix = confusion_matrix(test["Target"], test_pred)
        class_report = classification_report(test["Target"], test_pred)
        
        # Print out the metrics
        print(f"Accuracy:      {acc:.4f}")
        print(f"Precision:     {prec:.4f}")
        print(f"Recall:        {rec:.4f}")
        print(f"F1 Score:      {f1:.4f}")
        print(f"ROC-AUC Score: {roc_auc:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("Classification Report:")
        print(class_report)
        print(f"Training & evaluation time: {elapsed_time:.2f} seconds")
        print("-" * 50)


In [None]:
# finlizaiton of the data accuracy mesarmuments table
import pandas as pd

data = [
    # First batch
    {"n_estimators":100, "min_samples_split": 2,  "Accuracy":0.5016, "Precision":0.5163, "Recall":0.3598, "F1":0.4241, "ROC-AUC":0.5037},
    {"n_estimators":100, "min_samples_split": 5,  "Accuracy":0.5032, "Precision":0.5185, "Recall":0.3605, "F1":0.4253, "ROC-AUC":0.5075},
    {"n_estimators":100, "min_samples_split":10,  "Accuracy":0.4959, "Precision":0.5090, "Recall":0.3241, "F1":0.3961, "ROC-AUC":0.5053},
    {"n_estimators":100, "min_samples_split":20,  "Accuracy":0.4990, "Precision":0.5136, "Recall":0.3302, "F1":0.4020, "ROC-AUC":0.5041},

    {"n_estimators":200, "min_samples_split": 2,  "Accuracy":0.4991, "Precision":0.5127, "Recall":0.3580, "F1":0.4216, "ROC-AUC":0.5039},
    {"n_estimators":200, "min_samples_split": 5,  "Accuracy":0.5024, "Precision":0.5173, "Recall":0.3601, "F1":0.4247, "ROC-AUC":0.5063},
    {"n_estimators":200, "min_samples_split":10,  "Accuracy":0.4968, "Precision":0.5104, "Recall":0.3249, "F1":0.3970, "ROC-AUC":0.5052},
    {"n_estimators":200, "min_samples_split":20,  "Accuracy":0.4988, "Precision":0.5135, "Recall":0.3262, "F1":0.3989, "ROC-AUC":0.5045},

    {"n_estimators":300, "min_samples_split": 2,  "Accuracy":0.4987, "Precision":0.5122, "Recall":0.3571, "F1":0.4208, "ROC-AUC":0.5049},
    {"n_estimators":300, "min_samples_split": 5,  "Accuracy":0.5015, "Precision":0.5160, "Recall":0.3596, "F1":0.4238, "ROC-AUC":0.5063},
    {"n_estimators":300, "min_samples_split":10, "Accuracy":0.4972, "Precision":0.5109, "Recall":0.3258, "F1":0.3979, "ROC-AUC":0.5060},
    {"n_estimators":300, "min_samples_split":20, "Accuracy":0.4988, "Precision":0.5135, "Recall":0.3270, "F1":0.3996, "ROC-AUC":0.5051},

    # Second batch
    {"n_estimators":100, "min_samples_split": 50, "Accuracy":0.4973, "Precision":0.5117, "Recall":0.3131, "F1":0.3885, "ROC-AUC":0.5030},
    {"n_estimators":100, "min_samples_split":100,"Accuracy":0.4990, "Precision":0.5155, "Recall":0.2930, "F1":0.3736, "ROC-AUC":0.5046},

    {"n_estimators":200, "min_samples_split": 50, "Accuracy":0.4995, "Precision":0.5151, "Recall":0.3132, "F1":0.3896, "ROC-AUC":0.5043},
    {"n_estimators":200, "min_samples_split":100,"Accuracy":0.4987, "Precision":0.5151, "Recall":0.2875, "F1":0.3691, "ROC-AUC":0.5047},

    {"n_estimators":300, "min_samples_split": 50, "Accuracy":0.4993, "Precision":0.5149, "Recall":0.3135, "F1":0.3897, "ROC-AUC":0.5049},
    {"n_estimators":300, "min_samples_split":100,"Accuracy":0.4995, "Precision":0.5166, "Recall":0.2891, "F1":0.3707, "ROC-AUC":0.5047},
]

df_results = pd.DataFrame(data)
df_results


In [None]:
# code for visuilsaution of the meamrents datafarme

import matplotlib.pyplot as plt

# Sort the DataFrame (optional, just for tidiness)
df_results.sort_values(["n_estimators","min_samples_split"], inplace=True)

# Metrics we want to plot (excluding the parameters themselves)
metrics = ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC"]

# Reshape (melt) so we can do a grouped bar chart easily
df_melted = df_results.melt(
    id_vars=["n_estimators","min_samples_split"], 
    value_vars=metrics, 
    var_name="Metric", 
    value_name="Score"
)

# Create a "label" column to represent each combination on the x-axis
df_melted["Combo"] = df_melted.apply(
    lambda row: f"n={row['n_estimators']}, split={row['min_samples_split']}", axis=1
)

# Now we pivot so that each (Combo, Metric) is mapped to a Score
# (This step can be done in different ways, but pivot_table is convenient)
pivot_df = df_melted.pivot_table(
    index="Combo", 
    columns="Metric", 
    values="Score"
)

# Sort combos by n_estimators then min_samples_split in the row index
# (They should already be in that order from the prior sorting.)
pivot_df = pivot_df.loc[pivot_df.index]

# Plot: Grouped bar chart
plt.figure(figsize=(12, 6))  # Make the figure a bit larger
x = range(len(pivot_df.index))  # each index is one group
bar_width = 0.15  # or 0.1, depending on how many metrics you have

for i, metric in enumerate(metrics):
    # shift each bar group by i*bar_width
    plt.bar(
        [pos + i*bar_width for pos in x],
        pivot_df[metric],
        width=bar_width,
        label=metric
    )

plt.xlabel("n_estimators, min_samples_split")
plt.ylabel("Score")
plt.title("Random Forest Performance by Hyperparameters")

# Ticks in the middle of the group
plt.xticks(
    [pos + bar_width*(len(metrics)-1)/2 for pos in x],
    pivot_df.index, 
    rotation=45, 
    ha='right'
)

plt.ylim(0, 1)  # all these metrics range from 0 to 1 in classification tasks
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
#use of the visuilzation code

for x, y, val in zip(
    df_results["min_samples_split"], 
    df_results["n_estimators"], 
    df_results[metric]
):
    ax.text(x, y, f"{val:.3f}", ha="center", va="center", fontsize=8, color="white")


## Model Training 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split the data: 80% for training, 20% for testing
# Setting shuffle=False preserves the sequential order (important for time series)
train, test = train_test_split(dfcopy, test_size=0.2, shuffle=False)

# List of predictor columns to be used for training
predictors = ["close", "Volume BTC", "open", "high", "low"]

# Set learning parameters:
# - n_estimators: number of decision trees (experiment with this for optimal performance)
# - min_samples_split: minimum samples required to split an internal node (helps protect against overfitting)
# - random_state: ensures reproducible results

#from the check we did we got that the best paramters where n_esimator = 100 and min split = 5

model = RandomForestClassifier(n_estimators=100, min_samples_split=5, random_state=1)

# Train the model using the predictor columns to predict the target
model.fit(train[predictors], train["Target"])

# Optionally, you can print the shapes to verify the split
print("Training set shape:", train.shape)
print("Test set shape:", test.shape)


In [None]:
#This code will check our model , (when we said the market would go up did it acctualy go up)
from sklearn.metrics import precision_score

test = dfcopy.iloc[:-100] # The last 100 rows will be put in the test set
predictors = ["close","Volume BTC","open","high","low"]
#this will generate predictions using our model
preds = model.predict(test[predictors])


In [None]:
# this will convert our predictions into a series fromat (easier to read)
import pandas as pd 
preds = pd.Series(preds,index=test.index) 

In [None]:
#show the prediction array
#preds

In [None]:
#Calculation of the precission score
precision_score(test["Target"],preds)

0.7617517256987666

In [None]:
#this code will save the model so u can keep working on it later
import os
import joblib

# Ensure the directory exists
os.makedirs('models', exist_ok=True)

# Save the model
joblib.dump(model, 'models/random_forest_model.pkl')



In [None]:
#Load the model back up 
import joblib

# Load the saved model from file
model = joblib.load('models/random_forest_model.pkl')


In [None]:
# this function wraps up everything we did before into a function
import pandas as pd
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"]) #fitting of the model using the trainig and target
    preds = model.predict(test[predictors]) #genrating predictions
    preds = pd.Series(preds, index=test.index, name="Predictions") #combining hte model into a series
    combined = pd.concat([test["Target"], preds], axis=1) # combinig everything togehter to a data frame
    return combined



In [None]:
# a backtesting function , which takes the data , ml model , preddictors , start and step value
#start = the amount of that data that will train the model
# start=48000 - > learn on the first 48000 rows (first 8 years of data)
#step = the amount of data that will be learned in each step (we will go from a year to another year)
def backtest(data, model, predictors, start=48000, step=6000):
    all_predictions = [] # list of dataframes of predictions for a singel year

    for i in range(start, data.shape[0], step): # loop across the data year by year
        train = data.iloc[:i].copy() #(writing .copy to avoid the copy warning)
        test = data.iloc[i:i+step].copy() 
        predictions = predict(train, test, predictors, model) # predict function to genreate predictions
        all_predictions.append(predictions) #append the predictions for a giving year

    return pd.concat(all_predictions) #concat all the dataframes togheter


In [None]:
#backtest the data with the model and predictors we calcaulted earlier
predictions = backtest(dfcopy, model, predictors)


In [None]:
#check how many days we predicted the market would go up vs down 
#value_count will count how many times each type of prediction was made
predictions["Predictions"].value_counts()


Predictions
0    18970
1    13213
Name: count, dtype: int64

In [None]:
# look at the precision score
precision_score(predictions["Target"],predictions["Predictions"])

#0.5085900249754031



0.5085900249754031

In [None]:
#benchmark to check the prectange of days where the market acctualy went up
predictions["Target"].value_counts() / predictions.shape[0]
 
#1    0.505484
#0    0.494516

Target
1    0.505484
0    0.494516
Name: count, dtype: float64

In [None]:
#create a variety of rolling avreges , to help identify 
#trends of wheter the stock will go up or down

# trends for the last 2 days , 5 days , 60 days , 250 days , 1000 days
horizons = [24*2,24*5,24*60,24*250,24*1000]
new_predictors = []

for horizon in horizons: # we will loop through the horizions and calcaulte a rolling avrege
    rolling_averages = dfcopy.rolling(horizon).mean()

    #Column for that will contatin the close value of each horizion
    ratio_column = f"close_ratio_{horizon}"

    #calculating the ratio bettwen our current close value and each of the horzions
    #(and adding it to the data frame)
    dfcopy[ratio_column] = dfcopy["close"] / rolling_averages["close"]

    #the number of hours in the current horrizion that the stock price went up
    trend_column = f"Trend_{horizon}"
    dfcopy[trend_column] = dfcopy.shift(1).rolling(horizon).sum()["Target"]

    new_predictors += [ratio_column,trend_column]


In [None]:
dfcopy
# why we have Nans?
#when panads cant find enough rows prior to our current row in order to compute the rolling avrege it gives us nan


In [None]:
#drop the rows where we have na in the columns
dfcopy = dfcopy.dropna()

#we can see now that we dropped a few years since we needed them to calcualte hte trend 1000 and close 1000 days of trading


In [None]:
model = RandomForestClassifier(n_estimators=200,min_samples_split=50,random_state=1)

In [None]:
# changing a little bit our predict fucntion that now will return the probability that the stoock price would go up or down

import pandas as pd
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"]) 
    preds = model.predict_proba(test[predictors])[:,1] #we get the second column of this (the prob that the stock would go up) 
    #now we would set a trehshold that only if the probability is 60% to go up then the model would predict the increase
    preds[preds>=.6] = 1 # it will reduce the number of days we can predict the price to go up , but we would increase the the prediction accuracy
    preds[preds<.6] = 0
    preds = pd.Series(preds, index=test.index, name="Predictions") 
    combined = pd.concat([test["Target"], preds], axis=1) 
    return combined


In [None]:
#now we let the predictions to realy on the ratio and the trends , since the value or the prices themself dont really give us too much inforamtion
#compared to the ratio or the value compared to the days before 
predictions = backtest(dfcopy,model,new_predictors)

In [None]:
predictions["Predictions"].value_counts()

#the print
#Predictions
#1    4219
#0    3963

Predictions
1    4219
0    3963
Name: count, dtype: int64

In [None]:
precision_score(predictions["Target"],predictions["Predictions"])

0.5117326380658924