In [1]:
# In: notebooks/03_model_training.ipynb

import pandas as pd

# Load the dataset you created in the previous notebook
try:
    df = pd.read_csv('../training_dataset.csv')
    print(f"Successfully loaded dataset with {len(df)} rows.")
except FileNotFoundError:
    print("Error: training_dataset.csv not found. Please run the data preparation notebook first.")

df.head()

Successfully loaded dataset with 1250 rows.


Unnamed: 0,date,Close,High,Low,Open,Volume,ticker,trend_indicator,sentiment,positive_events,negative_events,GDP,CPI,FEDFUNDS,UNRATE,BAMLH0A0HYM2
0,2024-08-21,225.351227,226.923909,224.00749,225.470681,34765500,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
1,2024-08-22,223.489868,227.282216,222.862782,226.734761,43695300,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
2,2024-08-23,225.789169,227.162781,223.290802,224.614643,38677300,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
3,2024-08-26,226.127594,226.227137,222.852841,225.709541,30602200,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
4,2024-08-27,226.973663,227.789872,223.84821,224.953068,35934600,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import numpy as np

if 'df' in locals() and not df.empty:
    # --- 1. Create the Target Variable (y) ---
    # We will predict the next day's closing price change (return)
    df['date'] = pd.to_datetime(df['date'])
    df.sort_values(by=['ticker', 'date'], inplace=True)
    df['target'] = df.groupby('ticker')['Close'].pct_change().shift(-1)

    # --- 2. Create the Feature Matrix (X) ---
    # Drop non-feature columns
    features_to_drop = ['date', 'target', 'ticker']
    X = df.drop(columns=features_to_drop)
    y = df['target']

    # --- 3. Final Cleanup ---
    # Align X and y by dropping rows with NaN in the target
    X = X[y.notna()]
    y = y[y.notna()]
    
    # Fill any remaining NaN in features with the mean
    X.fillna(X.mean(), inplace=True)

    if len(X) > 0:
        # 4. Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # 5. Train the model
        print("\nTraining the Random Forest model...")
        model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_train, y_train)
        print("Model training complete.")

        # 6. Evaluate the model
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        print(f"\nModel Mean Squared Error on Test Set: {mse}")

        # 7. Save the trained model
        model_filename = '../credit_model.pkl'
        joblib.dump(model, model_filename)
        print(f"\n✅ Model saved to '{model_filename}'")
    else:
        print("\nNot enough data to train a model after processing.")
else:
    print("\nDataFrame not loaded. Please run the first cell.")


Training the Random Forest model...
Model training complete.

Model Mean Squared Error on Test Set: 0.00081236255857211

✅ Model saved to '../credit_model.pkl'


In [3]:
# In a new cell in: 03_model_training.ipynb

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Ensure you have your data loaded and split (X_train, X_test, y_train, y_test)

# 1. Initialize and train the LightGBM model
print("\nTraining LightGBM model...")
lgbm_model = LGBMRegressor(random_state=42)
lgbm_model.fit(X_train, y_train)
print("LightGBM model training complete.")

# 2. Evaluate the new model
lgbm_predictions = lgbm_model.predict(X_test)
lgbm_mse = mean_squared_error(y_test, lgbm_predictions)
print(f"\nLightGBM Model MSE on Test Set: {lgbm_mse}")

# 3. Compare with your previous Random Forest MSE
# (This value was printed when you ran the previous cell)
print(f"Compare to Random Forest MSE: {mse}") 

# 4. (Optional) If LightGBM is better, save it by overwriting the old model file
if lgbm_mse < mse:
    print("\nLightGBM performed better. Saving as 'credit_model.pkl'.")
    joblib.dump(lgbm_model, '../credit_model.pkl')
else:
    print("\nRandom Forest performed better or similarly. Keeping the original model.")


Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 996, number of used features: 6
[LightGBM] [Info] Start training from score 0.001592
LightGBM model training complete.

LightGBM Model MSE on Test Set: 0.0008494035680652171
Compare to Random Forest MSE: 0.00081236255857211

Random Forest performed better or similarly. Keeping the original model.
