## Machine learing Prediction

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Paths
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "notebooks" else NOTEBOOK_DIR
DATA_DIR = ROOT / "Data"
DATA_PATH = DATA_DIR / "final_sentiment_dataset.csv"

# Load data
df = pd.read_csv(DATA_PATH, parse_dates=["Date"])
print(f"Loaded {len(df)} rows from {DATA_PATH}")
print(df.head())

Loaded 81 rows from d:\Financial News Sentiment Analysis\Data\final_sentiment_dataset.csv
        Date        Open        High         Low       Close   Adj Close  \
0 2020-06-09   83.035004   86.402496   83.002502   85.997498   83.889359   
1 2020-06-09  126.472000  131.321503  126.250000  130.042999  130.042999   
2 2011-05-23   38.970001   39.090000   38.700001   38.779999   38.779999   
3 2011-06-08   37.889999   37.889999   37.040001   37.389999   37.389999   
4 2011-07-01   39.889999   40.160000   39.459999   39.650002   39.650002   

      Volume                                          Headlines  Target  \
0  147712400  Why Apple's Stock Is Trading Higher Today Appl...       1   
1  103520000  'Inside Amazon's plan to test warehouse worker...       1   
2      13400      American Drivers Should Thank European Voters       0   
3      38900                                   The End of OPEC?       1   
4       9100  Is China's Slowdown Bullish for the Global Eco...       1   

  

In [2]:
# Feature selection and preprocessing
feature_cols = [
    "finbert_positive", "finbert_negative", "finbert_neutral",
    "vader_compound", "Open", "High", "Low", "Close", "Volume", "SMA_5"
]

# If Ticker missing, create a single ticker label to allow grouping
df = df.copy()
if "Ticker" not in df.columns:
    df["Ticker"] = "TCKR"
    print("Info: 'Ticker' missing; filled with single label 'TCKR'.")

# Ensure SMA_5 exists; compute if missing
if "SMA_5" not in df.columns:
    df = df.sort_values(["Ticker", "Date"]).reset_index(drop=True)
    df["SMA_5"] = df.groupby("Ticker")['Close'].transform(lambda s: s.rolling(5, min_periods=1).mean())

# Determine available columns
available_features = [c for c in feature_cols if c in df.columns]
required_cols = available_features + [c for c in ["Target", "Ticker", "Date"] if c in df.columns]
missing_required = [c for c in ["Target", "Ticker", "Date"] if c not in df.columns]
if missing_required:
    print("Warning: missing required columns, rows may be incomplete:", missing_required)

if "Target" not in df.columns:
    raise ValueError("Target column missing; rebuild dataset to include 'Target'.")

# Drop rows with missing available features/target
if required_cols:
    df = df.dropna(subset=required_cols).reset_index(drop=True)
else:
    raise ValueError("No required columns available to proceed.")

X = df[available_features]
y = df["Target"].astype(int)

print("Feature columns used:", available_features)
print("X shape:", X.shape, "y shape:", y.shape)
print("Class distribution:\n", y.value_counts())

Info: 'Ticker' missing; filled with single label 'TCKR'.
Feature columns used: ['finbert_positive', 'finbert_negative', 'finbert_neutral', 'vader_compound', 'Open', 'High', 'Low', 'Close', 'Volume', 'SMA_5']
X shape: (81, 10) y shape: (81,)
Class distribution:
 Target
1    50
0    31
Name: count, dtype: int64


In [3]:
# Random Forest classifier
class_counts = y.value_counts()
min_class = class_counts.min()
use_stratify = min_class >= 2
if not use_stratify:
    print("Note: Not enough samples per class for stratified split; using simple split without stratify.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2 if len(y) > 4 else 0.33,
    shuffle=True,
    stratify=y if use_stratify else None,
    random_state=42,
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=3))

Random Forest Accuracy: 0.47058823529411764

Classification report:
               precision    recall  f1-score   support

           0      0.000     0.000     0.000         7
           1      0.533     0.800     0.640        10

    accuracy                          0.471        17
   macro avg      0.267     0.400     0.320        17
weighted avg      0.314     0.471     0.376        17



In [4]:
# Save Random Forest model and feature names
from pathlib import Path
import joblib
import numpy as np

out_dir = ROOT / "ml" / "serving"
out_dir.mkdir(parents=True, exist_ok=True)
rf_path = out_dir / "model.joblib"
# ensure feature names stored (sklearn may expect feature_names_in_)
try:
    rf.feature_names_in_ = np.array(available_features)
except Exception:
    pass
joblib.dump(rf, rf_path)
print(f"Saved RandomForest model to {rf_path}")

Saved RandomForest model to d:\Financial News Sentiment Analysis\ml\serving\model.joblib


### LSTM sequence model

Sequence last 7 days of features to predict the next day (day 8).

In [5]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

SEQ_LEN = 7
lstm_features = available_features

if len(lstm_features) == 0:
    raise ValueError("No feature columns available for LSTM.")

# Sort by ticker/date
seq_df = df.sort_values(["Ticker", "Date"]).reset_index(drop=True)

# Fit scaler on training portion (per-ticker split)
scaler = StandardScaler()
train_mask = pd.Series(False, index=seq_df.index)
for _, g in seq_df.groupby("Ticker"):
    split_idx = int(len(g) * 0.8)
    train_mask.loc[g.index[:split_idx]] = True
scaler.fit(seq_df.loc[train_mask, lstm_features])

seq_df_scaled = seq_df.copy()
seq_df_scaled[lstm_features] = scaler.transform(seq_df_scaled[lstm_features])


def build_sequences_split(dataframe, feature_cols, target_col, seq_len=7, split_ratio=0.8):
    X_train, y_train, X_test, y_test = [], [], [], []
    for _, g in dataframe.groupby("Ticker"):
        g = g.sort_values("Date").reset_index(drop=True)
        if len(g) <= seq_len:
            continue  # not enough rows to form one sequence
        feats = g[feature_cols].values
        target = g[target_col].values
        split_idx = int(len(g) * split_ratio)
        for i in range(len(g) - seq_len):
            end_idx = i + seq_len
            X_seq = feats[i:end_idx]
            y_val = target[end_idx]
            if end_idx <= split_idx:
                X_train.append(X_seq)
                y_train.append(y_val)
            else:
                X_test.append(X_seq)
                y_test.append(y_val)
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

X_train_seq, y_train_seq, X_test_seq, y_test_seq = build_sequences_split(
    seq_df_scaled, lstm_features, "Target", seq_len=SEQ_LEN, split_ratio=0.8
)

print("Sequence shapes:")
print("X_train_seq:", X_train_seq.shape, "y_train_seq:", y_train_seq.shape)
print("X_test_seq:", X_test_seq.shape, "y_test_seq:", y_test_seq.shape)

if len(X_train_seq) == 0 or len(X_test_seq) == 0:
    raise ValueError("Not enough sequence data for LSTM; need more rows per ticker.")

# Build LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(SEQ_LEN, len(lstm_features))),
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss="binary_crossentropy",
              metrics=["accuracy"])

history = model.fit(
    X_train_seq, y_train_seq,
    validation_split=0.2,
    epochs=10,
    batch_size=16,
    verbose=1
)

# Evaluate
y_pred_prob = model.predict(X_test_seq)
y_pred_label = (y_pred_prob.flatten() >= 0.5).astype(int)

lstm_acc = accuracy_score(y_test_seq, y_pred_label)
print(f"\nLSTM Accuracy: {lstm_acc:.3f}")
print("\nClassification report:\n", classification_report(y_test_seq, y_pred_label, digits=3))

Sequence shapes:
X_train_seq: (58, 7, 10) y_train_seq: (58,)
X_test_seq: (16, 7, 10) y_test_seq: (16,)
Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 226ms/step - accuracy: 0.4565 - loss: 0.6999 - val_accuracy: 0.5833 - val_loss: 0.6922
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5217 - loss: 0.6932 - val_accuracy: 0.5833 - val_loss: 0.6917
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.5217 - loss: 0.6927 - val_accuracy: 0.5833 - val_loss: 0.6918
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.6304 - loss: 0.6829 - val_accuracy: 0.5833 - val_loss: 0.6927
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.6087 - loss: 0.6738 - val_accuracy: 0.5833 - val_loss: 0.6934
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.5652

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
# Save LSTM model and preprocessing artifacts
from pathlib import Path
import joblib

out_dir = ROOT / "ml" / "serving"
out_dir.mkdir(parents=True, exist_ok=True)
lstm_path = out_dir / "lstm_model.keras"
# Save Keras model in Keras native format
model.save(lstm_path, save_format="keras")

prep_path = out_dir / "lstm_prep.joblib"
try:
    joblib.dump({"scaler": scaler, "features": lstm_features}, prep_path)
    print(f"Saved LSTM model to {lstm_path} and preprocessing to {prep_path}")
except Exception as e:
    print(f"Saved LSTM model to {lstm_path} but failed to save preprocessing: {e}")



Saved LSTM model to d:\Financial News Sentiment Analysis\ml\serving\lstm_model.keras and preprocessing to d:\Financial News Sentiment Analysis\ml\serving\lstm_prep.joblib
