<a href="https://colab.research.google.com/github/ToluwaniOyebode/Projects/blob/main/STOCK_MARKET_PREDICTOR(2014_2017).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
from google.colab import files
uploaded = files.upload()  # upload the CSV file directly


Saving S&P 500 Stock Prices 2014-2017.csv to S&P 500 Stock Prices 2014-2017 (2).csv


In [23]:
import pandas as pd
import numpy as np

file_name = next(iter(uploaded.keys()))
df = pd.read_csv(file_name)

print("Shape:", df.shape)
print("\nColumns:")
print(df.columns)
df.head()


Shape: (497472, 7)

Columns:
Index(['symbol', 'date', 'open', 'high', 'low', 'close', 'volume'], dtype='object')


Unnamed: 0,symbol,date,open,high,low,close,volume
0,AAL,2014-01-02,25.07,25.82,25.06,25.36,8998943
1,AAPL,2014-01-02,79.3828,79.5756,78.8601,79.0185,58791957
2,AAP,2014-01-02,110.36,111.88,109.29,109.74,542711
3,ABBV,2014-01-02,52.12,52.33,51.52,51.98,4569061
4,ABC,2014-01-02,70.11,70.23,69.48,69.89,1148391


In [24]:
df.columns = df.columns.str.lower()

required_cols = ["date","symbol","open","high","low","close","volume"]

missing = [c for c in required_cols if c not in df.columns]
if missing:
    print("Missing columns:", missing)
else:
    print("All required columns present.")


All required columns present.


In [25]:
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["symbol","date"]).reset_index(drop=True)

g = df.groupby("symbol", sort=False)

# Log returns
df["log_ret"] = np.log(df["close"]).groupby(df["symbol"]).diff()

# Target: next-day direction
df["target"] = (g["log_ret"].shift(-1) > 0).astype("Int64")

# Lag features
for k in [1,2,3,5,10,20]:
    df[f"lag{k}"] = g["log_ret"].shift(k)

# Rolling statistics
for w in [5,10,20,60]:
    df[f"mean{w}"] = g["log_ret"].rolling(w).mean().reset_index(level=0, drop=True)
    df[f"vol{w}"]  = g["log_ret"].rolling(w).std().reset_index(level=0, drop=True)

# Momentum
df["mom5"]  = g["log_ret"].rolling(5).sum().reset_index(level=0, drop=True)
df["mom20"] = g["log_ret"].rolling(20).sum().reset_index(level=0, drop=True)

# Intraday structure
df["range"] = (df["high"] - df["low"]) / df["open"]
df["cl_op"] = (df["close"] - df["open"]) / df["open"]

df = df.replace([np.inf, -np.inf], np.nan)

feature_cols = (
    [f"lag{k}" for k in [1,2,3,5,10,20]] +
    [f"mean{w}" for w in [5,10,20,60]] +
    [f"vol{w}" for w in [5,10,20,60]] +
    ["mom5","mom20","range","cl_op"]
)

df_model = df.dropna(subset=feature_cols + ["target"]).copy()
df_model["target"] = df_model["target"].astype(int)

print("Final rows:", df_model.shape[0])
print("Up-rate:", df_model["target"].mean())


Final rows: 467207
Up-rate: 0.5183441172756401


In [26]:
train = df_model[df_model["date"] < "2017-01-01"]
test  = df_model[df_model["date"] >= "2017-01-01"]

X_train = train[feature_cols]
y_train = train["target"]

X_test  = test[feature_cols]
y_test  = test["target"]

print("Train size:", len(train))
print("Test size:", len(test))


Train size: 341533
Test size: 125674


In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_model = Pipeline([
    ("scale", StandardScaler()),
    ("model", LogisticRegression(max_iter=500))
])

log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

log_acc = accuracy_score(y_test, log_pred)
print("Logistic Accuracy:", log_acc)


Logistic Accuracy: 0.5223116953387336


In [28]:
!pip -q install xgboost


In [29]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1,
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

xgb_acc = accuracy_score(y_test, xgb_pred)
print("XGBoost Accuracy:", xgb_acc)


XGBoost Accuracy: 0.5124926396868087


In [30]:
!pip -q install tensorflow


In [None]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

seq_len = 30

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_model[feature_cols])

df_scaled = df_model.copy()
df_scaled[feature_cols] = X_scaled

def create_sequences(data, features, seq_len):
    X, y = [], []
    for i in range(seq_len, len(data)):
        X.append(data[features].iloc[i-seq_len:i].values)
        y.append(data["target"].iloc[i])
    return np.array(X), np.array(y)

train_s = df_scaled[df_scaled["date"] < "2017-01-01"]
test_s  = df_scaled[df_scaled["date"] >= "2017-01-01"]

X_train_seq, y_train_seq = create_sequences(train_s, feature_cols, seq_len)
X_test_seq, y_test_seq   = create_sequences(test_s, feature_cols, seq_len)

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, return_sequences=True, input_shape=(seq_len, len(feature_cols))),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.fit(X_train_seq, y_train_seq,
          epochs=5,
          batch_size=256,
          validation_split=0.1)

lstm_acc = model.evaluate(X_test_seq, y_test_seq, verbose=0)[1]
print("LSTM Accuracy:", lstm_acc)


In [None]:
print("\nFINAL COMPARISON")
print("Logistic:", log_acc)
print("XGBoost :", xgb_acc)
print("LSTM    :", lstm_acc)
