<a href="https://colab.research.google.com/github/SaraAsgharQ/Stock-Price-Prediction/blob/main/stock_Price_Movement_Prediction_Base_Paper_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install yfinance pandas-ta scikit-learn tensorflow



In [None]:
import yfinance as yf
from datetime import datetime

# --- Setup for the data fetching (Assume this was in the first cell) ---
TICKER = "^KSE"
START_DATE = datetime(2010, 1, 1)
END_DATE = datetime(2023, 10, 1)

print(f"Fetching data for {TICKER}...")

data = yf.download(TICKER, start=START_DATE, end=END_DATE)

if data.empty:
    print("FATAL ERROR: Data could not be fetched with any ticker. Please verify the ticker is active on Yahoo Finance.")
else:
    print(f"Data fetched successfully. Total rows: {len(data)}")
    print("Data Columns after fetch:", data.columns.tolist())

Fetching data for ^KSE...


  data = yf.download(TICKER, start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed

Data fetched successfully. Total rows: 2809
Data Columns after fetch: [('Close', '^KSE'), ('High', '^KSE'), ('Low', '^KSE'), ('Open', '^KSE'), ('Volume', '^KSE')]





In [None]:
pip install TA-Lib

Collecting TA-Lib
  Downloading ta_lib-0.6.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (23 kB)
Downloading ta_lib-0.6.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (4.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: TA-Lib
Successfully installed TA-Lib-0.6.8


In [None]:
import pandas as pd
import talib as ta
import numpy as np

data.columns = ['Close', 'High', 'Low', 'Open', 'Volume']
data = data.sort_index()


In [None]:
# ----------------- TECHNICAL INDICATOR CALCULATIONS -----------------


data.index = pd.to_datetime(data.index)


# --- 1. Moving Averages and Related Indicators (9 Features) ---

# Auxiliary Moving Averages (Needed for other indicators)
data['MA_5'] = ta.SMA(data['Close'], timeperiod=5)
data['MA_10'] = ta.SMA(data['Close'], timeperiod=10)
data['MA_14'] = ta.SMA(data['Close'], timeperiod=14)

# 1. Exponential Moving Average (EMA) - Feature 1
data['EMA_12'] = ta.EMA(data['Close'], timeperiod=12)

# 2. Weighted Moving Average (WMA) - Feature 2
data['WMA_14'] = ta.WMA(data['Close'], timeperiod=14)

# 3. Disparity 5 (Close/MA5 * 100) - Feature 3 [cite: 94]
data['Disparity 5'] = (data['Close'] / data['MA_5']) * 100

# 4. Disparity 14 (Close/MA14 * 100) - Feature 4 [cite: 94]
data['Disparity 14'] = (data['Close'] / data['MA_14']) * 100

# 5. Price Oscillator (OSCP) (MA5 - MA10) - Feature 5 [cite: 94]
data['OSCP'] = data['MA_5'] - data['MA_10']

# 6. Upper Band (Bollinger Band) - Feature 6
# 7. Lower Band (Bollinger Band) - Feature 7
data['Upper_BB'], data['Middle_BB'], data['Lower_BB'] = ta.BBANDS(
    data['Close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0
)

# 8. MACD - Feature 8
# 9. Signal Line - Feature 9
data['MACD'], data['MACD_Signal'], _ = ta.MACD(
    data['Close'], fastperiod=12, slowperiod=26, signalperiod=9
)

# --- 2. Momentum and Volatility Indicators (10 Features) ---

# 10. Relative Strength Index (RSI) - Feature 10
data['RSI'] = ta.RSI(data['Close'], timeperiod=14)

# 11. Rate of Change (ROC) - Feature 11
data['ROC'] = ta.ROC(data['Close'], timeperiod=10)

# 12. Momentum (close - close_4) - Feature 12 [cite: 94]
data['Momentum'] = data['Close'].diff(4)

# 13. Williams %R (%R) - Feature 13
data['%R'] = ta.WILLR(data['High'], data['Low'], data['Close'], timeperiod=14)

# 14. Commodity Channel Index (CCI) - Feature 14
data['CCI'] = ta.CCI(data['High'], data['Low'], data['Close'], timeperiod=14)

# 15. Average True Range (ATR) - Feature 15
data['ATR'] = ta.ATR(data['High'], data['Low'], data['Close'], timeperiod=14)

# 16. Stochastic %K (%K) - Feature 16
data['%K'], data['%D'] = ta.STOCH(
    data['High'], data['Low'], data['Close'],
    fastk_period=5, slowk_period=3, slowd_period=3
)
# 17. Stochastic %D (%D) - Feature 17
# %D is calculated in the previous step.

# --- 3. Volume Indicators (3 Features) ---

# 18. On-Balance Volume (OBV) - Feature 18
data['OBV'] = ta.OBV(data['Close'], data['Volume'])

# 19. Chaikin Oscillator (ADOSC) - Feature 19
# TA-Lib ADOSC implements Chaikin Oscillator (3/10 periods are common defaults)
data['Chaikin_Oscillator'] = ta.ADOSC(data['High'], data['Low'], data['Close'], data['Volume'], fastperiod=3, slowperiod=10)

# 20. Money Flow Index (MFI) - Feature 20
data['MFI'] = ta.MFI(data['High'], data['Low'], data['Close'], data['Volume'], timeperiod=14)


# --- 4. Pivot Points (5 Features) ---

# 21. Pivot Point (PP) - Feature 21 [cite: 94]
data['PP'] = (data['High'] + data['Low'] + data['Close']) / 3

# 22. First Support (S1) - Feature 22 [cite: 94]
data['S1'] = (data['PP'] * 2) - data['High']

# 23. Second Support (S2) - Feature 23 [cite: 94]
data['S2'] = data['PP'] - (data['High'] - data['Low'])

# 24. First Resistance (R1) - Feature 24 [cite: 94]
# Note: The paper's formula for R1 uses 'lou' (low), which is a common typo for 'low'.
data['R1'] = (data['PP'] * 2) - data['Low']

# 25. Second Resistance (R2) - Feature 25 [cite: 94]
data['R2'] = data['PP'] + (data['High'] - data['Low'])


# --- 5. Anomaly Features (2 Features) ---

# 26. Day of Week Anomaly - Feature 26 [cite: 97]
# Formula: close / (close.groupby(Day of W).trans(mean))
data['Day of Week Anomaly'] = data['Close'] / data['Close'].groupby(data.index.dayofweek).transform('mean')

# 27. Week of Month Anomaly - Feature 27 [cite: 97]
# Formula: close / (close.groupby(W of M).trans(mean))
# Week of Month (1-5) calculation:
data['Week of Month'] = (data.index.day - 1) // 7 + 1
data['Week of Month Anomaly'] = data['Close'] / data['Close'].groupby(data['Week of Month']).transform('mean')

# --- Final Cleanup ---

# Drop the temporary/auxiliary columns (MA_5, MA_10, MA_14, Middle_BB, MACD_Signal, Week of Month)
data = data.drop(columns=['MA_5', 'MA_10', 'MA_14', 'Middle_BB', 'MACD_Signal', 'Week of Month'])

# Drop all rows where any indicator value is NaN
data = data.dropna()

# Check the final count of features and rows
print(f"Total columns (5 OHLCV + 27 Features): {data.shape[1]}")
print(f"DataFrame Shape after adding 27 features and dropping NaNs: {data.shape}")
print("\nFinal list of technical feature columns:")
# Print only the feature columns (skipping the first 5 base columns)
print(data.columns[5:].tolist())
print("\nFirst 5 rows of the feature-rich data:")
print(data.head())

Total columns (5 OHLCV + 27 Features): 31
DataFrame Shape after adding 27 features and dropping NaNs: (2776, 31)

Final list of technical feature columns:
['EMA_12', 'WMA_14', 'Disparity 5', 'Disparity 14', 'OSCP', 'Upper_BB', 'Lower_BB', 'MACD', 'RSI', 'ROC', 'Momentum', '%R', 'CCI', 'ATR', '%K', '%D', 'OBV', 'Chaikin_Oscillator', 'MFI', 'PP', 'S1', 'S2', 'R1', 'R2', 'Day of Week Anomaly', 'Week of Month Anomaly']

First 5 rows of the feature-rich data:
                  Close         High          Low         Open  Volume  \
Date                                                                     
2010-02-23  9823.570312  9973.589844  9806.370117  9953.200195       0   
2010-02-24  9686.179688  9850.580078  9673.179688  9850.580078       0   
2010-02-25  9667.169922  9741.230469  9575.129883  9701.219727       0   
2010-02-26  9657.790039  9716.969727  9653.769531  9691.839844       0   
2010-03-01  9498.559570  9684.519531  9471.799805  9658.450195       0   

                 EMA_1

In [None]:
# Calculating the daily percentage change (future price relative to current price)
# Using .shift(-1) to look forward: the price on T+1 is placed on the row for T.
data['Future Close Price'] = data['Close'].shift(-1)

# Creating the target variable: 'Price Movement'
# 1 if the future close price is greater than the current close price.
# 0 otherwise.
data['Price Movement'] = np.where(data['Future Close Price'] > data['Close'], 1, 0)

# Cleaning up the DataFrame
# Drop the 'Future Close Price' column as it is only an intermediate step
# and should not be used as a feature in the model.
data = data.drop(columns=['Future Close Price'])

# Drop the last row, which will have a NaN value in 'Price Movement'
# because there is no T+1 close price to calculate the movement.
data = data.dropna()

print("Target variable 'Price Movement' created.")
print(f"Final DataFrame Shape: {data.shape}")

# Check the distribution of the target variable
movement_counts = data['Price Movement'].value_counts()
print("\nTarget Variable Distribution:")
print(movement_counts)
print(f"Proportion of 'Up' days (1): {movement_counts[1] / len(data):.2f}")

print("\nFirst 5 rows with the new target variable:")
print(data.tail())

Target variable 'Price Movement' created.
Final DataFrame Shape: (2776, 32)

Target Variable Distribution:
Price Movement
1    1487
0    1289
Name: count, dtype: int64
Proportion of 'Up' days (1): 0.54

First 5 rows with the new target variable:
                   Close          High           Low          Open  Volume  \
Date                                                                         
2021-08-31  47419.738281  47596.039062  47349.488281  47365.699219       0   
2021-09-01  47413.460938  47628.589844  47355.140625  47419.738281       0   
2021-09-02  46903.058594  47489.398438  46874.640625  47413.460938       0   
2021-09-03  46957.468750  47113.421875  46879.281250  46903.058594       0   
2021-09-30  44899.601562  44899.601562  43972.089844  44366.738281       0   

                  EMA_12        WMA_14  Disparity 5  Disparity 14        OSCP  \
Date                                                                            
2021-08-31  47435.595943  47444.524777   100.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Define Features (X) and Target (y)
# Features (X) are all columns EXCEPT the 'Price Movement' target variable.
# We must exclude the original OHLCV columns and only keep the technical features.
# The original OHLCV columns (Close, High, Low, Open, Volume) are not typically used as direct features.
# Based on the feature list provided in your output, the features start from 'SMA_50'.
feature_cols = data.columns[5:-1] # Selects from 'SMA_50' up to (but not including) 'Price Movement'

X = data[feature_cols]
y = data['Price Movement']

# 2. Time-Based Train-Test Split (80% Train, 20% Test)
# For time series, we slice the data based on index position, not random selection.
split_ratio = 0.8
split_index = int(np.floor(split_ratio * len(data)))

X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# 3. Feature Scaling
# Scaling is crucial for ANN and SVM and is applied AFTER the split to prevent data leakage.
scaler = StandardScaler()

# Fit the scaler ONLY on the training data
X_train_scaled = scaler.fit_transform(X_train)

# Apply the transformation to both training and testing data
X_test_scaled = scaler.transform(X_test)

# Convert the scaled arrays back to DataFrames for easier handling (optional but helpful)
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X.columns)

# 4. Final Verification
print("--- Data Split and Scaling Complete ---")
print(f"Total rows in data: {len(data)}")
print(f"X_train (Train Features) shape: {X_train_scaled.shape}")
print(f"y_train (Train Target) shape: {y_train.shape}")
print(f"X_test (Test Features) shape: {X_test_scaled.shape}")
print(f"y_test (Test Target) shape: {y_test.shape}")
print("\nFirst 5 rows of scaled training features:")
print(X_train_scaled.head())

--- Data Split and Scaling Complete ---
Total rows in data: 2776
X_train (Train Features) shape: (2220, 26)
y_train (Train Target) shape: (2220,)
X_test (Test Features) shape: (556, 26)
y_test (Test Target) shape: (556,)

First 5 rows of scaled training features:
              EMA_12    WMA_14  Disparity 5  Disparity 14      OSCP  Upper_BB  \
Date                                                                            
2010-02-23 -1.419438 -1.419482    -0.672445     -0.050257  0.074972 -1.426499   
2010-02-24 -1.421097 -1.420623    -1.579203     -0.710247 -0.008707 -1.426499   
2010-02-25 -1.422730 -1.422031    -1.357657     -0.810203 -0.129670 -1.426500   
2010-02-26 -1.424225 -1.423566    -1.006699     -0.817069 -0.239940 -1.426713   
2010-03-01 -1.427409 -1.426680    -1.637580     -1.448541 -0.432456 -1.425921   

            Lower_BB      MACD       RSI       ROC  ...       OBV  \
Date                                                ...             
2010-02-23 -1.410503 -0.158633

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

# Convert scaled DataFrames to numpy arrays for Keras/ANN
X_train_array = X_train_scaled.values
X_test_array = X_test_scaled.values

# Define the number of features
n_features = X_train_array.shape[1]

# --------------------------------------------
# ---  Artificial Neural Network (ANN) ---
# --------------------------------------------

print("--- Training Artificial Neural Network (ANN) ---")

ann_model = Sequential([
    # Input layer with 128 neurons and ReLU activation
    Dense(128, activation='relu', input_shape=(n_features,)),
    # Hidden layer with 64 neurons
    Dense(64, activation='relu'),
    # Output layer with 1 neuron (for binary classification) and Sigmoid activation
    Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

ann_model.fit(X_train_array, y_train, epochs=50, batch_size=32, verbose=0)

# Predict and evaluate
ann_pred_proba = ann_model.predict(X_test_array, verbose=0)
ann_pred = (ann_pred_proba > 0.5).astype(int)
ann_accuracy = accuracy_score(y_test, ann_pred)
print(f"ANN Test Accuracy: {ann_accuracy:.4f}")
print("ANN Classification Report:\n", classification_report(y_test, ann_pred, zero_division=0))


--- Training Artificial Neural Network (ANN) ---
ANN Test Accuracy: 0.5216
ANN Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.15      0.23       263
           1       0.53      0.86      0.65       293

    accuracy                           0.52       556
   macro avg       0.50      0.50      0.44       556
weighted avg       0.51      0.52      0.45       556



In [None]:
# --------------------------------------------
# --- Support Vector Machine (SVM) ---
# --------------------------------------------
print("--- Training Support Vector Machine (SVM) ---")

svm_model = SVC(kernel='rbf', random_state=42)

svm_model.fit(X_train_scaled, y_train)

svm_pred = svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Test Accuracy: {svm_accuracy:.4f}")
print("SVM Classification Report:\n", classification_report(y_test, svm_pred, zero_division=0))


--- Training Support Vector Machine (SVM) ---
SVM Test Accuracy: 0.5378
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.40      0.45       263
           1       0.55      0.66      0.60       293

    accuracy                           0.54       556
   macro avg       0.53      0.53      0.53       556
weighted avg       0.53      0.54      0.53       556



In [None]:
# --------------------------------------------
# ---  Random Forest (RF) ---
# --------------------------------------------
print("--- Training Random Forest (RF) ---")

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train_scaled, y_train)

# Predict and evaluate
rf_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"RF Test Accuracy: {rf_accuracy:.4f}")
print("RF Classification Report:\n", classification_report(y_test, rf_pred, zero_division=0))

--- Training Random Forest (RF) ---
RF Test Accuracy: 0.5342
RF Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.61      0.55       263
           1       0.57      0.47      0.51       293

    accuracy                           0.53       556
   macro avg       0.54      0.54      0.53       556
weighted avg       0.54      0.53      0.53       556



In [None]:
# --------------------------------------------
# ---  Long Short-Term Memory (LSTM) ---
# --------------------------------------------

from tensorflow.keras.layers import LSTM

warnings.filterwarnings('ignore', category=UserWarning)


# --- 1. Reshape Data for LSTM ---
# LSTM expects input data in 3D format: [samples, time_steps, features].
# For predicting the next day (time_steps=1), we simply add an axis.
# X_train_scaled.shape is (810, 31). Reshaping to (810, 1, 31).

X_train_lstm = X_train_scaled.values.reshape(
    X_train_scaled.shape[0], 1, X_train_scaled.shape[1]
)
X_test_lstm = X_test_scaled.values.reshape(
    X_test_scaled.shape[0], 1, X_test_scaled.shape[1]
)

print(f"X_train reshaped for LSTM: {X_train_lstm.shape}")
print(f"X_test reshaped for LSTM: {X_test_lstm.shape}")

# Define the number of features
n_features = X_train_lstm.shape[2]

print("\n--- Training Long Short-Term Memory (LSTM) ---")

# Simple LSTM Architecture
lstm_model = Sequential([
    # LSTM layer: 50 units, input shape (time_steps, features)
    LSTM(50, input_shape=(X_train_lstm.shape[1], n_features), activation='relu'),
    # Dense hidden layer
    Dense(25, activation='relu'),
    # Output layer: 1 neuron for binary classification (0 or 1)
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_model.fit(X_train_lstm, y_train, epochs=50, batch_size=32, verbose=0)

lstm_pred_proba = lstm_model.predict(X_test_lstm, verbose=0)
lstm_pred = (lstm_pred_proba > 0.5).astype(int)
lstm_accuracy = accuracy_score(y_test, lstm_pred)

print(f"LSTM Test Accuracy: {lstm_accuracy:.4f}")
print("LSTM Classification Report:\n", classification_report(y_test, lstm_pred, zero_division=0))

X_train reshaped for LSTM: (2220, 1, 26)
X_test reshaped for LSTM: (556, 1, 26)

--- Training Long Short-Term Memory (LSTM) ---
LSTM Test Accuracy: 0.5270
LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.17      0.25       263
           1       0.53      0.85      0.65       293

    accuracy                           0.53       556
   macro avg       0.52      0.51      0.45       556
weighted avg       0.52      0.53      0.46       556

