<a href="https://colab.research.google.com/github/Sruthij93/Stock-Price-Prediction-Sentiment-Analysis/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install supabase
!pip install pandas


In [None]:
import os
from supabase import create_client
import pandas as pd
from google.colab import userdata
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

url = userdata.get("SUPABASE_URL")
key = userdata.get("SUPABASE_KEY")
alphavantage_key = userdata.get("ALPHA_VANTAGE_API_KEY")
finnhub_key = userdata.get("FINNHUB_API_KEY")
supabase = create_client(url, key)

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# fetch data from supabase from stock_prices table and for 'AAPL' ticker
table = supabase.table("stock_prices")
response = table.select("*").eq("ticker", "AAPL").execute()


In [None]:
# convert response to dict
data = response.data
# type(data)
# convert data to df
df = pd.DataFrame(data)
df.tail(10)

In [None]:
df.columns

In [None]:
df.drop(columns = ['sentiment_score'], inplace=True)

In [None]:
# create a day and month column
df['date'] = pd.to_datetime(df['date'])
# The Supabase query above has no ORDER BY, so the row order is not guaranteed.
# The shifted targets and sliding windows built later assume chronological rows.
df = df.sort_values('date').reset_index(drop=True)
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
# add a flag to check if there is a sentiment score or not based on num_articles
df['has_sentiment'] = (df['num_articles'] > 0).astype(int)

In [None]:
df.tail(5)

In [None]:
df.fillna(0, inplace=True)

In [None]:
df.info()

## Train Two Models:
1. Using only price data
2. Using price + sentiment features

In [None]:
# features for both models
price_features = ['open', 'high', 'low', 'close', 'volume', 'day', 'month']
sentiment_features = ['avg_sentiment', 'sentiment_ma3', 'sentiment_lag1', 'log_article_count', 'sentiment_volatility', 'sentiment_close_corr', 'has_sentiment']

In [None]:
df[price_features].values.shape

In [None]:
# create datasets
X_price = df[price_features].values
X_sentiment = df[price_features + sentiment_features].values

In [None]:
print(X_price.shape)
print(X_sentiment.shape)

In [None]:
# creating targets for next 5 days
# create_sequences() pairs the window data[i:i+lookback] with targets[i+lookback],
# i.e. with the row immediately AFTER the window. Row j of y_multi must therefore hold
# close[j], close[j+1], ..., close[j+4] so that the label is exactly the five closes
# following the window. (Shifting by 1..5 here would leave a one-day gap and train
# the model on days 2-6 instead of days 1-5.)
HORIZON = 5
close_prices = df['close']
y_multi = np.column_stack([close_prices.shift(-k).values for k in range(HORIZON)])

# Column k looks k rows ahead, so its last k entries fall off the end of the data (NaN).
# Pad them with the last known close price so the array stays rectangular.
# NOTE(review): these padded rows are synthetic targets, not real future prices.
fill_value = close_prices.iloc[-1]
for k in range(1, HORIZON):
    y_multi[-k:, k] = fill_value

In [None]:
y_multi[-5:]

In [None]:
# fn to create sequences for LSTM
def create_sequences(data, targets, lookback=30):
    """Slice `data` into overlapping windows and pair each with a target row.

    Window number `start` covers data[start:start + lookback] and is paired with
    targets[start + lookback], the entry right after the window.

    Returns:
        (X, y) as numpy arrays with len(data) - lookback entries each
        (both empty when `data` has no more than `lookback` rows).
    """
    n_windows = len(data) - lookback
    windows = [data[start:start + lookback] for start in range(n_windows)]
    labels = [targets[start + lookback] for start in range(n_windows)]
    return np.array(windows), np.array(labels)

In [None]:
# scale the data
scaler_price = MinMaxScaler()
scaler_sentiment = MinMaxScaler()
scaler_sentiment_only = MinMaxScaler()
scaler_y = MinMaxScaler()


In [None]:
# fit the scalers
# (The single-day target scaler is fitted in the "Single Day Prediction" section, where
# y_single and scaler_y_single are defined; referencing them here raised a NameError
# on a fresh Restart & Run All.)
# NOTE(review): the scalers are fitted on the full dataset, including the rows that later
# become the validation/test windows, so the reported test metrics see leaked min/max.
X_price_scaled = scaler_price.fit_transform(X_price)
X_with_sentiment_scaled = scaler_sentiment.fit_transform(X_sentiment)
y_scaled = scaler_y.fit_transform(y_multi)


In [None]:
# pickle dump the scalers
import pickle
with open('scaler_price.pkl', 'wb') as f:
    pickle.dump(scaler_price, f)
with open('scaler_sentiment.pkl', 'wb') as f:
    pickle.dump(scaler_sentiment, f)
with open('scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)

In [None]:
# create sequences of 30 days
X_price_seq, y_price_seq = create_sequences(X_price_scaled, y_scaled, lookback=30)
X_sentiment_seq, y_sentiment_seq = create_sequences(X_with_sentiment_scaled, y_scaled, lookback=30)


In [None]:
X_price_seq.shape

In [None]:
# split data without shuffling (80% train, 10% validation, 10% test)
total_size = len(X_price_seq)
train_end = int(total_size * 0.8)
val_end = int(total_size * 0.9)

X_price_train = X_price_seq[:train_end]
X_price_val = X_price_seq[train_end:val_end]
X_price_test = X_price_seq[val_end:]
y_price_train = y_price_seq[:train_end]
y_price_val = y_price_seq[train_end:val_end]
y_price_test = y_price_seq[val_end:]

X_sentiment_train = X_sentiment_seq[:train_end]
X_sentiment_val = X_sentiment_seq[train_end:val_end]
X_sentiment_test = X_sentiment_seq[val_end:]
y_sentiment_train = y_sentiment_seq[:train_end]
y_sentiment_val = y_sentiment_seq[train_end:val_end]
y_sentiment_test = y_sentiment_seq[val_end:]



In [None]:
X_price_train.shape

In [None]:
y_price_test.shape

In [None]:
# Model 1: Price only model
model_price = Sequential([
    LSTM(128, return_sequences = True, input_shape = (30, X_price_train.shape[2])),
    Dropout(0.4),
    LSTM(64),
    Dropout(0.4),
    Dense(16, activation = 'relu'),
    Dense(5)
])
model_price.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Model 2: With sentiment
model_with_sentiment = Sequential([
    LSTM(32, return_sequences = True, input_shape = (30, X_sentiment_train.shape[2])),
    Dropout(0.4),
    LSTM(32),
    Dropout(0.4),
    Dense(16, activation = 'relu'),
    Dense(5)
])
model_with_sentiment.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# early stopping to prevent overfitting and model uses best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

In [None]:
# Train Model 1 (price only)
# Validate (and early-stop) on the explicit, chronologically later validation slice created
# in the split cell. validation_split=0.2 would ignore X_price_val entirely and carve a
# second validation set out of the training windows instead.
print("Training price-only model: ")
history_price = model_price.fit(
    X_price_train, y_price_train,
    epochs=150,
    batch_size=32,
    validation_data=(X_price_val, y_price_val),
    callbacks=[early_stopping],
    verbose=1
)



In [None]:
# Train Model 2 (with sentiment)
# Validate (and early-stop) on the explicit validation slice from the split cell rather
# than on a validation_split carved out of the training windows, which left
# X_sentiment_val / y_sentiment_val unused.
print("Training model with sentiment: ")
history_sentiment = model_with_sentiment.fit(
    X_sentiment_train, y_sentiment_train,
    epochs=150,
    batch_size=32,
    validation_data=(X_sentiment_val, y_sentiment_val),
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# evaluate models
price_mae = model_price.evaluate(X_price_test, y_price_test)[1]
sentiment_mae = model_with_sentiment.evaluate(X_sentiment_test, y_sentiment_test)[1]

print(f"Price-only model MAE: {price_mae:.4f}")
print(f"Sentiment model MAE: {sentiment_mae:.4f}")

In [None]:
print(f"Best validation MAE for Price model: {min(history_price.history['val_mae']):.4f}")
print(f"Best validation MAE for Sentiment model: {min(history_sentiment.history['val_mae']):.4f}")


In [None]:
# function to make predictions for the next 5 days
def predict_next_5_days(model, recent, scaler_x, scaler_y):
    """Run `model` on one 30-row window and return its first prediction in original units.

    Args:
        model: object exposing predict(); called with an array of shape (1, 30, n_features).
        recent: 30 rows of unscaled features (reshaped to (1, 30, n_features) after scaling).
        scaler_x: fitted feature scaler, used via transform().
        scaler_y: fitted target scaler, used via inverse_transform().

    Returns:
        The first row of the inverse-transformed model output.
    """
    window = scaler_x.transform(recent)
    n_features = window.shape[1]

    # a batch of exactly one 30-step window
    batch = window.reshape(1, 30, n_features)
    print(batch.shape)

    forecast_scaled = model.predict(batch)

    # back from scaled space to the target's original units
    return scaler_y.inverse_transform(forecast_scaled)[0]

In [None]:
# last 30 days of data for prediction
latest_window_price = X_price[-30:]
latest_window_sentiment = X_sentiment[-30:]


In [None]:
latest_window_sentiment.shape

In [None]:
# predict with both models
price_prediction = predict_next_5_days(model_price, latest_window_price, scaler_price, scaler_y)
sentiment_prediction = predict_next_5_days(model_with_sentiment, latest_window_sentiment, scaler_sentiment, scaler_y)


In [None]:
print("5 day Prediction:")
print("Day |   Price Model    | Sentiment Model")
for i in range(5):
    print(f" {i+1}  |     ${price_prediction[i]:.2f}      |     ${sentiment_prediction[i]:.2f} ")

In [None]:
# pickle dump both models
# Writes model_price and model_with_sentiment to the current working directory.
# NOTE(review): Keras' documented persistence API is model.save(...) / load_model(...);
# whether a pickled Keras model loads across Keras/TensorFlow versions should be
# confirmed before relying on these files - TODO confirm.
import pickle
with open('model_price.pkl', 'wb') as f:
    pickle.dump(model_price, f)
with open('model_with_sentiment.pkl', 'wb') as f:
    pickle.dump(model_with_sentiment, f)

In [None]:
# plot training history
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history_price.history['loss'], label='Price Model Training Loss')
plt.plot(history_price.history['val_loss'], label='Price Model Validation Loss')
plt.plot(history_sentiment.history['loss'], label='Sentiment Model Training Loss')
plt.plot(history_sentiment.history['val_loss'], label='Sentiment Model Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_price.history['mae'], label='Price Model Training MAE')
plt.plot(history_price.history['val_mae'], label='Price Model Validation MAE')
plt.plot(history_sentiment.history['mae'], label='Sentiment Model Training MAE')
plt.plot(history_sentiment.history['val_mae'], label='Sentiment Model Validation MAE')
plt.title('Model MAE')
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.legend()
plt.tight_layout()
plt.show()



In [None]:
# Plot predictions vs actual values for the test set
plt.figure(figsize=(12, 6))
plt.title('Stock Price Predictions on Test Set (next 5 days)')
plt.xlabel('Time')
plt.ylabel('Price ($)')

# actual test set values
y_test_actual = scaler_y.inverse_transform(y_price_test)

# predictions on test set for price model
y_test_pred = model_price.predict(X_price_test)
y_test_pred = scaler_y.inverse_transform(y_test_pred)

# prediction for the sentiment model
y_test_pred_sentiment = model_with_sentiment.predict(X_sentiment_test)
y_test_pred_sentiment = scaler_y.inverse_transform(y_test_pred_sentiment)


# actual vs predicted plot (just 1st prediction from the 5 day prediction window)
plt.plot(y_test_actual[:, 0], label='Actual Prices', color='blue')
plt.plot(y_test_pred[:, 0], label='Price Model Predictions', color='red', linestyle='--')
plt.plot(y_test_pred_sentiment[:, 0], label='Sentiment Model Predictions', color='green', linestyle=':')


plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Create a scatter plot of actual vs predicted values (just 1st prediction from the 5 day prediction window)
plt.figure(figsize=(10, 6))
plt.scatter(y_test_actual[:, 0], y_test_pred[:, 0], alpha=0.5, label='Price Model')
plt.scatter(y_test_actual[:, 0], y_test_pred_sentiment[:, 0], alpha=0.5, label='Sentiment Model')

plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()], 'k--', lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices on Test Set')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Plot predictions for each of the 5 days
days = ['Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5']
fig, axes = plt.subplots(5, 1, figsize=(12, 15), sharex=True)

for i in range(5):
    axes[i].plot(y_test_actual[:, i], label='Actual', color='blue')
    axes[i].plot(y_test_pred[:, i], label='Price Model', color='red', linestyle='--')
    axes[i].plot(y_test_pred_sentiment[:, i], label='Sentiment Model', color='green', linestyle=':')
    axes[i].set_title(f'Predictions for {days[i]}')
    axes[i].set_ylabel('Price ($)')
    axes[i].legend()
    axes[i].grid(True)

plt.xlabel('Time')
plt.tight_layout()
plt.show()


# Single Day Prediction:
Two models (price-only and price + sentiment) are trained to predict the closing price for a single day.

In [None]:
np.random.seed(42)
tf.random.set_seed(42)
# Create single-day target
y_single = df['close'].values


# fn to create sequences for LSTM
def create_sequences_single(data, targets, lookback=30):
    """Build (window, target) pairs for the single-day models.

    For every index `end` from `lookback` up to len(data) - 1, the window is the
    `lookback` rows ending just before `end`, and the label is targets[end].

    Returns:
        (X, y) as numpy arrays; both are empty when len(data) <= lookback.
    """
    pairs = [
        (data[end - lookback:end], targets[end])
        for end in range(lookback, len(data))
    ]
    X_seq = [window for window, _ in pairs]
    y_seq = [label for _, label in pairs]
    return np.array(X_seq), np.array(y_seq)

# scale the data
scaler_price_single = MinMaxScaler()
scaler_sentiment_single = MinMaxScaler()
scaler_y_single = MinMaxScaler()

# fit the scalers
X_price_scaled_single = scaler_price_single.fit_transform(X_price)
X_sentiment_scaled_single = scaler_sentiment_single.fit_transform(X_sentiment)
y_scaled_single = scaler_y_single.fit_transform(y_single.reshape(-1, 1))

# create sequences
X_price_seqs, y_price_seqs = create_sequences_single(X_price_scaled_single, y_scaled_single, lookback=30)
X_sentiment_seqs, y_sentiment_seqs = create_sequences_single(X_sentiment_scaled_single, y_scaled_single, lookback=30)

# split the data without shuffling (80% train, 20% test)
train_size_prices = int(len(X_price_seqs) * 0.8)
X_price_trains = X_price_seqs[:train_size_prices]
X_price_tests = X_price_seqs[train_size_prices:]
y_price_trains = y_price_seqs[:train_size_prices]
y_price_tests = y_price_seqs[train_size_prices:]

train_size_sentiments = int(len(X_sentiment_seqs) * 0.8)
X_sentiment_trains = X_sentiment_seqs[:train_size_sentiments]
X_sentiment_tests = X_sentiment_seqs[train_size_sentiments:]
y_sentiment_trains = y_sentiment_seqs[:train_size_sentiments]
y_sentiment_tests = y_sentiment_seqs[train_size_sentiments:]


# Model 1: Price only model
model_price_1 = Sequential([
    LSTM(64, return_sequences=True, input_shape=(30, X_price_trains.shape[2])),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1)
])

model_price_1.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Model 2: Including sentiment
model_with_sentiment_1 = Sequential([
    LSTM(32, return_sequences=True, input_shape=(30, X_sentiment_trains.shape[2])),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1)
])

model_with_sentiment_1.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train Model 1 (price only)
print("Training price-only model...")
history_price_1 = model_price_1.fit(
    X_price_trains, y_price_trains,
    epochs=150,
    batch_size=32,
    validation_split=0.2,

    verbose=1
)

# Train Model 2 (with sentiment)
print("Training model with sentiment...")
history_sentiment_1 = model_with_sentiment_1.fit(
    X_sentiment_trains, y_sentiment_trains,
    epochs=150,
    batch_size=32,
    validation_split=0.2,

    verbose=1
)

# model evaluation
price_mae_1 = model_price_1.evaluate(X_price_tests, y_price_tests)[1]
sentiment_mae_1 = model_with_sentiment_1.evaluate(X_sentiment_tests, y_sentiment_tests)[1]

print(f"Price-only model MAE: {price_mae_1:.4f}")
print(f"Sentiment model MAE: {sentiment_mae_1:.4f}")


# function to make predictions for the next day
def predict_next_day(model, recent_data, scaler_x, scaler_y):
    """Run `model` on one 30-row window and return a single value in original units.

    Args:
        model: object exposing predict(); called with an array of shape (1, 30, n_features).
        recent_data: 30 rows of unscaled features.
        scaler_x: fitted feature scaler, used via transform().
        scaler_y: fitted target scaler, used via inverse_transform().

    Returns:
        Element [0][0] of the inverse-transformed model output.
    """
    window = scaler_x.transform(recent_data)
    print(window.shape)

    # a batch of exactly one 30-step window
    n_features = window.shape[1]
    batch = window.reshape(1, 30, n_features)

    forecast_scaled = model.predict(batch)

    # back from scaled space to the target's original units
    forecast = scaler_y.inverse_transform(forecast_scaled)
    return forecast[0][0]

# last 30 days of data for prediction
latest_window_price = X_price[-30:]
latest_window_sentiment = X_sentiment[-30:]

# predictions with both models
price_prediction_1 = predict_next_day(model_price_1, latest_window_price, scaler_price_single, scaler_y_single)
sentiment_prediction_1 = predict_next_day(model_with_sentiment_1, latest_window_sentiment, scaler_sentiment_single, scaler_y_single)

print("\nPredictions for the next day:")
print(f"Price-only model: ${price_prediction_1:.2f}")
print(f"Sentiment model: ${sentiment_prediction_1:.2f}")

In [None]:
# plot the predictions vs actual values for test set
plt.figure(figsize=(12, 6))
plt.title('Stock Price (single day) Predictions on Test Set')
plt.xlabel('Time')
plt.ylabel('Price ($)')

y_test_actuals = scaler_y_single.inverse_transform(y_price_tests)

# price only model
y_test_preds = model_price_1.predict(X_price_tests)
y_test_preds = scaler_y_single.inverse_transform(y_test_preds)

# sentiment model
y_test_pred_sentiments = model_with_sentiment_1.predict(X_sentiment_tests)
y_test_pred_sentiments = scaler_y_single.inverse_transform(y_test_pred_sentiments)

# Plot actual vs predicted
plt.plot(y_test_actuals, label='Actual Prices', color='blue')
plt.plot(y_test_preds, label='Price Model Predictions', color='red', linestyle='--')
plt.plot(y_test_pred_sentiments, label='Sentiment Model Predictions', color='green', linestyle=':')

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# scatter plot of actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test_actuals[:, 0], y_test_preds[:, 0], alpha=0.5, label='Price Model')
plt.scatter(y_test_actuals[:, 0], y_test_pred_sentiments[:, 0], alpha=0.5, label='Sentiment Model')

plt.plot([y_test_actuals.min(), y_test_actuals.max()], [y_test_actuals.min(), y_test_actuals.max()], 'k--', lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices on Test Set (single day)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pickle

In [None]:
with open('model_with_sentiment_1.pkl', 'wb') as file:
  pickle.dump(model_with_sentiment_1, file)

In [None]:
with open('model_price_1.pkl', 'wb') as file:
  pickle.dump(model_price_1, file)

In [None]:
# pickle dump scalers for single day prediction
with open('scaler_price_single.pkl', 'wb') as file:
  pickle.dump(scaler_price_single, file)

with open('scaler_sentiment_single.pkl', 'wb') as file:
  pickle.dump(scaler_sentiment_single, file)

with open('scaler_y_single.pkl', 'wb') as file:
  pickle.dump(scaler_y_single, file)


_______________________________________________________________________________________

# Model Testing (in progress)

In [None]:
!pip install finnhub-python

In [None]:
import pandas as pd
import finnhub
from supabase import create_client
import asyncio
import aiohttp
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import numpy as np

In [None]:
import urllib.request, json
import datetime as dt

In [None]:
# MAIN FUNCTION
# %%writefile main.py
# saves python file in colab as main.py

from load_dotenv import load_dotenv
load_dotenv()

import pandas as pd
import os
import urllib.request, json
import datetime as dt
import finnhub
from supabase import create_client
import asyncio
import aiohttp
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import numpy as np

url= os.environ.get("SUPABASE_URL")
key= os.environ.get("SUPABASE_KEY")
supabase= create_client(url, key)

API_KEY = os.getenv("ALPHA_VANTAGE_API_KEY")
finnhub_api_key = os.getenv("FINNHUB_API_KEY")
finnhub_client = finnhub.Client(api_key=finnhub_api_key)

# Model and tokenizer (FinBERT) for sentiment analysis
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def get_test_stock_data(symbol):
    """Download daily OHLCV bars for `symbol` from Alpha Vantage and keep the last 30 calendar days.

    NOTE(review): this cell defines API_KEY from the environment, but the request uses the
    notebook-level `alphavantage_key`; confirm which one is intended before exporting to main.py.

    Args:
        symbol: ticker symbol, interpolated into the request URL as given.

    Returns:
        DataFrame sorted by ascending date with columns
        date, open, high, low, close, volume, ticker (ticker upper-cased).

    Raises:
        KeyError: if the response has no 'Time Series (Daily)' entry
            (for example a rate-limit or error payload).
    """
    # 'compact' returns the latest 100 data points, which always covers the 30-day window
    # without downloading the full multi-year history.
    url_string = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=%s&outputsize=compact&apikey=%s" % (symbol, alphavantage_key)
    with urllib.request.urlopen(url_string) as response:
        payload = json.loads(response.read().decode())

    if 'Time Series (Daily)' not in payload:
        # Alpha Vantage reports problems in the JSON body rather than via the HTTP status
        detail = payload.get('Note') or payload.get('Information') or payload.get('Error Message') or payload
        raise KeyError(f"Alpha Vantage returned no daily series for {symbol}: {detail}")

    df = pd.DataFrame.from_dict(payload['Time Series (Daily)'], orient='index')
    df = df.reset_index().rename(columns={
        'index': 'date',
        '1. open': 'open',
        '2. high': 'high',
        '3. low': 'low',
        '4. close': 'close',
        '5. volume': 'volume',
    })
    df['date'] = pd.to_datetime(df['date'])

    # keep only the past 30 calendar days; copy so the assignments below
    # do not write into a slice of the unfiltered frame
    thirty_days = dt.datetime.today() - dt.timedelta(days=30)
    df = df[df['date'] > thirty_days].copy()

    # the API delivers every value as a string
    df = df.astype({'open': float, 'high': float, 'low': float, 'close': float, 'volume': int})

    df = df.sort_values(by='date', ascending=True)
    df['ticker'] = symbol.upper()
    return df

def fetch_news_test(symbol):
  """Fetch Finnhub company news for `symbol` from 31 days ago up to yesterday.

  NOTE(review): this cell defines finnhub_api_key from the environment, but the request
  uses the notebook-level `finnhub_key`; confirm which one is intended for main.py.

  Args:
      symbol: ticker symbol, interpolated into the request URL as given.

  Returns:
      DataFrame with columns ticker, date ('YYYY-MM-DD' string), headline, summary, url,
      sorted by ascending date with a fresh index. Empty (same columns) when the API
      returns no articles.

  Raises:
      ValueError: if the response is not a JSON list (e.g. an error object).
  """
  to_str = (dt.datetime.now() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
  from_str = (dt.datetime.today() - dt.timedelta(days=31)).strftime('%Y-%m-%d')

  request_url = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={from_str}&to={to_str}&token={finnhub_key}'
  with urllib.request.urlopen(request_url) as response:
    articles = json.loads(response.read().decode())

  if not isinstance(articles, list):
    raise ValueError(f"Unexpected Finnhub response for {symbol}: {articles}")

  columns = ['ticker', 'date', 'headline', 'summary', 'url']
  if not articles:
    # an empty list yields a frame without a 'datetime' column, which would raise below
    return pd.DataFrame(columns=columns)

  news_df = pd.DataFrame(articles)
  news_df['ticker'] = symbol.upper()
  # 'datetime' is converted from epoch seconds
  news_df['date'] = pd.to_datetime(news_df['datetime'], unit='s').dt.strftime('%Y-%m-%d')
  news_df = news_df[columns].sort_values(by='date', ascending=True).reset_index(drop=True)
  return news_df


def sentiment_analysis_test(news_df):
  """Score each article with the module-level `sentiment_pipeline` and return signed scores.

  The text scored is "<headline>. <summary>" with missing values treated as ''.

  Returns:
      One float per row of `news_df`: +score for a 'positive' label, -score for a
      'negative' label and 0.0 for anything else (i.e. 'neutral'). Previously every
      non-positive label, including a confident 'neutral', was returned as -score.
      Returns [] for an empty frame.
  """
  if len(news_df) == 0:
    return []

  texts = (news_df['headline'].fillna('') + '. ' + news_df['summary'].fillna('')).tolist()
  sentiments = sentiment_pipeline(texts, padding = True, truncation=True, batch_size=16)

  # neutral carries no direction, matching the 0.0 used downstream for days without news
  direction = {'positive': 1.0, 'negative': -1.0}
  scores = [
      direction.get(str(r['label']).lower(), 0.0) * r['score']
      for r in sentiments
  ]
  return scores


In [None]:
def get_test_stock_data(symbol):
    """Download daily OHLCV bars for `symbol` from Alpha Vantage and keep the last 30 calendar days.

    Args:
        symbol: ticker symbol, interpolated into the request URL as given.

    Returns:
        DataFrame sorted by ascending date with columns
        date, open, high, low, close, volume, ticker (ticker upper-cased).

    Raises:
        KeyError: if the response has no 'Time Series (Daily)' entry
            (for example a rate-limit or error payload).
    """
    # 'compact' returns the latest 100 data points, which always covers the 30-day window
    # without downloading the full multi-year history.
    url_string = "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=%s&outputsize=compact&apikey=%s" % (symbol, alphavantage_key)
    with urllib.request.urlopen(url_string) as response:
        payload = json.loads(response.read().decode())

    if 'Time Series (Daily)' not in payload:
        # Alpha Vantage reports problems in the JSON body rather than via the HTTP status
        detail = payload.get('Note') or payload.get('Information') or payload.get('Error Message') or payload
        raise KeyError(f"Alpha Vantage returned no daily series for {symbol}: {detail}")

    df = pd.DataFrame.from_dict(payload['Time Series (Daily)'], orient='index')
    df = df.reset_index().rename(columns={
        'index': 'date',
        '1. open': 'open',
        '2. high': 'high',
        '3. low': 'low',
        '4. close': 'close',
        '5. volume': 'volume',
    })
    df['date'] = pd.to_datetime(df['date'])

    # keep only the past 30 calendar days; copy so the assignments below
    # do not write into a slice of the unfiltered frame
    thirty_days = dt.datetime.today() - dt.timedelta(days=30)
    df = df[df['date'] > thirty_days].copy()

    # the API delivers every value as a string
    df = df.astype({'open': float, 'high': float, 'low': float, 'close': float, 'volume': int})

    df = df.sort_values(by='date', ascending=True)
    df['ticker'] = symbol.upper()
    return df

In [None]:
stock_df = get_test_stock_data('AAPL')

In [None]:
stock_df.tail(10)

In [None]:
stock_df.shape

In [None]:
# import os
# import datetime
# import pytz

# os.environ['TZ'] = 'America/Los_Angeles'
# dt.datetime.now(pytz.timezone(os.environ['TZ'])).strftime('%Y-%m-%d')

In [None]:
def fetch_news_test(symbol):
  """Fetch Finnhub company news for `symbol` from 31 days ago up to yesterday.

  Args:
      symbol: ticker symbol, interpolated into the request URL as given.

  Returns:
      DataFrame with columns ticker, date ('YYYY-MM-DD' string), headline, summary, url,
      sorted by ascending date with a fresh index. Empty (same columns) when the API
      returns no articles.

  Raises:
      ValueError: if the response is not a JSON list (e.g. an error object).
  """
  to_str = (dt.datetime.now() - dt.timedelta(days=1)).strftime('%Y-%m-%d')
  from_str = (dt.datetime.today() - dt.timedelta(days=31)).strftime('%Y-%m-%d')

  request_url = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={from_str}&to={to_str}&token={finnhub_key}'
  with urllib.request.urlopen(request_url) as response:
    articles = json.loads(response.read().decode())

  if not isinstance(articles, list):
    raise ValueError(f"Unexpected Finnhub response for {symbol}: {articles}")

  columns = ['ticker', 'date', 'headline', 'summary', 'url']
  if not articles:
    # an empty list yields a frame without a 'datetime' column, which would raise below
    return pd.DataFrame(columns=columns)

  news_df = pd.DataFrame(articles)
  news_df['ticker'] = symbol.upper()
  # 'datetime' is converted from epoch seconds
  news_df['date'] = pd.to_datetime(news_df['datetime'], unit='s').dt.strftime('%Y-%m-%d')
  news_df = news_df[columns].sort_values(by='date', ascending=True).reset_index(drop=True)
  return news_df



In [None]:
import asyncio
import aiohttp
import nest_asyncio

# Apply nest_asyncio patch for Jupyter notebooks
nest_asyncio.apply()

In [None]:
# function to fetch news for a date chunk asynchronously
async def fetch_news_chunk_test(session, symbol, from_str, to_str):
    """GET Finnhub company news for `symbol` between `from_str` and `to_str`.

    Returns the decoded JSON body on HTTP 200. On any other status, or when the
    request raises, a message is printed and an empty list is returned.
    """
    endpoint = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={from_str}&to={to_str}&token={finnhub_key}'
    try:
        async with session.get(endpoint, timeout=10) as reply:
            if reply.status != 200:
                print(f"Error {reply.status} for {from_str} to {to_str} for {symbol}")
                return []
            return await reply.json()
    except Exception as exc:
        # best-effort fetch: report the failure and let the caller continue with other chunks
        print(f"Request failed: {str(exc)}")
        return []

# function that calls fetch_news_chunk asynchronously
async def fetch_30_days_test(session, symbol):
    """Collect the last 30 days of company news for `symbol`, fetched in date chunks.

    The 30-day range ending today is walked in chunks of up to 6 calendar days
    (start .. start + 5); each chunk is requested through fetch_news_chunk_test,
    one after another.

    Args:
        session: HTTP session object passed through to fetch_news_chunk_test.
        symbol: ticker symbol; stored upper-cased in the 'ticker' column.

    Returns:
        DataFrame with columns ticker, id, datetime, headline, url, summary,
        de-duplicated on 'id' and sorted by datetime. Empty (same columns) when
        no chunk returned any article.
    """
    end_date = dt.datetime.today()
    start_date = end_date - dt.timedelta(days=30)
    all_articles = []

    current_start = start_date
    while current_start <= end_date:
        current_end = min(current_start + dt.timedelta(days=5), end_date)
        from_str = current_start.strftime('%Y-%m-%d')
        to_str = current_end.strftime('%Y-%m-%d')

        chunk = await fetch_news_chunk_test(session, symbol, from_str, to_str)

        if isinstance(chunk, list):
            all_articles.extend(chunk)
            print(f"{symbol}: Got {len(chunk)} articles from {from_str} to {to_str}")
            if len(chunk) >= 1000:
                print("Warning: Possible data truncation in chunk")

        # the next chunk starts the day after this one ended
        current_start = current_end + dt.timedelta(days=1)

    columns = ['ticker', 'id', 'datetime', 'headline', 'url', 'summary']
    if not all_articles:
        # an empty frame has no 'id' column, so drop_duplicates('id') would raise KeyError
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(all_articles).drop_duplicates('id')
    # epoch seconds -> timestamps; unparseable values become NaT and are dropped
    df['datetime'] = pd.to_datetime(df['datetime'], unit='s', errors='coerce')
    df = df.dropna(subset=['datetime'])
    df = df[df['datetime'].between('1970-01-01', '2262-04-11')]
    df['ticker'] = symbol.upper()
    return df[columns].sort_values('datetime')

In [None]:
async def main(symbol):
  """Open one aiohttp ClientSession and return fetch_30_days_test's result for `symbol`."""
  async with aiohttp.ClientSession() as http_session:
    return await fetch_30_days_test(http_session, symbol)


symbol = 'AAPL'
news = asyncio.run(main(symbol))
print(news.head())

In [None]:
april_news = fetch_news_test('AAPL')

In [None]:
april_news.tail(5)

In [None]:
news = fetch_news_test('AAPL')

In [None]:


news.head(5)

In [None]:
# Model and tokenizer (FinBERT) for sentiment analysis
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
def sentiment_analysis_test(news_df):
  """Score each article with the module-level `sentiment_pipeline` and return signed scores.

  The text scored is "<headline>. <summary>" with missing values treated as ''.

  Returns:
      One float per row of `news_df`: +score for a 'positive' label, -score for a
      'negative' label and 0.0 for anything else (i.e. 'neutral'). Previously every
      non-positive label, including a confident 'neutral', was returned as -score.
      Returns [] for an empty frame.
  """
  if len(news_df) == 0:
    return []

  texts = (news_df['headline'].fillna('') + '. ' + news_df['summary'].fillna('')).tolist()
  sentiments = sentiment_pipeline(texts, padding = True, truncation=True, batch_size=16)

  # neutral carries no direction, matching the 0.0 used downstream for days without news
  direction = {'positive': 1.0, 'negative': -1.0}
  scores = [
      direction.get(str(r['label']).lower(), 0.0) * r['score']
      for r in sentiments
  ]
  return scores


In [None]:

scores = sentiment_analysis_test(news)


In [None]:
len(scores)

In [None]:
news['sentiment_score'] = scores
# `news` may come from fetch_30_days_test (has a 'datetime' column) or from fetch_news_test
# (has only a 'date' string column); reading 'datetime' unconditionally raised KeyError
# for the latter.
date_source = 'datetime' if 'datetime' in news.columns else 'date'
news['date'] = pd.to_datetime(news[date_source]).dt.date

# Take the mean and the article count from the SAME grouping so the rows are guaranteed to line up.
sentiment_by_day = news.groupby(['date', 'ticker'])['sentiment_score']
daily_sentiment_test = sentiment_by_day.mean().reset_index()
daily_sentiment_test['num_articles'] = sentiment_by_day.count().values
stock_df['date'] = pd.to_datetime(stock_df['date']).dt.date

# merge stock prices with daily average sentiment
test_df = pd.merge(stock_df, daily_sentiment_test, on=['ticker', 'date'], how='left')
test_df.rename(columns= {"sentiment_score" :"avg_sentiment"}, inplace=True)

test_df['avg_sentiment']= test_df['avg_sentiment'].fillna(0.0)  # Fill missing sentiment as neutral
test_df['num_articles'] = test_df['num_articles'].fillna(0)

# Adding more features related to sentiment (lagged sentiment, moving averages, etc.)
test_df['sentiment_ma3'] = test_df['avg_sentiment'].rolling(window=3).mean()
test_df['sentiment_lag1'] = test_df['avg_sentiment'].shift(1)
test_df['log_article_count'] = np.log(test_df['num_articles'] + 1)
test_df['sentiment_volatility'] = test_df['avg_sentiment'].rolling(window=7).std()
test_df['sentiment_close_corr'] = test_df['avg_sentiment'].rolling(5).corr(test_df['close'])

# The rolling/lagged features are NaN for their warm-up rows (and the correlation is
# undefined on constant windows). The training frame was passed through fillna(0),
# so do the same here instead of feeding NaN into the scalers and models.
derived_cols = ['sentiment_ma3', 'sentiment_lag1', 'sentiment_volatility', 'sentiment_close_corr']
test_df[derived_cols] = test_df[derived_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

test_df.head(5)

In [None]:
test_df.info()

In [None]:
daily_sentiment_test.shape

In [None]:
# create a day and month column
test_df['date'] = pd.to_datetime(test_df['date'])
test_df['day'] = test_df['date'].dt.day
test_df['month'] = test_df['date'].dt.month
# Adding a flag to check if there is a sentiment score or not
# based on the number of news articles, set the flag
test_df['has_sentiment'] = test_df['num_articles'].apply(lambda x: 1 if x>0 else 0)

In [None]:
# Select exactly the columns, in the same order, that the scalers and models were fitted on.
# Dropping columns instead left 'ticker' (a string) and 'num_articles' in the price matrix
# and removed the price columns from the sentiment matrix, although the sentiment model
# was trained on price_features + sentiment_features.
test_df_price = test_df[price_features]
test_df_sentiment = test_df[price_features + sentiment_features]

In [None]:
# load model from pkl file
# Loads the single-day models (the *_1.pkl files) from the mounted Google Drive.
# NOTE: this rebinds `model_price`, which earlier in the notebook refers to the 5-day
# price model, so cells run after this one see the single-day model under that name.
# NOTE(review): pickle.load executes arbitrary code from the file; only load files you created.
with open('/content/drive/MyDrive/model_price_1.pkl', 'rb') as file:
  model_price = pickle.load(file)

with open('/content/drive/MyDrive/model_with_sentiment_1.pkl', 'rb') as file:
  model_sentiment = pickle.load(file)

# convert the feature frames to plain numpy arrays for the scalers
test_df_price = test_df_price.values
test_df_sentiment = test_df_sentiment.values


In [None]:
# Function to make predictions for the next 5 days
def predict_next_5_days_test(model, recent_data, scaler_x, scaler_y, lookback=None):
    """Scale one window of recent rows, run `model` on it and return the prediction in original units.

    The window length is taken from the data instead of being hard-coded to 21 rows,
    which failed whenever the 30-calendar-day frame did not contain exactly 21 trading days.

    Args:
        model: object exposing predict(); called with shape (1, n_rows, n_features).
        recent_data: 2-D array of unscaled feature rows, oldest first.
        scaler_x: fitted feature scaler, used via transform().
        scaler_y: fitted target scaler, used via inverse_transform().
        lookback: optional number of most recent rows to use; None (default) uses all rows.

    Returns:
        The inverse-transformed model output (2-D array, one row per window).

    Raises:
        ValueError: if `lookback` is given and fewer rows are available.
    """
    scaled_input = scaler_x.transform(recent_data)

    if lookback is not None:
        if len(scaled_input) < lookback:
            raise ValueError(
                f"Need at least {lookback} rows of recent data, got {len(scaled_input)}"
            )
        scaled_input = scaled_input[-lookback:]

    # a batch of exactly one window: (1, timesteps, features)
    reshaped_input = scaled_input.reshape(1, scaled_input.shape[0], scaled_input.shape[1])

    # Make prediction
    scaled_prediction = model.predict(reshaped_input)

    # Inverse transform to get actual prices
    prediction = scaler_y.inverse_transform(scaled_prediction)

    return prediction

In [None]:
# predict prices for next days
price_prediction = predict_next_5_days_test(model_price, test_df_price, scaler_price, scaler_y_single)
sentiment_price_prediction = predict_next_5_days_test(model_sentiment, test_df_sentiment, scaler_sentiment, scaler_y_single)

# print the results
for i in range(1):
  print(f"Day {i+1}: Price Prediction = {price_prediction[i][0]} ----|---- Sentiment Prediction = {sentiment_price_prediction[i][0]}")

In [None]:
# Define features for both models
# These must match, column for column, the lists used when the scalers were fitted;
# 'sentiment_close_corr' was missing here, giving 13 columns to a scaler fitted on 14.
price_features = ['open', 'high', 'low', 'close', 'volume', 'day', 'month']
sentiment_features = ['avg_sentiment', 'sentiment_ma3', 'sentiment_lag1', 'log_article_count', 'sentiment_volatility', 'sentiment_close_corr', 'has_sentiment']

# Create datasets (NaN filled with 0, as was done for the training frame)
test_df_price = test_df[price_features].fillna(0).values
test_df_sentiment = test_df[price_features + sentiment_features].fillna(0).values

# scale the data
test_df_price_scaled = scaler_price.transform(test_df_price)
test_df_sentiment_scaled = scaler_sentiment.transform(test_df_sentiment)

# create sequences for LSTM
# NOTE(review): this redefines create_sequences with a different signature and return value
# than the training helper create_sequences(data, targets, lookback) defined earlier;
# re-running the training cells after this cell will fail until that helper is redefined.
def create_sequences(data, lookback=21):
    """Return every contiguous window of `lookback` rows from `data` as one array."""
    X_seq = []
    for i in range(len(data) - lookback + 1):
        X_seq.append(data[i:i + lookback])
    return np.array(X_seq)

# sequences for the price only model
test_df_price_seq = create_sequences(test_df_price_scaled, lookback=21)

# sequences for the sentiment model
test_df_sentiment_seq = create_sequences(test_df_sentiment_scaled, lookback=21)

# predict prices for next days
# predict_next_5_days_test scales its input and inverse-transforms its output itself,
# so it receives the unscaled matrices and its result is already in price units.
price_prediction = predict_next_5_days_test(model_price, test_df_price, scaler_price, scaler_y_single)
sentiment_price_prediction = predict_next_5_days_test(model_sentiment, test_df_sentiment, scaler_sentiment, scaler_y_single)

# (No second scaler_y_single.inverse_transform here: applying it again to values that are
# already prices produced wrong results.)

# print the results
for i in range(len(price_prediction)):
    print(f"Day {i+1}: Price Prediction = {price_prediction[i][0]:.2f} ----|---- Sentiment Prediction = {sentiment_price_prediction[i][0]:.2f}")

In [None]:
test_df_price_seq.shape