#### **Imports**

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate

#### **Data preperation**

Read data files

In [2]:
stocks = pd.read_csv("../data/processed/Stocks4.csv")
news = pd.read_csv("../data/interim/News2.csv")

In [3]:
stocks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11164 entries, 0 to 11163
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            11164 non-null  object 
 1   open            11164 non-null  float64
 2   adj_close       11164 non-null  float64
 3   rolling_ma      11164 non-null  float64
 4   RSI             11164 non-null  float64
 5   MACD            11164 non-null  float64
 6   Signal_Line     11164 non-null  float64
 7   MACD_Histogram  11164 non-null  float64
 8   SMA_20          11164 non-null  float64
 9   upper_band      11164 non-null  float64
 10  lower_band      11164 non-null  float64
dtypes: float64(10), object(1)
memory usage: 959.5+ KB


In [4]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555 entries, 0 to 4554
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   publish_date         4555 non-null   object 
 1   headline_category    4555 non-null   object 
 2   headline_text        4555 non-null   object 
 3   main_category        4555 non-null   object 
 4   sub_category         3363 non-null   object 
 5   sentiment            4555 non-null   object 
 6   pos_score            4555 non-null   float64
 7   neg_score            4555 non-null   float64
 8   neu_score            4555 non-null   float64
 9   sentiment_label_num  4555 non-null   int64  
dtypes: float64(3), int64(1), object(6)
memory usage: 356.0+ KB


drop unnecessary columns from data

In [5]:
news.drop(columns = ['headline_category', 'main_category', 'sub_category', 'sentiment'], inplace = True)

In [6]:
stocks.head()

Unnamed: 0,date,open,adj_close,rolling_ma,RSI,MACD,Signal_Line,MACD_Histogram,SMA_20,upper_band,lower_band
0,1980-12-15,0.094005,0.093575,0.097789,0.0,-0.000411,-8.2e-05,-0.000329,0.098235,0.105519,0.090951
1,1980-12-16,0.087136,0.086707,0.095774,0.0,-0.001276,-0.000321,-0.000955,0.097137,0.110273,0.084001
2,1980-12-17,0.088853,0.088853,0.094516,17.932971,-0.001768,-0.00061,-0.001158,0.096348,0.109358,0.083339
3,1980-12-18,0.091429,0.091429,0.093955,34.260996,-0.001928,-0.000874,-0.001054,0.09588,0.107771,0.083989
4,1980-12-19,0.097009,0.097009,0.09451,56.093525,-0.001586,-0.001016,-0.00057,0.095988,0.106579,0.085396


In [7]:
news.head()

Unnamed: 0,publish_date,headline_text,pos_score,neg_score,neu_score,sentiment_label_num
0,2001-07-29,UTI grapples with potential Rs 1;700 cr pay-out,5.954629e-05,5.5e-05,0.999885,0
1,2001-09-17,Gujarat Samachar keeps markets guessing,1.412349e-05,0.001669,0.998317,0
2,2001-10-10,Sony to spend Rs 2.5 cr in ads for audios,2.925312e-07,0.000155,0.999844,0
3,2001-10-14,US firms grapple with rules after WTC attacks,0.0003226345,0.924971,0.074706,-1
4,2001-10-29,Govt to spend Rs 225cr on info kiosks in N-E,1.882406e-08,4e-06,0.999996,0


Concatenate data based on date column from stocks, which is equivalent to publish_date columns on the right

In [8]:
df = pd.merge(left=stocks, right=news, left_on="date", right_on="publish_date", sort='date')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3727 entries, 0 to 3726
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 3727 non-null   object 
 1   open                 3727 non-null   float64
 2   adj_close            3727 non-null   float64
 3   rolling_ma           3727 non-null   float64
 4   RSI                  3727 non-null   float64
 5   MACD                 3727 non-null   float64
 6   Signal_Line          3727 non-null   float64
 7   MACD_Histogram       3727 non-null   float64
 8   SMA_20               3727 non-null   float64
 9   upper_band           3727 non-null   float64
 10  lower_band           3727 non-null   float64
 11  publish_date         3727 non-null   object 
 12  headline_text        3727 non-null   object 
 13  pos_score            3727 non-null   float64
 14  neg_score            3727 non-null   float64
 15  neu_score            3727 non-null   f

In [10]:
df.tail()

Unnamed: 0,date,open,adj_close,rolling_ma,RSI,MACD,Signal_Line,MACD_Histogram,SMA_20,upper_band,lower_band,publish_date,headline_text,pos_score,neg_score,neu_score,sentiment_label_num
3722,2023-06-15,182.37525,41307.297363,40380.336207,80.07507,821.215688,731.108692,90.106996,39744.368189,42217.019695,37271.716683,2023-06-15,Apple to may make 18% of iPhones in India by t...,2.926116e-06,3e-06,0.999994,0
3723,2023-06-16,185.12137,41065.240234,40504.864212,71.421449,826.339499,750.154854,76.184645,39870.165527,42353.05055,37387.280503,2023-06-16,Why Twitter founder Jack Dorsey is not impress...,0.001738672,0.590583,0.407678,-1
3724,2023-06-21,183.307149,40852.058105,40654.32556,63.061677,791.573804,770.018893,21.554911,40068.378411,42477.369141,37659.387681,2023-06-21,Apple is changing the ad format for developers...,1.054115e-05,0.00637,0.993619,0
3725,2023-06-28,186.311024,42026.803711,41279.156945,73.821424,817.820628,792.664835,25.155793,40675.634642,43138.937128,38212.332155,2023-06-28,Here's what Apple has to say on UK law opposed...,4.154859e-07,5.7e-05,0.999943,0
3726,2023-06-30,189.979188,43074.981934,41728.123109,83.366198,912.655179,823.086132,89.569047,41027.07739,43764.603564,38289.551217,2023-06-30,Global macro; financial market risk perception...,0.003025393,0.996643,0.000332,-1


##### Prepare data

Set window size to 5, meaning we should consider past 5 values

In [11]:
window_size = 5

- Create Sequence of stocks data with requeired 
- The sequence contains past 5 data points for each record

In [12]:
x = [np.nan for i in range(window_size)]

for i in range(len(df) - window_size):
    x.append(df.iloc[i : i + window_size, [1, 3, 4, 5, 6, 7, 8, 9, 10]].values)

In [13]:
# store it in data back
df['sequence'] = x

In [14]:
# remove the null values (because of sequence)
df.dropna(inplace=True)

Save the data

In [15]:
df.to_csv("../Data/processed/final_data1.csv")

- Take only required columns as inputs and output
- X1 contains stock input which is simply the sequence we generated
- X2 contains the news data, which is the sentiment scores
- y contains the output data

In [16]:
X1 = df['sequence']
X2 = df[['pos_score', 'neg_score', "neu_score", 'sentiment_label_num']]

y = df['adj_close']

Split the data for training and test

In [17]:
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

Scale the data into relevant range for better performance

In [18]:
# for stock input data
stock_input_scaler = StandardScaler()
# for news input data
news_input_scaler = StandardScaler()
# for output data
target_scaler = StandardScaler()

In [19]:
# convert the data into numpy array
X1_array = np.array(X1_train.tolist())
samples, timesteps, features = X1_array.shape

# reshape to 2d since standard scaler won't work with 3d
X1_reshaped = X1_array.reshape((samples, timesteps * features))

X1_reshaped_scaled = stock_input_scaler.fit_transform(X1_reshaped)

# again reshape to 3d
X1_scaled = X1_reshaped_scaled.reshape((samples, timesteps, features))

X1_scaled.shape

(2977, 5, 9)

In [20]:
# scale news data
X2_array = X2_train.values

X2_scaled = news_input_scaler.fit_transform(X2_array)

X2_scaled.shape

(2977, 4)

In [21]:
# scale output
y_scaled = target_scaler.fit_transform(y_train.values.reshape(-1,1))

y_scaled.shape

(2977, 1)

In [22]:
X1_scaled.shape, X2_scaled.shape, y_scaled.shape

((2977, 5, 9), (2977, 4), (2977, 1))

##### Define model and train

model for time series data is imported from the previously train model

In [23]:
# for news input
time_series_input = Input(shape=(window_size, 9), name="time_series_input")
# model
time_series = load_model("../models/stock-prediction.h5", custom_objects={"mse": tf.keras.losses.MeanSquaredError()})
# pass input to model to get output
time_series_output = time_series(time_series_input)



For news data use dense layer which takes already processed sentiment scores as input

In [24]:
# FinBERT Sentiment Branch (Sentiment Scores + Label)
# Pos, Neg, Neu, Sentiment Label (one-hot encoded)
news_sentiment_input = Input(shape=(4,), name="news_sentiment_input") 
news_sentiment = Dense(16, activation="relu")(news_sentiment_input)

Merge the two models to get final output

In [25]:
# # Merge Both Inputs
merged = Concatenate()([time_series_output, news_sentiment])
merged = Dense(32, activation="relu")(merged)
merged = Dense(16, activation="relu")(merged)
final_output = Dense(1, activation="linear", name="predicted_stock_price")(merged)

Build and Compile the model

In [26]:
# # Build Model
model = tf.keras.Model(inputs=[time_series_input ,news_sentiment_input], outputs=final_output)
model.compile(optimizer="adam", loss = tf.keras.losses.MeanSquaredError())

# # Model Summary
model.summary()


##### Training the model

In [27]:
h = model.fit([X1_scaled, X2_scaled], y_scaled, epochs=10)

Epoch 1/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 0.6734
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0132
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0093
Epoch 4/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0030
Epoch 5/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0042
Epoch 6/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0024
Epoch 7/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0034
Epoch 8/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0031
Epoch 9/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0034
Epoch 10/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0060


##### Model Evaluation

Scale Test inputs and outputs

In [28]:
# for test stock inputs
X1_test_array = np.array(X1_test.tolist())
samples, timesteps, features = X1_test_array.shape

X1_reshaped = X1_test_array.reshape((samples, timesteps * features))

X1_reshaped_scaled = stock_input_scaler.transform(X1_reshaped)

X1_test_scaled = X1_reshaped_scaled.reshape((samples, timesteps, features))

X1_test_scaled.shape

(745, 5, 9)

In [29]:
# for test news inputs
X2_test_array = X2_test.values

X2_test_scaled = news_input_scaler.transform(X2_test_array)

X2_test_scaled.shape

(745, 4)

In [30]:
# for test outputs
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1,1))

y_test_scaled.shape

(745, 1)

evaluate model (gives error)

In [31]:
model.evaluate([X1_test_scaled, X2_test_scaled], y_test_scaled)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0031 


0.0033175956923514605

Check R2_Score

In [32]:
r2_score(model.predict([X1_test_scaled, X2_test_scaled]), y_test_scaled)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


0.9969121941292477

In [33]:
target_scaler.inverse_transform(y_test_scaled[15].reshape(1,-1))

array([[1937.59265137]])

In [34]:
pred = model.predict([X1_test_scaled[15].reshape(1,timesteps, features), X2_test_scaled[10].reshape(1,-1)])
target_scaler.inverse_transform(pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


array([[2000.5894]], dtype=float32)

##### Save the model

In [35]:
model.save("../models/final_model.h5")



In [36]:
model.save_weights("../models/final.weights.h5")

In [37]:
import joblib

In [38]:
joblib.dump(stock_input_scaler, '../models/artifacts/stock_scaler.pkl')
joblib.dump(news_input_scaler, '../models/artifacts/news_scaler.pkl')
joblib.dump(target_scaler, '../models/artifacts/target_scaler.pkl')

['../models/artifacts/target_scaler.pkl']