# Forecasting: Stock Prediction 📊

Dataset source: Yahoo Finance, downloaded with the [yfinance Python library](https://pypi.org/project/yfinance/)

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import plotly.express as px
import plotly.io as pio
import dask
import warnings
import yfinance as yf
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from datetime import date
from datetime import timedelta

# Reference dates for the notebook; `yesterday` is used as the `end` bound of
# the yfinance downloads in the next section.
today = date.today()
print("Today is: ", today)

yesterday = today - timedelta(days = 1)
print("Yesterday was: ", yesterday)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Select the 'vscode' Plotly renderer for figure display.
# NOTE(review): figures may not render in other front ends (JupyterLab,
# nbviewer, ...) with this setting — confirm the target environment.
pio.renderers.default = 'vscode'

# Default template for plotly.express figures.
px.defaults.template = "plotly_dark"

Today is:  2024-12-26
Yesterday was:  2024-12-25


## Download datasets from Yahoo Finance

In [2]:
# Download daily price history for each ticker, from 2020-01-01 up to `yesterday`.
# `auto_adjust=True` is passed explicitly: the default of this flag differs
# between yfinance releases, so pinning it keeps the returned columns
# (Close/High/Low/Open/Volume, no separate 'Adj Close') independent of the
# installed version.
# NOTE(review): yfinance documents `end` as exclusive — confirm that stopping
# one day before `yesterday` is intended.
df_apple = yf.download('AAPL', start='2020-01-01', end=yesterday, auto_adjust=True)
df_samsung = yf.download('005930.KS', start='2020-01-01', end=yesterday, auto_adjust=True)
df_xiaomi = yf.download('1810.HK', start='2020-01-01', end=yesterday, auto_adjust=True)
df_nvidia = yf.download('NVDA', start='2020-01-01', end=yesterday, auto_adjust=True)
df_google = yf.download('GOOG', start='2020-01-01', end=yesterday, auto_adjust=True)
df_amazon = yf.download('AMZN', start='2020-01-01', end=yesterday, auto_adjust=True)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
def edit_df(df):
    """Return a copy of ``df`` with flat column names and the index as a column.

    The caller's frame is never modified, and calling the function again on
    its own output returns an equal frame, so the notebook cell that rebinds
    ``df_x = edit_df(df_x)`` can be re-run safely.

    Args:
        df: Frame to clean. When its columns are a ``pd.MultiIndex`` only the
            first level of each column label is kept.

    Returns:
        A new DataFrame. The index is moved into a column via
        ``reset_index()`` unless it is already an unnamed 0..n-1 RangeIndex.
    """
    flat = df.copy()

    # Only tuple-valued (MultiIndex) labels are flattened; applying `c[0]` to
    # plain string labels would truncate them to their first character.
    if isinstance(flat.columns, pd.MultiIndex):
        flat.columns = [label[0] for label in flat.columns]

    idx = flat.index
    has_default_index = (
        isinstance(idx, pd.RangeIndex)
        and idx.name is None
        and idx.start == 0
        and idx.step == 1
    )
    # Resetting an already-default index would only add a spurious 'index' column.
    if not has_default_index:
        flat = flat.reset_index()
    return flat

In [4]:
# Clean every downloaded frame with edit_df and rebind each name to the result.
(df_apple, df_samsung, df_xiaomi,
 df_nvidia, df_google, df_amazon) = map(
    edit_df,
    (df_apple, df_samsung, df_xiaomi, df_nvidia, df_google, df_amazon),
)

In [5]:
df_apple

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,72.796021,72.856613,71.545387,71.799873,135480400
1,2020-01-03,72.088295,72.851761,71.862892,72.020432,146322800
2,2020-01-06,72.662704,72.701485,70.953995,71.206062,118387200
3,2020-01-07,72.320984,72.929329,72.100426,72.672417,108872000
4,2020-01-08,73.484352,73.787315,72.022858,72.022858,132079200
...,...,...,...,...,...,...
1249,2024-12-18,248.050003,254.279999,247.740005,252.160004,56774100
1250,2024-12-19,249.789993,252.000000,247.089996,247.500000,60882300
1251,2024-12-20,254.490005,255.000000,245.690002,248.039993,147495300
1252,2024-12-23,255.270004,255.649994,253.449997,254.770004,40858800


In [6]:
df_samsung

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,48825.464844,49533.080276,48648.560986,49090.820631,12993228
1,2020-01-03,49090.828125,50063.799493,48560.116470,49533.087838,15422255
2,2020-01-06,49090.828125,49179.280068,48294.760642,48560.116470,10278951
3,2020-01-07,49356.179688,49886.891297,49179.275818,49267.727753,10009778
4,2020-01-08,50240.699219,50771.410830,49444.631802,49709.987607,23501171
...,...,...,...,...,...,...
1221,2024-12-18,54900.000000,55400.000000,54000.000000,54100.000000,13698937
1222,2024-12-19,53100.000000,53800.000000,53100.000000,53500.000000,22481925
1223,2024-12-20,53000.000000,53100.000000,51900.000000,52700.000000,24674774
1224,2024-12-23,53500.000000,54000.000000,53300.000000,53400.000000,13672650


In [7]:
df_xiaomi

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,11.220000,11.220000,10.780000,10.860000,248889455
1,2020-01-03,10.900000,11.440000,10.900000,11.200000,219547199
2,2020-01-06,10.980000,11.180000,10.640000,10.720000,179639996
3,2020-01-07,11.000000,11.300000,10.920000,11.060000,171287427
4,2020-01-08,11.080000,11.280000,10.820000,10.820000,189314498
...,...,...,...,...,...,...
1222,2024-12-18,30.799999,30.900000,30.049999,30.450001,86680411
1223,2024-12-19,30.799999,31.049999,30.000000,30.200001,85886142
1224,2024-12-20,31.650000,32.099998,30.700001,30.700001,204413958
1225,2024-12-23,31.750000,32.250000,31.450001,32.000000,97720054


In [8]:
df_nvidia

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,5.972711,5.972711,5.893294,5.943832,237536000
1,2020-01-03,5.877111,5.920927,5.828066,5.852962,205384000
2,2020-01-06,5.901757,5.906985,5.757612,5.783752,262636000
3,2020-01-07,5.973209,6.019017,5.885078,5.930139,314856000
4,2020-01-08,5.984412,6.025739,5.928895,5.968976,277108000
...,...,...,...,...,...,...
1249,2024-12-18,128.910004,136.699997,128.279999,133.860001,277444500
1250,2024-12-19,130.679993,134.029999,129.550003,131.759995,209719200
1251,2024-12-20,134.699997,135.279999,128.220001,129.809998,306528600
1252,2024-12-23,139.669998,139.789993,135.119995,136.279999,176053500


In [9]:
df_google

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,68.123726,68.162086,66.837348,66.837348,28132000
1,2020-01-03,67.789421,68.379304,67.036329,67.151713,23728000
2,2020-01-06,69.460922,69.575007,67.258334,67.258334,34646000
3,2020-01-07,69.417580,69.898350,69.270107,69.646760,30054000
4,2020-01-08,69.964615,70.326314,69.293024,69.354799,30560000
...,...,...,...,...,...,...
1249,2024-12-18,190.149994,198.690002,189.279999,196.830002,27638400
1250,2024-12-19,189.699997,194.600006,189.520004,193.279999,26981200
1251,2024-12-20,192.960007,194.134995,186.369995,187.009995,45319700
1252,2024-12-23,195.990005,196.490005,191.630005,194.029999,15235900


In [10]:
df_amazon

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,94.900497,94.900497,93.207497,93.750000,80580000
1,2020-01-03,93.748497,94.309998,93.224998,93.224998,75288000
2,2020-01-06,95.143997,95.184502,93.000000,93.000000,81236000
3,2020-01-07,95.343002,95.694504,94.601997,95.224998,80898000
4,2020-01-08,94.598503,95.550003,94.321999,94.902000,70160000
...,...,...,...,...,...,...
1249,2024-12-18,220.520004,231.399994,220.110001,230.770004,43281400
1250,2024-12-19,223.289993,226.089996,222.919998,224.910004,39918700
1251,2024-12-20,224.919998,226.210007,218.729996,219.839996,88279200
1252,2024-12-23,225.059998,226.880005,223.899994,225.009995,28070000


### Datasets are made of the following:
- **Open:** Opening stock price of the day
- **Close:** Closing stock price of the day
- **High:** Highest stock price of the day
- **Low:** Lowest stock price of the day
- **Volume:** Total number of shares traded during the day

## Data visualisation: lines and candlestick graphs

In [11]:
def lines_graph(df, sym):
    """Show a line chart of the closing price for one ticker.

    Args:
        df: Frame providing the 'Date' (x axis) and 'Close' (y axis) columns.
        sym: Ticker symbol passed to ``yf.Ticker`` to look up the company
            name used in the chart title.
    """
    # Fall back to the raw symbol when the ticker metadata has no 'longName'
    # entry, instead of aborting the plot with a KeyError.
    company = yf.Ticker(sym).info.get('longName', sym)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['Date'],
                             y=df['Close'],
                             mode='lines',
                             name='Close Price'))

    # NOTE(review): the axis label hard-codes '$'; this notebook also calls the
    # function with '005930.KS' and '1810.HK', which are presumably not
    # USD-quoted — confirm.
    fig.update_layout(
        title=f"Closing Price of {company} Stock",
        xaxis_title="Date",
        yaxis_title="Close Price ($)",
        height=500,
        margin=dict(t=50, b=50)
    )
    fig.show()

In [12]:
# One closing-price line chart per ticker, drawn in the listed order.
for _frame, _symbol in (
    (df_apple, 'AAPL'),
    (df_samsung, '005930.KS'),
    (df_xiaomi, '1810.HK'),
    (df_nvidia, 'NVDA'),
    (df_google, 'GOOG'),
    (df_amazon, 'AMZN'),
):
    lines_graph(_frame, _symbol)


In [13]:
# Overlay the closing price of all six tickers on a single figure.
# NOTE(review): all series share one y axis labelled '$'; the Samsung and
# Xiaomi frames are presumably quoted in other currencies — confirm whether a
# shared axis is meaningful here.
fig = go.Figure()
for _frame, _label in (
    (df_apple, 'Apple Close Price'),
    # Legend label previously read 'SamsungClose Price' (missing space).
    (df_samsung, 'Samsung Close Price'),
    (df_xiaomi, 'Xiaomi Close Price'),
    (df_nvidia, 'Nvidia Close Price'),
    (df_google, 'Google Close Price'),
    (df_amazon, 'Amazon Close Price'),
):
    # interpolate() fills any missing closing prices so each line is drawn
    # without gaps.
    fig.add_trace(go.Scatter(x=_frame['Date'],
                             y=_frame['Close'].interpolate(),
                             mode='lines',
                             name=_label))


fig.update_layout(
    title="Closing Price of Apple, Xiaomi, Samsung, Nvidia, Google, and Amazon Stock",
    xaxis_title="Date",
    yaxis_title="Close Price ($)",
    height=500,
    margin=dict(t=50, b=50))


fig.show()


In [14]:
def candlestick_graph(df, sym):
    """Show an open/high/low/close candlestick chart for one ticker.

    Args:
        df: Frame providing the 'Date', 'Open', 'High', 'Low' and 'Close' columns.
        sym: Ticker symbol passed to ``yf.Ticker`` to look up the company
            name used in the chart title.
    """
    # Fall back to the raw symbol when the ticker metadata has no 'longName'
    # entry, instead of aborting the plot with a KeyError.
    company = yf.Ticker(sym).info.get('longName', sym)

    # Plain string keys: the original f-strings had no placeholders.
    fig = go.Figure(data=go.Candlestick(x=df['Date'],
                                        open=df['Open'],
                                        high=df['High'],
                                        low=df['Low'],
                                        close=df['Close']))
    fig.update_layout(
        title=f"{company} stock candlestick chart ($)",
        height=500,
        margin=dict(t=50, b=50)
    )

    fig.show()

In [15]:
# One candlestick chart per ticker, drawn in the listed order.
for _frame, _symbol in (
    (df_apple, "AAPL"),
    (df_samsung, '005930.KS'),
    (df_xiaomi, '1810.HK'),
    (df_nvidia, 'NVDA'),
    (df_google, 'GOOG'),
    (df_amazon, 'AMZN'),
):
    candlestick_graph(_frame, _symbol)


## Train-Test split

We split each dataset chronologically as follows:
- **10%** for test (the most recent rows)
- **90%** for train

In [16]:
# split datasets to train and test
def split_data(df, train_fraction=0.9):
    """Split a frame by row position into a leading train part and a trailing test part.

    Rows are not shuffled: the first ``int(len(df) * train_fraction)`` rows
    form the train set and the remaining rows form the test set.

    Args:
        df: Frame to split.
        train_fraction: Share of rows assigned to the train set, between 0
            and 1 inclusive. Defaults to 0.9.

    Returns:
        Tuple ``(train, test)`` of positional ``iloc`` slices of ``df``.

    Raises:
        ValueError: If ``train_fraction`` is outside the range [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )
    train_size = int(len(df) * train_fraction)
    return df.iloc[:train_size], df.iloc[train_size:]

# Split every ticker frame; each (train, test) pair is bound in the listed order.
(
    (train_apple, test_apple),
    (train_samsung, test_samsung),
    (train_xiaomi, test_xiaomi),
    (train_nvidia, test_nvidia),
    (train_google, test_google),
    (train_amazon, test_amazon),
) = map(
    split_data,
    (df_apple, df_samsung, df_xiaomi, df_nvidia, df_google, df_amazon),
)


In [17]:
def train_test_graph(df, sym, train_data, test_data):
    """Show the train and test closing prices of one ticker as two lines.

    Args:
        df: Unused; kept so existing calls keep working.
        sym: Ticker symbol passed to ``yf.Ticker`` to look up the company
            name used in the chart title.
        train_data: Frame with 'Date' and 'Close' columns for the train split.
        test_data: Frame with 'Date' and 'Close' columns for the test split.
    """
    # Fall back to the raw symbol when the ticker metadata has no 'longName'
    # entry, instead of aborting the plot with a KeyError.
    company = yf.Ticker(sym).info.get('longName', sym)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train_data['Date'],
                             y=train_data['Close'],
                             mode='lines',
                             name='Train close Price'))
    fig.add_trace(go.Scatter(x=test_data['Date'],
                             y=test_data['Close'],
                             mode='lines',
                             name='Test close Price'))

    fig.update_layout(
        title=f"Closing Price of {company} Stock ($)",
        xaxis_title="Date",
        yaxis_title="Close Price ($)",
        height=500,
        margin=dict(t=50, b=50)
    )
    fig.show()

# One train/test overview chart per ticker, drawn in the listed order.
for _call_args in (
    (df_apple, 'AAPL', train_apple, test_apple),
    (df_samsung, '005930.KS', train_samsung, test_samsung),
    (df_xiaomi, '1810.HK', train_xiaomi, test_xiaomi),
    (df_nvidia, 'NVDA', train_nvidia, test_nvidia),
    (df_google, 'GOOG', train_google, test_google),
    (df_amazon, 'AMZN', train_amazon, test_amazon),
):
    train_test_graph(*_call_args)

## Normalization: MinMax Scaling

In [58]:
def normalize_data(train, test):
    """Scale the 'Close' column of both splits with one MinMaxScaler.

    The scaler is fitted on the train split only and then applied unchanged
    to the test split.

    Returns:
        Tuple ``(scaled_train, scaled_test, scaler)``; both arrays are
        single-column 2-D arrays.
    """
    def _close_as_column(frame):
        # The scaler is given a 2-D array: one column, one row per observation.
        return np.array(frame['Close']).reshape(-1, 1)

    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(_close_as_column(train))
    scaled_test = scaler.transform(_close_as_column(test))
    return scaled_train, scaled_test, scaler


# Normalize every ticker; each result is a (scaled_train, scaled_test, scaler) triple.
(
    (normalized_train_apple, normalized_test_apple, scaler_apple),
    (normalized_train_samsung, normalized_test_samsung, scaler_samsung),
    (normalized_train_xiaomi, normalized_test_xiaomi, scaler_xiaomi),
    (normalized_train_nvidia, normalized_test_nvidia, scaler_nvidia),
    (normalized_train_google, normalized_test_google, scaler_google),
    (normalized_train_amazon, normalized_test_amazon, scaler_amazon),
) = (
    normalize_data(_train_part, _test_part)
    for _train_part, _test_part in (
        (train_apple, test_apple),
        (train_samsung, test_samsung),
        (train_xiaomi, test_xiaomi),
        (train_nvidia, test_nvidia),
        (train_google, test_google),
        (train_amazon, test_amazon),
    )
)


In [59]:
def create_sequences(data, sequence_length=30):
    """Build sliding windows and next-step targets for LSTM training.

    Window ``i`` is ``data[i:i + sequence_length]`` and its target is
    ``data[i + sequence_length]``.

    Args:
        data: Array-like indexed along its first axis (converted with
            ``np.asarray``).
        sequence_length: Number of consecutive steps in each window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_windows, sequence_length, *data.shape[1:])`` and ``y`` has shape
        ``(n_windows, *data.shape[1:])``. When ``data`` is too short to form a
        single window, both arrays are empty but keep those trailing
        dimensions, so shape-based code such as ``X.shape[2]`` still works.
    """
    data = np.asarray(data)
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # np.array([]) would collapse to shape (0,) and lose the window and
        # feature axes, so the empty result is built explicitly.
        trailing = data.shape[1:]
        return (
            np.empty((0, sequence_length) + trailing, dtype=data.dtype),
            np.empty((0,) + trailing, dtype=data.dtype),
        )

    X = np.array([data[start:start + sequence_length] for start in range(n_windows)])
    y = np.array([data[start + sequence_length] for start in range(n_windows)])
    return X, y


# Build (X, y) window/target pairs for every normalized split, train then test per ticker.
(
    (X_train_apple, y_train_apple), (X_test_apple, y_test_apple),
    (X_train_samsung, y_train_samsung), (X_test_samsung, y_test_samsung),
    (X_train_xiaomi, y_train_xiaomi), (X_test_xiaomi, y_test_xiaomi),
    (X_train_nvidia, y_train_nvidia), (X_test_nvidia, y_test_nvidia),
    (X_train_google, y_train_google), (X_test_google, y_test_google),
    (X_train_amazon, y_train_amazon), (X_test_amazon, y_test_amazon),
) = map(
    create_sequences,
    (
        normalized_train_apple, normalized_test_apple,
        normalized_train_samsung, normalized_test_samsung,
        normalized_train_xiaomi, normalized_test_xiaomi,
        normalized_train_nvidia, normalized_test_nvidia,
        normalized_train_google, normalized_test_google,
        normalized_train_amazon, normalized_test_amazon,
    ),
)


In [60]:
def evaluation(y_test, y_pred, history):
    """Plot the loss curves of a training run and print error metrics.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted values compared against ``y_test``.
        history: Object whose ``history`` mapping holds the per-epoch
            'loss' and 'val_loss' lists.
    """
    # Epochs are numbered from 1 on the x axis.
    epoch_numbers = range(1, len(history.history['loss']) + 1)
    loss_df = pd.DataFrame({
        'Epochs': epoch_numbers,
        'Training Loss': history.history['loss'],
        'Validation Loss': history.history['val_loss'],
    })

    curve_columns = ['Training Loss', 'Validation Loss']
    axis_labels = {'value': 'Loss', 'variable': 'Loss Type'}
    fig = px.line(
        loss_df,
        x='Epochs',
        y=curve_columns,
        labels=axis_labels,
        title='Training and Validation Loss',
    )
    fig.update_layout(legend_title_text='Loss Type')
    fig.show()

    # All three metrics are computed before anything is printed.
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")

## Build a neural network

**LSTM architectures (Long Short Term Memory)** are capable of learning long-term dependencies in sequential data, which makes them well-suited for **time series forecasting**.

In [70]:
def LSTM_model(input_shape, X_train, y_train, X_test, y_test, modelname,
               epochs=50, batch_size=32, validation_split=0.1):
    """Build, train, evaluate and save a two-layer LSTM regressor.

    The network is LSTM(50) -> Dropout(0.2) -> LSTM(50) -> Dropout(0.2) ->
    Dense(1), compiled with mean-squared-error loss and the Adam optimizer.

    Args:
        input_shape: Shape of one input sample, passed to the first LSTM layer.
        X_train: Training inputs.
        y_train: Training targets.
        X_test: Inputs the trained model predicts on.
        y_test: Targets compared with those predictions in ``evaluation``.
        modelname: Prefix of the saved file ``"<modelname>_model.keras"``.
        epochs: Number of training epochs. Defaults to 50.
        batch_size: Mini-batch size. Defaults to 32.
        validation_split: Fraction of the training data held out for
            validation by ``model.fit``. Defaults to 0.1.

    Returns:
        None. The loss plot and metrics are emitted by ``evaluation`` and the
        trained model is written to disk.
    """
    model = Sequential([
        # The first LSTM returns the full sequence so the second can consume it.
        LSTM(50, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')
    history = model.fit(
        X_train, y_train,
        epochs=epochs, batch_size=batch_size,
        validation_split=validation_split, verbose=1
    )

    # Predictions
    # Metrics are computed on y_test/y_pred exactly as passed in; no inverse
    # scaling is applied here.
    y_pred = model.predict(X_test)
    evaluation(y_test, y_pred, history)
    model.save(f"{modelname}_model.keras")


In [71]:
# Train, evaluate and save the model for the Apple series.
LSTM_model(
    input_shape=(X_train_apple.shape[1], X_train_apple.shape[2]),
    X_train=X_train_apple,
    y_train=y_train_apple,
    X_test=X_test_apple,
    y_test=y_test_apple,
    modelname='apple',
)

Epoch 1/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 0.1197 - val_loss: 0.0029
Epoch 2/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0073 - val_loss: 0.0031
Epoch 3/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0049 - val_loss: 0.0026
Epoch 4/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0044 - val_loss: 0.0026
Epoch 5/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.0042 - val_loss: 0.0022
Epoch 6/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.0043 - val_loss: 0.0020
Epoch 7/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0041 - val_loss: 0.0020
Epoch 8/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.0047 - val_loss: 0.0023
Epoch 9/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━

Mean Absolute Error: 0.050958393355072774
Mean Squared Error: 0.0031400630884301953
R2 Score: 0.09204862465638763
