<a href="https://colab.research.google.com/github/Saifullah785/deep-learning-projects/blob/main/Project_03_forecasting_financial_market/Project_03_forecasting_financial_market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Stock Market Forecasting with ARIMA, SARIMA, SARIMAX | A Complete Project A-Z**

# **Stock Market Data Scraping in Python**


In [23]:
# import Libraries
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import yfinance as yf
import datetime as dt
from datetime import datetime, timedelta
import plotly.graph_objects as go
import plotly.express as px
#

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA

In [24]:
# Display today's date (datetime.date.today()) as a quick sanity check of the clock
# that the date window for the download is derived from
dt.date.today()

datetime.date(2025, 7, 8)

In [25]:
# Define the date window to fetch: the LOOKBACK_DAYS calendar days ending today.
# Both bounds are kept as "YYYY-MM-DD" strings (start_date / end_date).
LOOKBACK_DAYS = 365  # length of the history window, in calendar days

# Read the clock once so both ends of the window come from the same day
# (two separate today() calls could straddle midnight and disagree).
today = dt.date.today()
d1 = today.strftime("%Y-%m-%d")
end_date = d1
d2 = (today - timedelta(days=LOOKBACK_DAYS)).strftime("%Y-%m-%d")
start_date = d2
# Print the date range for the data
print("Your data will be between these dates:",start_date, end_date)

Your data will be between these dates: 2024-07-08 2025-07-08


In [26]:
# Define the ticker symbol for the stock (another symbol, e.g. 'DX-Y.NYB', can be used instead)
ticker = 'GOOGL' #DX-Y.NYB

# Download historical stock data for [start_date, end_date) using yfinance.
# auto_adjust=True is passed explicitly: the recorded run warned that this is now the
# library default, so stating it pins the same behaviour regardless of the installed
# default and should also avoid the default-change warning.
df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True, progress=False)
# Display the first few rows of the DataFrame
df.head()


YF.download() has changed argument auto_adjust default to True



Price,Close,High,Low,Open,Volume
Ticker,GOOGL,GOOGL,GOOGL,GOOGL,GOOGL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-07-08,188.118744,189.253248,186.87477,188.984545,21035900
2024-07-09,188.06897,190.437501,187.810229,189.39256,15121400
2024-07-10,190.258377,190.825637,188.118748,188.238164,15952500
2024-07-11,184.67543,189.939922,184.187787,188.934796,25625800
2024-07-12,184.177841,186.208,183.600635,184.187788,22898400


In [27]:
# Summarise the freshly downloaded frame: index type and range, column labels,
# non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 250 entries, 2024-07-08 to 2025-07-07
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   (Close, GOOGL)   250 non-null    float64
 1   (High, GOOGL)    250 non-null    float64
 2   (Low, GOOGL)     250 non-null    float64
 3   (Open, GOOGL)    250 non-null    float64
 4   (Volume, GOOGL)  250 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 11.7 KB


In [28]:
# Alternative kept for reference (disabled): insert the Date index as the first column
# df.insert(0, 'Date', df.index, True)

In [29]:
# Copy the DatetimeIndex into a 'Date' column, then switch to a default integer index.
# The isinstance guard makes the cell safe to re-run: once the index has been reset,
# executing the two statements again would overwrite 'Date' with the integers 0..n-1.
if isinstance(df.index, pd.DatetimeIndex):
    # Add the Date index as a new column
    df['Date'] = df.index
    # Drop the old index (its values now live in 'Date') and renumber the rows from 0
    df = df.reset_index(drop=True)

In [30]:
# Re-check the frame's structure after the previous step: 'Date' should now appear as a
# datetime column and the index should be an integer range
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   (Close, GOOGL)   250 non-null    float64       
 1   (High, GOOGL)    250 non-null    float64       
 2   (Low, GOOGL)     250 non-null    float64       
 3   (Open, GOOGL)    250 non-null    float64       
 4   (Volume, GOOGL)  250 non-null    int64         
 5   (Date, )         250 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 11.8 KB


In [31]:
# Preview the first rows to confirm the new 'Date' column and the integer index
df.head()

Price,Close,High,Low,Open,Volume,Date
Ticker,GOOGL,GOOGL,GOOGL,GOOGL,GOOGL,Unnamed: 6_level_1
0,188.118744,189.253248,186.87477,188.984545,21035900,2024-07-08
1,188.06897,190.437501,187.810229,189.39256,15121400,2024-07-09
2,190.258377,190.825637,188.118748,188.238164,15952500,2024-07-10
3,184.67543,189.939922,184.187787,188.934796,25625800,2024-07-11
4,184.177841,186.208,183.600635,184.187788,22898400,2024-07-12


In [32]:
# Inspect the column labels; at this point they are a two-level MultiIndex
# (levels named 'Price' and 'Ticker'), which the next step flattens
df.columns

MultiIndex([( 'Close', 'GOOGL'),
            (  'High', 'GOOGL'),
            (   'Low', 'GOOGL'),
            (  'Open', 'GOOGL'),
            ('Volume', 'GOOGL'),
            (  'Date',      '')],
           names=['Price', 'Ticker'])

In [33]:
# Collapse every multi-level column label into a single string by concatenating its
# levels and trimming surrounding whitespace,
# e.g. ('Close', 'GOOGL') -> 'CloseGOOGL' and ('Date', '') -> 'Date'
df.columns = [label.strip() for label in map(''.join, df.columns.values)]

In [34]:
# Confirm the flattening: column labels should now be plain strings such as 'CloseGOOGL',
# with dtypes and non-null counts unchanged
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   CloseGOOGL   250 non-null    float64       
 1   HighGOOGL    250 non-null    float64       
 2   LowGOOGL     250 non-null    float64       
 3   OpenGOOGL    250 non-null    float64       
 4   VolumeGOOGL  250 non-null    int64         
 5   Date         250 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 11.8 KB


In [35]:
# Preview the first rows under the flattened, single-level column names
df.head()

Unnamed: 0,CloseGOOGL,HighGOOGL,LowGOOGL,OpenGOOGL,VolumeGOOGL,Date
0,188.118744,189.253248,186.87477,188.984545,21035900,2024-07-08
1,188.06897,190.437501,187.810229,189.39256,15121400,2024-07-09
2,190.258377,190.825637,188.118748,188.238164,15952500,2024-07-10
3,184.67543,189.939922,184.187787,188.934796,25625800,2024-07-11
4,184.177841,186.208,183.600635,184.187788,22898400,2024-07-12


In [36]:
# Select only the date and the closing-price columns.
# After the flattening step the close column is named 'Close' + symbol, so the name is
# built from `ticker` instead of hard-coding one symbol
# (assumes yfinance labels the column with the symbol exactly as passed — verify if the
# ticker is changed).
close_col = f'Close{ticker}'
# .copy() makes the result an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning
df = df[['Date', close_col]].copy()
# Display the first few rows of the filtered DataFrame
df.head()

Unnamed: 0,Date,CloseGOOGL
0,2024-07-08,188.118744
1,2024-07-09,188.06897
2,2024-07-10,190.258377
3,2024-07-11,184.67543
4,2024-07-12,184.177841


In [37]:
# Display the shape as (rows, columns); two columns are expected after the selection above
df.shape

(250, 2)

In [38]:
# Create a line plot of the closing price over time using Plotly Express.
# The y column is derived from `ticker` ('Close' + symbol, the name produced by the
# column-flattening step) rather than hard-coded to one symbol.
fig = px.line(df, x='Date', y=f'Close{ticker}', title='Time Series Data')
# Display the plot
fig.show()