In [2]:
import pandas as pd
csv_file_path = "csv_files/AAPL.csv"

df = pd.read_csv(csv_file_path)
print(df)

            Date       Close        High         Low        Open      Volume
0     2005-01-03    0.952312    0.979698    0.941930    0.974732   691992000
1     2005-01-04    0.962092    0.985113    0.947496    0.959835  1096810400
2     2005-01-05    0.970519    0.981804    0.963748    0.969916   680433600
3     2005-01-06    0.971271    0.976688    0.952914    0.973076   705555200
4     2005-01-07    1.041991    1.047709    0.974280    0.978042  2227450400
...          ...         ...         ...         ...         ...         ...
5082  2025-03-17  214.000000  215.220001  209.970001  213.309998    48073400
5083  2025-03-18  212.690002  215.149994  211.490005  214.160004    42432400
5084  2025-03-19  215.240005  218.759995  213.750000  214.220001    54385400
5085  2025-03-20  214.100006  217.490005  212.220001  213.990005    48862900
5086  2025-03-21  218.270004  218.839996  211.279999  211.559998    93954500

[5087 rows x 6 columns]


In [3]:
df.describe()

Unnamed: 0,Close,High,Low,Open,Volume
count,5087.0,5087.0,5087.0,5087.0,5087.0
mean,53.360119,53.888072,52.777897,53.319053,382341500.0
std,65.626727,66.25569,64.916162,65.559166,395275300.0
min,0.952312,0.976688,0.94193,0.959835,23234700.0
25%,6.170992,6.235994,6.095307,6.180172,97918500.0
50%,22.578592,22.780177,22.361551,22.576409,222731600.0
75%,70.200199,71.221916,69.363726,69.910319,550701200.0
max,258.735504,259.814335,257.347047,257.906429,3372970000.0


In [4]:
import plotly.express as pr

df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = (df['Date'].dt.year // 5) * 5

print(df)

           Date       Close        High  ...        Open      Volume  Year
0    2005-01-03    0.952312    0.979698  ...    0.974732   691992000  2005
1    2005-01-04    0.962092    0.985113  ...    0.959835  1096810400  2005
2    2005-01-05    0.970519    0.981804  ...    0.969916   680433600  2005
3    2005-01-06    0.971271    0.976688  ...    0.973076   705555200  2005
4    2005-01-07    1.041991    1.047709  ...    0.978042  2227450400  2005
...         ...         ...         ...  ...         ...         ...   ...
5082 2025-03-17  214.000000  215.220001  ...  213.309998    48073400  2025
5083 2025-03-18  212.690002  215.149994  ...  214.160004    42432400  2025
5084 2025-03-19  215.240005  218.759995  ...  214.220001    54385400  2025
5085 2025-03-20  214.100006  217.490005  ...  213.990005    48862900  2025
5086 2025-03-21  218.270004  218.839996  ...  211.559998    93954500  2025

[5087 rows x 7 columns]


In [5]:
fig_zero = pr.box(df, x = 'Year', y = 'Close', 
             title = 'Historical Stock Growth of AAPL', 
             labels = {'Year' : 'Five-Year Periods', 'Close' : 'Price'}, 
             color = 'Year')

fig_zero.show()

In [6]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_year_based = df[df['Date'].dt.year == 2024]
df_year_based.drop('Year', axis = 1, inplace = True)

cols = ['Open', 'High', 'Low', 'Close']
fig = make_subplots(rows = 4, cols = 1, subplot_titles = cols)
for i, col in enumerate(cols):
    fig.add_trace(go.Scatter(x = df_year_based['Date'], y = df[col], mode = 'lines+markers', name = col), row = i + 1, col = 1)
    fig.update_layout(
        title = '2024 AAPL Trends',
        xaxis_title = 'Month',
        yaxis_title = 'Price',
        height = 1600,
        showlegend = False
    )
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [7]:
volume_df = df_year_based[['Date', 'Volume']]
fig = go.Figure()
fig.add_trace(go.Scatter(x = volume_df['Date'], y = volume_df['Volume'], mode = 'lines', fill = 'tozeroy', name = 'Stocks Density'))
fig.update_layout(
    title = "Volumes of AAPL",
    xaxis_title = 'Date',
    yaxis_title = 'Stocks Volume'
)
fig.show()

In [8]:
from statsmodels.tsa.stattools import adfuller
import numpy as np

test_df = df.copy()
test_df.set_index('Date', inplace = True)
test_df.drop(columns = ['Year'], inplace = True)
print(test_df)

# Before Log-Transformation
cols = ['Close', 'High', 'Low', 'Open']
for col in cols:
    res = adfuller(test_df[col])
    print(res[1])

# After Log-Transformation
log_df = np.log(test_df[cols]).diff()
test_df[cols] = log_df
test_df.dropna(inplace = True)
for col in cols:
    res = adfuller(test_df[col], regression = 'c')
    print(res[1])

                 Close        High         Low        Open      Volume
Date                                                                  
2005-01-03    0.952312    0.979698    0.941930    0.974732   691992000
2005-01-04    0.962092    0.985113    0.947496    0.959835  1096810400
2005-01-05    0.970519    0.981804    0.963748    0.969916   680433600
2005-01-06    0.971271    0.976688    0.952914    0.973076   705555200
2005-01-07    1.041991    1.047709    0.974280    0.978042  2227450400
...                ...         ...         ...         ...         ...
2025-03-17  214.000000  215.220001  209.970001  213.309998    48073400
2025-03-18  212.690002  215.149994  211.490005  214.160004    42432400
2025-03-19  215.240005  218.759995  213.750000  214.220001    54385400
2025-03-20  214.100006  217.490005  212.220001  213.990005    48862900
2025-03-21  218.270004  218.839996  211.279999  211.559998    93954500

[5087 rows x 5 columns]
0.9976569602580203
0.9947947963050687
0.996840499109

In [9]:
test_df.index = pd.to_datetime(test_df.index)
test_df_year = test_df[test_df.index.year == 2024]

cols = ['Close', 'High', 'Low', 'Open']
fig = make_subplots(rows = 4, cols = 1, subplot_titles = cols)
for i, col in enumerate(cols):
    fig.add_trace(go.Scatter(x = test_df_year.index, y = test_df_year[col], mode = 'lines+markers', name = col), row = i + 1, col = 1)
    fig.update_layout(
        title = '2024 AAPL Transformed Trends',
        xaxis_title = 'Month',
        yaxis_title = 'Price',
        height = 1600,
        showlegend = False
    )
fig.show()

In [10]:
# Daily ACF and PACF Plots
from statsmodels.tsa.stattools import acf, pacf

lags = 40

acf_values = acf(test_df['Close'], nlags = lags)
pacf_values = pacf(test_df['Close'], nlags = lags)

fig = make_subplots(rows = 2, cols = 1, subplot_titles = ['Autocorrelation Function', 'Partial Autocorrelation Function'])
fig.add_trace(go.Bar(x = list(range(lags + 1)) , y = acf_values, marker_color = 'blue'), row = 1, col = 1)
fig.add_trace(go.Bar(x = list(range(lags + 1)), y = pacf_values, marker_color = 'red'), row = 2, col = 1)
fig.show()

p, d, q = (4, 1, 4)

In [11]:
# Seasonal ACF and PACF Plots
from statsmodels.tsa.stattools import acf, pacf

lags = 40

acf_values = acf(test_df['Close'].diff(252).dropna(), nlags = lags)
pacf_values = pacf(test_df['Close'].diff(252).dropna(), nlags = lags)

fig = make_subplots(rows = 2, cols = 1, subplot_titles = ['Autocorrelation Function', 'Partial Autocorrelation Function'])
fig.add_trace(go.Bar(x = list(range(lags + 1)) , y = acf_values, marker_color = 'blue'), row = 1, col = 1)
fig.add_trace(go.Bar(x = list(range(lags + 1)), y = pacf_values, marker_color = 'red'), row = 2, col = 1)
fig.show()

P, D, Q, s = (4, 1, 4, 252)

In [None]:
import pmdarima as pm

model = pm.auto_arima(test_df['Close'], seasonal = True, m = 12, trace = True, suppress_warnings = True, stepwise = True)
print(model.summary())

# To find the optimal values for order_parameters and seasonal_parameters

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(1,0,1)[12] intercept   : AIC=-25232.352, Time=11.82 sec
 ARIMA(0,0,0)(0,0,0)[12] intercept   : AIC=-25235.493, Time=0.46 sec
 ARIMA(1,0,0)(1,0,0)[12] intercept   : AIC=-25238.645, Time=1.94 sec
 ARIMA(0,0,1)(0,0,1)[12] intercept   : AIC=-25238.710, Time=2.14 sec
 ARIMA(0,0,0)(0,0,0)[12]             : AIC=-25223.334, Time=0.15 sec
 ARIMA(0,0,1)(0,0,0)[12] intercept   : AIC=-25236.470, Time=0.26 sec
 ARIMA(0,0,1)(1,0,1)[12] intercept   : AIC=-25236.869, Time=1.95 sec
 ARIMA(0,0,1)(0,0,2)[12] intercept   : AIC=-25236.374, Time=6.34 sec
 ARIMA(0,0,1)(1,0,0)[12] intercept   : AIC=-25238.712, Time=1.80 sec
 ARIMA(0,0,1)(2,0,0)[12] intercept   : AIC=-25236.767, Time=4.49 sec
 ARIMA(0,0,1)(2,0,1)[12] intercept   : AIC=-25234.829, Time=3.28 sec
 ARIMA(0,0,0)(1,0,0)[12] intercept   : AIC=-25237.820, Time=1.18 sec
 ARIMA(1,0,1)(1,0,0)[12] intercept   : AIC=-25236.206, Time=2.79 sec
 ARIMA(0,0,2)(1,0,0)[12] intercept   : AIC=-25238.112, Time

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

order = (0, 0, 1)
seasonal = (1, 0, [], 12)
new_close_model = SARIMAX(test_df['Close'], order = order, seasonal_order = seasonal)
results = new_close_model.fit()
print(results.summary())


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



                                      SARIMAX Results                                      
Dep. Variable:                               Close   No. Observations:                 5086
Model:             SARIMAX(0, 0, 1)x(1, 0, [], 12)   Log Likelihood               12616.394
Date:                             Wed, 26 Mar 2025   AIC                         -25226.788
Time:                                     10:51:15   BIC                         -25207.186
Sample:                                          0   HQIC                        -25219.924
                                            - 5086                                         
Covariance Type:                               opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1         -0.0213      0.009     -2.383      0.017      -0.039      -0.004
ar.S.L12       0.0318      

In [14]:
new_high_model = SARIMAX(test_df['High'], order = order, seasonal_order = seasonal)
results = new_high_model.fit()
print(results.summary())


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



                                      SARIMAX Results                                      
Dep. Variable:                                High   No. Observations:                 5086
Model:             SARIMAX(0, 0, 1)x(1, 0, [], 12)   Log Likelihood               13510.349
Date:                             Wed, 26 Mar 2025   AIC                         -27014.697
Time:                                     10:54:00   BIC                         -26995.094
Sample:                                          0   HQIC                        -27007.833
                                            - 5086                                         
Covariance Type:                               opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1          0.1347      0.010     13.261      0.000       0.115       0.155
ar.S.L12       0.0302      