In [84]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

from statsmodels.tsa.seasonal import STL


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2019-2024-us-stock-market-data/Stock Market Dataset.csv


# Step 1. Data Import & EDA

## Data Loading & Initial Data Cleansing

In [85]:
# Read the raw stock-market CSV into the working frame used by every later cell.
dataset_csv = "/kaggle/input/2019-2024-us-stock-market-data/Stock Market Dataset.csv"
df = pd.read_csv(dataset_csv)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,0,02-02-2024,2.079,,72.28,,3.8215,,43194.7,42650.0,...,589498,10580.0,564.64,4030000.0,171.81,117220000.0,474.99,84710000.0,2053.7,
1,1,01-02-2024,2.05,161340.0,73.82,577940.0,3.8535,,43081.4,47690.0,...,581600,9780.0,567.51,3150000.0,159.28,66360000.0,394.78,25140000.0,2071.1,260920.0
2,2,31-01-2024,2.1,142860.0,75.85,344490.0,3.906,,42580.5,56480.0,...,578020,9720.0,564.11,4830000.0,155.2,49690000.0,390.14,20010000.0,2067.4,238370.0
3,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.911,,42946.2,55130.0,...,584680,9750.0,562.85,6120000.0,159.0,42290000.0,400.06,18610000.0,2050.9,214590.0
4,4,29-01-2024,2.49,3590.0,76.78,331930.0,3.879,,43299.8,45230.0,...,578800,13850.0,575.79,6880000.0,161.26,42840000.0,401.02,17790000.0,2034.9,1780.0


Unnamed: 0.1,Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,0,02-02-2024,2.079,,72.28,,3.8215,,43194.7,42650.0,...,589498,10580.0,564.64,4030000.0,171.81,117220000.0,474.99,84710000.0,2053.7,
1,1,01-02-2024,2.05,161340.0,73.82,577940.0,3.8535,,43081.4,47690.0,...,581600,9780.0,567.51,3150000.0,159.28,66360000.0,394.78,25140000.0,2071.1,260920.0
2,2,31-01-2024,2.1,142860.0,75.85,344490.0,3.906,,42580.5,56480.0,...,578020,9720.0,564.11,4830000.0,155.2,49690000.0,390.14,20010000.0,2067.4,238370.0
3,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.911,,42946.2,55130.0,...,584680,9750.0,562.85,6120000.0,159.0,42290000.0,400.06,18610000.0,2050.9,214590.0
4,4,29-01-2024,2.49,3590.0,76.78,331930.0,3.879,,43299.8,45230.0,...,578800,13850.0,575.79,6880000.0,161.26,42840000.0,401.02,17790000.0,2034.9,1780.0


In [None]:
# Print the per-column summary (dtype and non-null count) of the freshly loaded
# frame, to spot columns that were not parsed as numbers.
df.info()

### Check the formatting of the non-numeric columns

In [None]:
# Show the first rows of the columns singled out for a formatting check.
columns_to_inspect = [
    'Date',
    'Bitcoin_Price',
    'Platinum_Price',
    'Ethereum_Price',
    'S&P_500_Price',
    'Nasdaq_100_Price',
    'Berkshire_Price',
    'Gold_Price',
]
df.loc[:, columns_to_inspect].head()

In [None]:
# Price columns whose values must have commas stripped before they can be cast
# to float. Kept in ONE list so the cleaning loop and the cast cannot drift apart.
_comma_formatted_price_cols = [
    'Bitcoin_Price',
    'Platinum_Price',
    'Ethereum_Price',
    'S&P_500_Price',
    'Nasdaq_100_Price',
    'Berkshire_Price',
    'Gold_Price',
]

for col in _comma_formatted_price_cols:
    # Only text columns expose the `.str` accessor. Skipping columns that are
    # already numeric makes this cell safe to re-run after the cast below
    # (previously a second run raised AttributeError).
    if not pd.api.types.is_numeric_dtype(df[col]):
        # regex=False: the comma is a literal character, not a pattern.
        df[col] = df[col].str.replace(',', '', regex=False)

df = df.astype({col: 'float64' for col in _comma_formatted_price_cols})

In [None]:
# Print the per-column summary again, after the price columns were cast to
# float64, to confirm the new dtypes.
df.info()

## Data Grouping

In [None]:
# Display every column label; serves as the reference for the hand-written
# price/volume group lists defined in the next cell.
df.columns

In [None]:
_commodity_price = ['Natural_Gas_Price', 'Crude_oil_Price', 'Copper_Price', 'Platinum_Price', 'Silver_Price', 'Gold_Price']
_commodity_vol   = ['Natural_Gas_Vol.', 'Crude_oil_Vol.', 'Copper_Vol.', 'Platinum_Vol.', 'Silver_Vol.', 'Gold_Vol.']
_crypto_price    = ['Bitcoin_Price', 'Ethereum_Price']
_crypto_vol      = ['Bitcoin_Vol.', 'Ethereum_Vol.']
_index_price     = ['S&P_500_Price', 'Nasdaq_100_Price', 'Berkshire_Price']
_index_vol       = ['Nasdaq_100_Vol.', 'Berkshire_Vol.']
_techs_price     = ['Apple_Price', 'Tesla_Price', 'Microsoft_Price', 'Google_Price', 'Nvidia_Price', 'Netflix_Price', 'Amazon_Price', 'Meta_Price']
_techs_vol       = ['Apple_Vol.', 'Tesla_Vol.', 'Microsoft_Vol.', 'Google_Vol.', 'Nvidia_Vol.', 'Netflix_Vol.', 'Amazon_Vol.', 'Meta_Vol.']

### `Date`

In [None]:
# Parse the day-month-year text in "Date" into datetime values.
parsed_dates = pd.to_datetime(df["Date"], format='%d-%m-%Y')
df["Date"] = parsed_dates

In [None]:
# Index the frame by date, in chronological order, and discard the leftover
# positional-index column(s) that pandas names "Unnamed: ...".
# The columns are dropped BY NAME rather than by position (`df.columns[0]`), so
# that (a) every such column is removed if there is more than one, and
# (b) a real data column is never dropped when none is present.
_leftover_index_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
df = (
    df.drop(columns=_leftover_index_cols)
    .set_index("Date")
    .sort_index()
)

### `Commodity`

In [None]:
# Summary statistics for the commodity price columns.
commodity_prices = df.loc[:, _commodity_price]
commodity_prices.describe(include='all')

In [None]:
# Range-selector presets for the x-axis: (count, label, step, stepmode).
price_zoom_presets = [
    (1, "1m", "month", "backward"),
    (3, "3m", "month", "backward"),
    (6, "6m", "month", "backward"),
    (1, "YTD", "year", "todate"),
    (1, "1y", "year", "backward"),
]
price_zoom_buttons = [
    {"count": count, "label": label, "step": step, "stepmode": stepmode}
    for count, label, step, stepmode in price_zoom_presets
]
# Last button: step="all", with no count, label or stepmode.
price_zoom_buttons.append({"step": "all"})

# One line per commodity price column, plotted against the frame's index.
fig = px.line(
    df,
    y=_commodity_price,
    title='Commodity Price Time Series',
    width=1000,
    height=400,
)
fig.update_xaxes(rangeselector={"buttons": price_zoom_buttons})
fig.show()

In [None]:
# Summary statistics for the commodity volume columns.
commodity_volumes = df.loc[:, _commodity_vol]
commodity_volumes.describe(include='all')

In [None]:
# One line per commodity volume column, plotted against the frame's index.
fig = px.line(
    df,
    y=_commodity_vol,
    title='Commodity Vol. Time Series',
    width=1000,
    height=400,
)

# Range-selector buttons for the x-axis, listed in display order.
vol_zoom_buttons = [
    {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
    {"count": 3, "label": "3m", "step": "month", "stepmode": "backward"},
    {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
    {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
    {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
    {"step": "all"},
]
fig.update_xaxes(rangeselector={"buttons": vol_zoom_buttons})
fig.show()

In [None]:
# Boolean row mask (not a frame, despite the name): True for index values
# between 2020-08-01 and 2023-12-01, both bounds inclusive.
df_platinum_subset = (
    (df.index <= '2023-12-01')
    & (df.index >= '2020-08-01')
)

In [None]:
# Plot platinum volume inside the selected date window, with gaps filled by the mean.
# NOTE(review): the fill value is the mean of the FULL series, not of the
# plotted window; confirm that is intended.
platinum_vol = df["Platinum_Vol."]
plt.plot(platinum_vol[df_platinum_subset].fillna(platinum_vol.mean()))

In [None]:
# Plot the whole platinum volume series with gaps filled by order-1 polynomial interpolation.
platinum_vol_interpolated = df["Platinum_Vol."].interpolate(method='polynomial', order=1)
plt.plot(platinum_vol_interpolated)

In [None]:
# STL decomposition of platinum volume over the selected date window.
# Missing values are replaced with the mean of the full series first.
platinum_vol = df["Platinum_Vol."]
platinum_vol_window = platinum_vol[df_platinum_subset].fillna(platinum_vol.mean())

# NOTE(review): period=91 is hard-coded; presumably about one quarter of
# observations. Confirm it matches the sampling frequency of the rows.
stl_object = STL(platinum_vol_window, period=91).fit()
stl_object.plot()
plt.show()