In [None]:
import os, sys

import kaggle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from statsmodels.tsa.seasonal import seasonal_decompose

# Project root: the parent of the directory the notebook is executed from.
rootpath = os.path.dirname(os.getcwd())

# Make the helper modules in <rootpath>/src importable. The membership check
# keeps re-runs of this cell from stacking duplicate entries on sys.path.
module_path = os.path.abspath(os.path.join(rootpath, 'src'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)
from style import plot_params  # in src folder


print(f'rootpath: {rootpath}')
# Name of the Kaggle dataset and the local directory that holds downloaded data.
datasetname = 'world-stock-prices-daily-updating'
datapath = os.path.join(rootpath, 'data')


## Functions

In [None]:
def extract_ticker(df_clean, ticker, requiredrecords=500, datapath=None, write=False):
    """Return the most recent records of one ticker, indexed by calendar date.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame with a datetime 'Date' column and a 'Ticker' column; it is not
        modified.
    ticker : str
        Ticker symbol to select.
    requiredrecords : int, default 500
        Maximum number of rows to keep, counted back from the latest date.
    datapath : str or path-like, optional
        Directory that receives ``clean_sample_<ticker>.csv``. Required when
        ``write`` is True.
    write : bool, default False
        When True, also save the extracted sample as CSV in ``datapath``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``ticker`` without the 'Ticker' column and without rows that
        contain NaN, indexed by 'Date' normalized to midnight and sorted
        newest first.

    Raises
    ------
    TypeError
        If ``ticker`` is not a string, or if ``write`` is True but no
        ``datapath`` was given.
    """
    if not isinstance(ticker, str):
        raise TypeError('ticker must be a string')
    # Fail before doing any work instead of deep inside os.path.join.
    if write and datapath is None:
        raise TypeError('datapath must be provided when write=True')
    print(f'Extracting for ticker: {ticker}')

    # Boolean indexing returns a new frame, so the caller's data stays untouched.
    df_clean_sample = (
        df_clean.loc[df_clean['Ticker'] == ticker]
        .drop(columns=['Ticker'])  # drop unnecessary columns
        .set_index('Date')
    )
    # Strip the time-of-day part so every row is keyed by its calendar date.
    df_clean_sample.index = df_clean_sample.index.normalize()
    # Drop rows with NaNs, then keep only the requested number of latest records.
    df_clean_sample = (
        df_clean_sample
        .dropna()
        .sort_index(ascending=False)
        .head(requiredrecords)
    )
    print(f'Extracted sample shape: {df_clean_sample.shape}')

    if write:
        clean_sample_fpath_full = os.path.join(datapath, f'clean_sample_{ticker}.csv')
        # The dates live in the index, so the index must be written as well.
        df_clean_sample.to_csv(clean_sample_fpath_full)
        print(f'Wrote sample to: {clean_sample_fpath_full}')
    return df_clean_sample

def plot_seasonal_composition(df, feature, period=5):
    """Plot an additive seasonal decomposition of one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index orders the observations (e.g. dates); it is not
        modified.
    feature : str
        Name of the column to decompose.
    period : int, default 5
        Number of observations per seasonal cycle, passed to
        ``seasonal_decompose``.
    """
    series = df[feature]
    # The decomposition is computed over the rows in the order given, so the
    # series has to run oldest -> newest. Sorting handles a newest-first index
    # as well as an arbitrarily shuffled one.
    if not series.index.is_monotonic_increasing:
        series = series.sort_index()
    decomposition = seasonal_decompose(
        series,
        model='additive',
        period=period,
    )
    fig = decomposition.plot()
    fig.set_size_inches(15, 10)

def plot_periodogram(ts, detrend='linear', ax=None):
    """Draw the periodogram of ``ts`` on a log-scaled frequency axis.

    The sampling frequency is fixed at 365 samples per unit, so the frequency
    axis reads in cycles per year when ``ts`` holds one observation per day
    (NOTE(review): the caller is assumed to pass daily data - confirm).

    Parameters
    ----------
    ts : array-like
        Observations to analyse.
    detrend : str, default 'linear'
        Detrending mode forwarded to ``scipy.signal.periodogram``.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure and axes are created when omitted.

    Returns
    -------
    The axes the periodogram was drawn on.
    """
    from scipy.signal import periodogram

    samples_per_year = pd.Timedelta(days=365) / pd.Timedelta(days=1)
    freqs, power = periodogram(
        ts,
        fs=samples_per_year,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )

    # Tick position (cycles per year) -> label shown on the x axis.
    tick_labels = {
        1: "Annual (1)",
        2: "Semiannual (2)",
        4: "Quarterly (4)",
        6: "Bimonthly (6)",
        12: "Monthly (12)",
        26: "Biweekly (26)",
        52: "Weekly (52)",
        104: "Semiweekly (104)",
    }

    if ax is None:
        ax = plt.subplots()[1]
    ax.step(freqs, power, color="purple")
    ax.set_xscale("log")
    ax.set_xticks(list(tick_labels.keys()))
    ax.set_xticklabels(list(tick_labels.values()), rotation=30)
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

## Data Preparation

In [None]:
# Download the Kaggle dataset once; later runs reuse the local copy.
os.makedirs(datapath, exist_ok=True)
# The existence check below and the loading cell both look in
# <datapath>/<datasetname>, so the archive has to be unpacked into that folder
# rather than directly into <datapath>.
dataset_dir = os.path.join(datapath, datasetname)
if not os.path.exists(dataset_dir):
    kaggle.api.dataset_download_files(
        dataset=f'nelgiriyewithana/{datasetname}',
        path=dataset_dir,
        unzip=True)
else:
    print('Raw data already found in location {}'.format(datapath))

In [None]:
# Locate the raw CSV inside the dataset folder. os.listdir() returns entries in
# arbitrary order and may contain non-data files, so pick the CSV explicitly
# (first one alphabetically) and fail with a clear message when there is none.
raw_dir = os.path.join(datapath, datasetname)
csv_names = sorted(
    name for name in os.listdir(raw_dir) if name.lower().endswith('.csv')
)
if not csv_names:
    raise FileNotFoundError(f'No CSV file found in {raw_dir}')
raw_fpath = csv_names[0]
raw_fpath_full = os.path.join(raw_dir, raw_fpath)

print(f'reading raw data from: {raw_fpath_full}')
df_raw = pd.read_csv(raw_fpath_full)
df_raw.head()

In [None]:
# Preview the first five rows of the raw frame.
df_raw.head(n=5)

In [None]:
# Build the cleaned frame from the raw one without touching df_raw:
#   1. parse 'Date' as UTC and strip the timezone (naive timestamps),
#   2. keep the first row of every (Date, Ticker) pair,
#   3. keep only the price/volume columns plus the ticker.
df_clean = (
    df_raw
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], utc=True).dt.tz_convert(None))
    .drop_duplicates(subset=['Date', 'Ticker'], keep='first')
    .loc[:, ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']]
)
df_clean.head()

## Example Ticker 1

In [None]:
# Extract the latest 500 AAPL records and derive the base features:
#   rel_return  - open-to-close change relative to the open
#   volatility  - high-low range relative to the high/low midpoint
#   dollar_vol  - volume times the mean of open and close
# The raw price/volume inputs other than 'Close' are dropped afterwards.
df = (
    extract_ticker(df_clean, 'AAPL', requiredrecords=500, write=False)  # datapath=datapath
    .assign(
        rel_return=lambda d: (d['Close'] - d['Open']) / d['Open'],
        volatility=lambda d: (d['High'] - d['Low']) / ((d['High'] + d['Low']) / 2),
        dollar_vol=lambda d: d['Volume'] * ((d['Open'] + d['Close']) / 2),
    )
    .drop(columns=['Open', 'High', 'Low', 'Volume'])
)
df.head(10)

### Analysis of Time Series Components

In [None]:
# Additive seasonal decomposition of the closing price
plot_seasonal_composition(df, feature='Close')

In [None]:
# Periodogram of the closing price after removing a linear trend
plot_periodogram(ts=df['Close'], detrend='linear')

In [None]:
# Fit an order-3 polynomial time trend to the closing price
y = df['Close'].copy()
dp = DeterministicProcess(index=y.index, order=3)

# Trend features for the dates in y.index
X = dp.in_sample()
model = LinearRegression().fit(X, y)

# Fitted trend, aligned to the same dates as y
y_pred = pd.Series(model.predict(X), index=X.index)

In [None]:
# Overlay the fitted trend on the closing price. The axes returned by the first
# plot is passed on explicitly, so the overlay does not depend on pyplot's
# implicit "current axes" state.
ax = df['Close'].plot(**plot_params)
y_pred.plot(ax=ax, **plot_params, alpha=0.5)
ax.set_title('Close price and fitted trend')
ax.set_ylabel('Close');

In [None]:
# Residuals: the closing price with the fitted trend subtracted
y_resid = y.sub(y_pred)
y_resid.plot()

In [None]:
# Periodogram of the detrended residuals (only the mean is removed here)
plot_periodogram(y_resid, detrend='constant')

### Feature Engineering

In [None]:
# The closing price itself is not kept as a feature. Rebinding with
# errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Close'], errors='ignore')

In [None]:
# Chronological train/test split: the date at the 80% position of the sorted
# dates is the cut; rows before it are training data, rows from it on are test.
all_days = pd.Series(df.index.sort_values().values)
split_pos = int(len(all_days) * 0.8)
split_date = all_days.iloc[split_pos]
train_df = df.loc[df.index < split_date].copy()
test_df = df.loc[df.index >= split_date].copy()
print(f'Train shape: {train_df.shape}, Test shape: {test_df.shape}')