In [1]:
import os
from datetime import date

import kaggle
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from statsmodels.tsa.api import SimpleExpSmoothing

# Project root is taken to be two directory levels above the current working directory.
cwd_parent = os.path.dirname(os.getcwd())
rootpath = os.path.dirname(cwd_parent)
print(f'rootpath: {rootpath}')

rootpath: /home/onur/WORK/DS/repos/TimeSeries/stocks_forecasting


In [2]:
datasetname = 'world-stock-prices-daily-updating'
datapath = os.path.join(rootpath, 'data')
dataset_dir = os.path.join(datapath, datasetname)

# Download only when the dataset folder itself is missing or empty.
# Guarding on the parent `data` folder alone would skip the download whenever
# `data/` already exists (e.g. it holds another dataset), and the loading cell
# would then fail on a missing directory.
if not os.path.isdir(dataset_dir) or not os.listdir(dataset_dir):
    os.makedirs(datapath, exist_ok=True)

    dlcli = kaggle.api.dataset_download_cli(
        dataset=f'nelgiriyewithana/{datasetname}',
        unzip=True,
        path=dataset_dir)

In [3]:
# Locate the raw CSV inside the dataset folder.
# os.listdir() returns entries in arbitrary order and may include non-CSV
# files, so filter by extension and sort to make the choice deterministic.
dataset_dir = os.path.join(datapath, datasetname)
csv_files = sorted(
    fname for fname in os.listdir(dataset_dir)
    if fname.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError(f'No CSV file found in {dataset_dir}')

raw_fpath = csv_files[0]
raw_fpath_full = os.path.join(dataset_dir, raw_fpath)

df_raw = pd.read_csv(raw_fpath_full)
df_raw.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Brand_Name,Ticker,Industry_Tag,Country,Dividends,Stock Splits,Capital Gains
0,2025-07-03 00:00:00-04:00,6.63,6.74,6.615,6.64,4209664.0,peloton,PTON,fitness,usa,0.0,0.0,
1,2025-07-03 00:00:00-04:00,106.75,108.370003,106.330101,107.339996,560190.0,crocs,CROX,footwear,usa,0.0,0.0,
2,2025-07-03 00:00:00-04:00,122.629997,123.050003,121.550003,121.93,36600.0,adidas,ADDYY,apparel,germany,0.0,0.0,
3,2025-07-03 00:00:00-04:00,221.705002,224.009995,221.360001,223.410004,29295154.0,amazon,AMZN,e-commerce,usa,0.0,0.0,
4,2025-07-03 00:00:00-04:00,212.145004,214.649994,211.810104,213.550003,34697317.0,apple,AAPL,technology,usa,0.0,0.0,


In [4]:
# Reduce the timestamp strings to plain calendar dates: parse as UTC, drop the
# timezone, then keep only the date part.
# NOTE(review): the date is taken after conversion to UTC, so a timestamp with a
# positive UTC offset at local midnight would land on the previous day — confirm
# that every row in the raw file carries a non-positive offset.
date_only = pd.to_datetime(df_raw['Date'], utc=True).dt.tz_convert(None).dt.date

# One row per (Date, Ticker), first occurrence wins; keep only the price/volume columns.
keep_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']
df_clean = (
    df_raw
    .assign(Date=date_only)
    .drop_duplicates(subset=['Date', 'Ticker'], keep='first')
    .loc[:, keep_cols]
)
df_clean.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker
0,2025-07-03,6.63,6.74,6.615,6.64,4209664.0,PTON
1,2025-07-03,106.75,108.370003,106.330101,107.339996,560190.0,CROX
2,2025-07-03,122.629997,123.050003,121.550003,121.93,36600.0,ADDYY
3,2025-07-03,221.705002,224.009995,221.360001,223.410004,29295154.0,AMZN
4,2025-07-03,212.145004,214.649994,211.810104,213.550003,34697317.0,AAPL


In [18]:
# Restrict the cleaned data to two tickers and to dates from 2023-01-01 onwards.
sample_tickers = ['AAPL', 'AMZN']
sample_start = date(2023, 1, 1)

ticker_mask = df_clean['Ticker'].isin(sample_tickers)
recent_mask = df_clean['Date'] >= sample_start
df_clean_sample = df_clean[ticker_mask & recent_mask].copy()

# Convert the python `date` objects back to pandas datetimes so `.dt` accessors work downstream.
df_clean_sample['Date'] = pd.to_datetime(df_clean_sample['Date'])
print(f'sample shape: {df_clean_sample.shape}')
df_clean_sample.sort_values('Date', ascending=True).head()

# Optional export of the sample (disabled):
# clean_sample_fpath_full = os.path.join(datapath, datasetname, 'clean_sample.csv')
# df_clean_sample.to_csv(clean_sample_fpath_full, index=False)


sample shape: (1242, 7)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker
41442,2023-01-03,85.459999,86.959999,84.209999,85.82,76706000.0,AMZN
41430,2023-01-03,129.726523,130.343884,123.64248,124.538658,112117500.0,AAPL
41348,2023-01-04,86.550003,86.980003,83.360001,85.139999,68885100.0,AMZN
41363,2023-01-04,126.350936,128.113421,124.548628,125.823189,89113600.0,AAPL
41294,2023-01-05,85.330002,85.419998,83.07,83.120003,67930800.0,AMZN


In [19]:
# 0. define target variable
target = 'returns'

# 1. Sort by ticker/date and compute base features:
#    returns    = intraday open-to-close return
#    volatility = high-low range relative to the high/low midpoint
#    dollar_vol = volume times the open/close midpoint price
# (disabled alternative based on an 'Adjusted' column:)
# df['Adjusted_Yesterday'] = df['Adjusted'].shift(1)
# df['rel_return'] = (df['Adjusted'] - df['Adjusted_Yesterday']) / df['Adjusted_Yesterday']
df = (
    df_clean_sample
    .copy()
    .sort_values(['Ticker', 'Date'])
    .assign(
        returns=lambda d: (d['Close'] - d['Open']) / d['Open'],
        volatility=lambda d: (d['High'] - d['Low']) / ((d['High'] + d['Low']) / 2),
        dollar_vol=lambda d: d['Volume'] * ((d['Open'] + d['Close']) / 2),
    )
)

# 2. Create next-day target (per ticker) and drop its NaNs early
df['target'] = df.groupby('Ticker')[target].shift(-1)
y_df = df.loc[:, ['Ticker', 'Date', 'target']].dropna().reset_index(drop=True)

# 3. Split train/test by last 20% of unique dates
#    (the day at the 80% position itself goes to the training set)
day = df['Date'].dt.normalize()
all_days = pd.Series(day.unique()).sort_values().values
split_date = all_days[int(len(all_days) * 0.8)]
train_df = df[day <= split_date].copy()
test_df = df[day > split_date].copy()

print(f'train shape: {train_df.shape}, test shape: {test_df.shape}')



train shape: (994, 11), test shape: (248, 11)


In [20]:
# 4. Feature engineering per ticker (keep Ticker & Date, drop raw features & target)
def build_features(df_in, target, beta_window=10, ma_windows=[10, 20, 60], ewm_alpha=[0.1, 0.3, 0], lags=1, out_len=500):
    """Build the per-ticker feature table used as model input.

    Each ticker group is processed independently (sorted by ``Date``), so no
    rolling, lag or smoothing computation crosses from one ticker to another.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain ``Ticker``, ``Date`` (datetime), ``Close``, ``returns``,
        ``volatility``, ``dollar_vol`` and the ``target`` column.
    target : str
        Name of the column that is exponentially smoothed.
    beta_window : int
        Window for the rolling beta against index returns. Only used by the
        index loop, which currently iterates over an empty list.
    ma_windows : list of int
        Window lengths for the rolling means of returns, volatility and
        dollar volume. Not mutated.
    ewm_alpha : list of float
        Smoothing factors for ``Series.ewm``. The special value ``0`` requests
        a ``SimpleExpSmoothing`` fit with an estimated smoothing level.
        Not mutated.
    lags : int
        Number of lagged copies (1..lags) of returns, volatility, dollar volume.
    out_len : int
        Currently unused; kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        ``Ticker``, ``Date`` and the engineered columns, with every row that
        contains a NaN removed (this includes the rolling warm-up rows).
        An empty DataFrame when ``df_in`` has no ticker groups.
    """
    feats = []
    for ticker, grp in df_in.groupby('Ticker'):
        g = grp.sort_values('Date').copy()
        g['Period'] = g['Date'].dt.to_period('D')
        g = g.set_index('Period')

        # Rolling rate of change over last 10 days
        g['Close_ROC10'] = g['Close'].pct_change(periods=10)

        # Moving averages
        for w in ma_windows:
            g[f'returns_ma{w}'] = g['returns'].rolling(w).mean()
            g[f'volatility_ma{w}'] = g['volatility'].rolling(w).mean()
            g[f'dollar_vol_ma{w}'] = g['dollar_vol'].rolling(w).mean()

        # Lag features
        for lag in range(1, lags + 1):
            g[f'returns_lag{lag}'] = g['returns'].shift(lag)
            g[f'volatility_lag{lag}'] = g['volatility'].shift(lag)
            g[f'dollar_vol_lag{lag}'] = g['dollar_vol'].shift(lag)

        # exponential smoothing
        for alpha in ewm_alpha:
            if alpha == 0:
                # Fit on this ticker's own series (not on a notebook-global frame)
                # and pass plain values so the fitted values come back as an
                # array of the same length and are assigned positionally.
                fit = SimpleExpSmoothing(
                    g[target].to_numpy(), initialization_method="estimated"
                ).fit()
                print(f'Optimal alpha for exponential smoothing: {fit.params["smoothing_level"]}')
                colname = f'{target}_EWMopt'
                g[colname] = fit.fittedvalues
            else:
                colname = f'{target}_EWM{alpha}'.replace('.', '')
                g[colname] = g[target].ewm(alpha=alpha).mean()

        # rolling correlations with indices
        # (disabled: the key list is empty, so this loop body never runs)
        for index_key in []: # 'dj', 'nasdaq', 'SP500'
            returns_index_col = f'{target}_{index_key}'

            rolling_cov = g[[target, returns_index_col]].rolling(window=beta_window).cov()

            cov = rolling_cov.loc[
                rolling_cov.index.get_level_values(1) == returns_index_col, target
                ].reset_index(drop=True)
            cov.index = g.index

            var = g[returns_index_col].rolling(window=beta_window).var()
            var.index = g.index

            g[f'Beta{beta_window}_{index_key}'] = cov / var

        # Seasonality: order-1 calendar Fourier terms for the 'YE', 'QE', 'ME'
        # and 'W' frequencies (no constant, no polynomial trend).
        dp = DeterministicProcess(
            index=g.index, constant=False, order=0, seasonal=False,
            additional_terms=[
                CalendarFourier(freq='YE', order=1),
                CalendarFourier(freq='QE', order=1),
                CalendarFourier(freq='ME', order=1),
                CalendarFourier(freq='W',  order=1)
            ]
        )
        tf = dp.in_sample()

        # Discard the Period index and join features with the Fourier terms row by row
        g = g.reset_index(drop=True)
        merged = pd.concat([g, tf.reset_index(drop=True)], axis=1)

        # Drop rows with any NaNs
        merged = merged.dropna().reset_index(drop=True)

        # Retain only Ticker, Date, and engineered features
        merged['Ticker'] = ticker
        cols_to_drop = [
            'Open', 'High', 'Low', 'Close', 'Volume',
            'rel_return', 'volatility', 'dollar_vol', 'target'
        ]
        merged = merged.drop(columns=cols_to_drop, errors='ignore')

        feats.append(merged)
    return pd.concat(feats, ignore_index=True) if feats else pd.DataFrame()

# 5. Build feature tables (identical settings for the train and test splits)
feature_kwargs = dict(
    beta_window=10,
    ma_windows=[10, 20, 60],
    ewm_alpha=[0.1, 0.3, 0.5],
    lags=1,
    out_len=500,
)
X_train_df = build_features(train_df, target, **feature_kwargs)
X_test_df = build_features(test_df, target, **feature_kwargs)

print("X_train shape:", X_train_df.shape)
print("X_test  shape:", X_test_df.shape)


X_train shape: (876, 27)
X_test  shape: (128, 27)


In [21]:
# 6. Align X with y: an inner merge on (Ticker, Date) keeps only rows present in both
merge_keys = ['Ticker', 'Date']
train_merged = X_train_df.merge(y_df, on=merge_keys, how='inner')
test_merged = X_test_df.merge(y_df, on=merge_keys, how='inner')

# 7. Split into feature matrix and target vector
non_feature_cols = merge_keys + ['target']
X_train, y_train = train_merged.drop(columns=non_feature_cols), train_merged['target']
X_test, y_test = test_merged.drop(columns=non_feature_cols), test_merged['target']

for label, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test  shape:", X_test),
    ("y_test   shape:", y_test),
):
    print(label, part.shape)

X_train shape: (876, 25)
y_train shape: (876,)
X_test  shape: (128, 25)
y_test   shape: (128,)


In [25]:
# 8. Fit an ordinary least-squares linear regression model (LinearRegression, no regularisation)
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [26]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

# In-sample and out-of-sample predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Each entry: (label, scoring function, true values, predicted values)
metric_specs = [
    ('Train MAE', mean_absolute_error, y_train, y_train_pred),
    ('Train RMSE', root_mean_squared_error, y_train, y_train_pred),
    ('Train R2', r2_score, y_train, y_train_pred),
    ('Test MAE', mean_absolute_error, y_test, y_test_pred),
    ('Test RMSE', root_mean_squared_error, y_test, y_test_pred),
    ('Test R2', r2_score, y_test, y_test_pred),
]
metrics = {
    label: scorer(actual, predicted)
    for label, scorer, actual, predicted in metric_specs
}

# Report every metric with six decimals
for name, value in metrics.items():
    print(f"{name}: {value:.6f}")

Train MAE: 0.009295
Train RMSE: 0.012117
Train R2: 0.052404
Test MAE: 0.015389
Test RMSE: 0.025183
Test R2: -0.051506
