### Loading Data

In [None]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

!pip install pandas-ta
import pandas_ta as ta
import math

In [None]:
# Numeric stock codes for the KLCI tickers used throughout this notebook.
# Each code is turned into a ticker symbol by appending the '.KL' suffix.
_KLCI_CODES = (
    '5681', '3182', '1066', '5285', '5296', '1295',
    '1082', '5183', '7084', '1155', '5819', '1015',
    '4863', '5225', '4715', '5347', '1961', '1023',
    '6888', '4707', '4065', '6947', '6012', '2445',
    '3816', '4197', '4677', '6033', '6742', '8869',
)
klci_stocks = [f'{code}.KL' for code in _KLCI_CODES]

# Echo the assembled ticker list
print(klci_stocks)

In [None]:
# Download daily price data for the KLCI tickers from Yahoo Finance (via
# yfinance) for January 2022 through March 2024.
start_date = '2022-01-01'
end_date = '2024-03-31'

# auto_adjust=False is passed explicitly so the result keeps a separate
# 'Adj Close' column: the indicator cells below read it as 'adj close', and
# recent yfinance releases default to auto_adjust=True, which omits it.
# .stack() moves the ticker level of the columns into the row index.
df = yf.download(tickers=klci_stocks,
                 start=start_date,
                 end=end_date,
                 auto_adjust=False).stack()

# Label the two row-index levels as (date, ticker)
df.index.names = ['date', 'ticker']

# Lower-case the column labels, e.g. 'Adj Close' -> 'adj close'
df.columns = df.columns.str.lower()

df


In [None]:
# Print the frame summary (dtypes, non-null counts), then show the number
# of missing values in each column.
df.info()
df.isna().sum()

## Calculate Technical Indicators



In [None]:
# Garman-Klass volatility estimate:
#     0.5 * ln(high/low)^2 - (2*ln(2) - 1) * ln(close/open)^2
# The high/low term must stay squared; taking its square root (as the earlier
# version did) mixes a first-order range with a second-order open/close term.
# NOTE(review): the close leg uses 'adj close' while 'open' is unadjusted -
# confirm this mix is intended.
log_high_low = np.log(df['high']) - np.log(df['low'])
log_close_open = np.log(df['adj close']) - np.log(df['open'])
df['garman_klass_vol'] = (log_high_low ** 2) / 2 - (2 * np.log(2) - 1) * (log_close_open ** 2)

# Per-ticker RSI on the adjusted close. The 20-period length is a tunable choice.
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

# Per-ticker 20-period Bollinger bands on log1p(adjusted close). Columns 0, 1
# and 2 of the pandas_ta.bbands result are stored as the low, mid and high band.
for band_name, band_pos in (('bb_low', 0), ('bb_mid', 1), ('bb_high', 2)):
    df[band_name] = df.groupby(level=1)['adj close'].transform(
        lambda x, pos=band_pos: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:, pos])

def compute_atr(stock_data):
    """Compute the 14-period Average True Range for one ticker.

    Parameters
    ----------
    stock_data : DataFrame
        Rows for a single ticker with 'high', 'low' and 'close' columns.

    Returns
    -------
    The result of ``pandas_ta.atr`` for those columns, returned unchanged.
    """
    return pandas_ta.atr(
        high=stock_data['high'],
        low=stock_data['low'],
        close=stock_data['close'],
        length=14,
    )

# Run compute_atr on each ticker's rows separately (index level 1) and store
# the combined result as the 'atr' column.
atr_by_ticker = df.groupby(level=1, group_keys=False).apply(compute_atr)
df['atr'] = atr_by_ticker

def compute_macd(close):
    """Return the z-scored MACD line for one ticker's price series.

    Parameters
    ----------
    close : Series
        Price series for a single ticker.

    Returns
    -------
    Series
        First column of the ``ta.macd`` result, centred on its own mean and
        scaled by its own standard deviation. If ``ta.macd`` returns ``None``
        (pandas_ta does this when it cannot compute the indicator, e.g. for a
        series that is too short), an all-NaN series on ``close.index`` is
        returned instead of raising.
    """
    # pandas_ta.macd is configured through fast/slow/signal. The `length`
    # keyword used here before is not one of its parameters and was absorbed
    # by **kwargs without effect, so the windows are now spelled out with the
    # library's standard 12/26/9 values.
    macd_frame = ta.macd(close=close, fast=12, slow=26, signal=9)
    if macd_frame is None:
        return pd.Series(np.nan, index=close.index)

    macd = macd_frame.iloc[:, 0]
    return macd.sub(macd.mean()).div(macd.std())

# Standardised MACD, computed separately for each ticker's adjusted close
macd_by_ticker = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)
df['macd'] = macd_by_ticker

# Traded value in millions: adjusted close times volume, divided by 1e6
df['dollar_volume'] = df['adj close'].mul(df['volume']).div(1e6)

df

In [None]:
# Normalise the ATR values per ticker (z-score).
# No explicit drop of the existing 'atr' column is needed: the assignment to
# df['atr'] at the end of this cell overwrites it in place. The former
# `df.drop('atr', axis='columns')` call discarded its return value, so it
# never removed anything and has been taken out.

def compute_atr(stock_data):
    """Compute one ticker's 14-period ATR and express it as a z-score.

    This redefines the ``compute_atr`` from the earlier indicator cell; the
    only difference is the normalisation step.

    Parameters
    ----------
    stock_data : DataFrame
        Rows for a single ticker with 'high', 'low' and 'close' columns.

    Returns
    -------
    The ``pandas_ta.atr`` result minus its own mean, divided by its own
    standard deviation.
    """
    raw_atr = pandas_ta.atr(
        high=stock_data['high'],
        low=stock_data['low'],
        close=stock_data['close'],
        length=14,
    )
    centred = raw_atr - raw_atr.mean()
    return centred / raw_atr.std()

# Recompute 'atr' ticker by ticker (index level 1) with the compute_atr
# defined just above, replacing the values already stored in that column.
normalized_atr = df.groupby(level=1, group_keys=False).apply(compute_atr)
df['atr'] = normalized_atr

## Aggregate to monthly level

In [None]:
# Convert the business-daily panel to month-end frequency, to cut training
# time and make it easier to experiment with features and strategies.

# Columns NOT carried forward as a month-end "last value" feature
_not_last_value = {'dollar_volume', 'volume', 'open', 'high', 'low', 'close', 'daily return'}
last_cols = [c for c in df.columns.unique(0) if c not in _not_last_value]

# Monthly mean of dollar volume, reshaped back to one row per (date, ticker)
monthly_dollar_volume = (df.unstack('ticker')['dollar_volume']
                         .resample('M')
                         .mean()
                         .stack('ticker')
                         .to_frame('dollar_volume'))

# Last observation of each month for every remaining feature column
monthly_last_values = (df.unstack()[last_cols]
                       .resample('M')
                       .last()
                       .stack('ticker'))

# Combine side by side and keep only rows complete in every column,
# dollar_volume included...
monthdata = pd.concat([monthly_dollar_volume, monthly_last_values], axis=1).dropna()

# ...then remove dollar_volume itself from the monthly feature set
monthdata = monthdata.drop(columns='dollar_volume')

In [None]:
# Display the month-end feature table built in the previous cell
monthdata

## Monthly Returns for Different Time Horizons

In [None]:
def calculate_returns(df, outlier_cutoff=0.005, lags=(1, 3, 6, 9)):
    """Add per-period returns over several look-back horizons.

    For each ``lag`` a ``return_{lag}m`` column is computed from the
    'adj close' column: the ``lag``-period percentage change is clipped to its
    own [outlier_cutoff, 1 - outlier_cutoff] quantile range and then converted
    to a per-period geometric rate, ``(1 + r) ** (1 / lag) - 1``.

    Parameters
    ----------
    df : DataFrame
        Rows for a single ticker, containing an 'adj close' column.
    outlier_cutoff : float, default 0.005
        Quantile used on each tail when clipping extreme returns.
    lags : iterable of int, default (1, 3, 6, 9)
        Look-back horizons, in rows of ``df``.

    Returns
    -------
    DataFrame
        A copy of ``df`` with the return columns appended. The input frame is
        left untouched, so calling this twice on the same data is safe.
    """
    result = df.copy()
    price = result['adj close']

    for lag in lags:
        raw_return = price.pct_change(lag)
        # Clip to this series' own tail quantiles to limit outliers
        clipped = raw_return.clip(lower=raw_return.quantile(outlier_cutoff),
                                  upper=raw_return.quantile(1 - outlier_cutoff))
        # Express the multi-period return as a per-period rate
        result[f'return_{lag}m'] = clipped.add(1).pow(1 / lag).sub(1)

    return result
    
    
# Add the multi-horizon return columns ticker by ticker (index level 1),
# then keep only the rows where every column has a value.
returns_by_ticker = monthdata.groupby(level=1, group_keys=False).apply(calculate_returns)
monthdata = returns_by_ticker.dropna()
monthdata

## Fama-French Five-Factor Model


In [None]:
# Fetch the 'Emerging_5_Factors' dataset from the 'famafrench' source of
# pandas-datareader, take the first table it returns and discard the 'RF' column.
ff_tables = web.DataReader('Emerging_5_Factors', 'famafrench', start='2022')
factor_data = ff_tables[0].drop('RF', axis=1)

# Convert the index to timestamps and move each observation to month-end so
# the dates line up with monthdata. Values are divided by 100 (presumably
# quoted in percent - confirm against the data source).
factor_data.index = factor_data.index.to_timestamp()
factor_data = factor_data.resample('M').last().div(100)
factor_data.index.name = 'date'

# Attach each ticker's 1-month return to the factor values on the shared
# 'date' level, then order the rows by index.
one_month_returns = monthdata['return_1m']
factor_data = factor_data.join(one_month_returns).sort_index()

factor_data

In [None]:
# Rolling factor betas: for each ticker, regress return_1m on the factor
# columns (plus an intercept) over a rolling window of up to 6 rows and keep
# the factor coefficients.
#
# RollingOLS rejects window < min_nobs with a ValueError. Because the window
# is capped at the ticker's row count, any ticker with fewer rows than
# min_nobs would abort the whole computation, so such tickers are left out.
# NOTE(review): with window=6 and 6 estimated parameters (const + 5 factors)
# each fit has no spare degrees of freedom - confirm this window is intended.
min_rows_required = len(factor_data.columns)  # same value passed as min_nobs below
rows_per_ticker = factor_data.groupby(level=1).size()
eligible_tickers = rows_per_ticker[rows_per_ticker >= min_rows_required].index
factor_data_eligible = factor_data[factor_data.index.get_level_values(1).isin(eligible_tickers)]

betas = (factor_data_eligible.groupby(level=1,
                                      group_keys=False)
         .apply(lambda x: RollingOLS(endog=x['return_1m'],
                                     exog=sm.add_constant(x.drop('return_1m', axis=1)),
                                     window=min(6, x.shape[0]),
                                     min_nobs=len(x.columns))
         .fit(params_only=True)
         .params
         .drop('const', axis=1)))

betas

In [None]:
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']


def _fill_with_group_mean(group):
    """Replace missing values in each column with that column's mean within the group."""
    return group.fillna(group.mean())


# Shift each ticker's betas down by one row, so a row carries the betas
# estimated for the previous row, then attach them to the monthly features.
lagged_betas = betas.groupby('ticker').shift()
proc_data = monthdata.join(lagged_betas)

# Fill the remaining gaps in the factor columns with the ticker's own mean.
# NOTE(review): the mean is taken over the ticker's full sample, so filled
# values draw on later months as well - confirm this is acceptable.
filled_factors = (proc_data
                  .groupby('ticker', group_keys=False)[factors]
                  .apply(_fill_with_group_mean))
proc_data.loc[:, factors] = filled_factors

# Remove the price column, then discard rows that still have a missing value
proc_data = proc_data.drop('adj close', axis=1).dropna()

proc_data.info()

proc_data