In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from qids_package.qids import *
import lightgbm as lgb
!pip install /kaggle/input/d/qiwenshi/ta-lib/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
import talib as ta
import pykalman 
from scipy.fft import fft,fftfreq

Processing /kaggle/input/d/qiwenshi/ta-lib/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
Installing collected packages: talib-binary
Successfully installed talib-binary-0.4.19
[0m

In [2]:
env = make_env()  # initialize the environment

Environment is initialized.


In [3]:
fun_data = pd.read_csv('/kaggle/input/hku-qids-2023-quantitative-investment-competition/first_round_train_fundamental_data.csv')
mark_data = pd.read_csv("/kaggle/input/hku-qids-2023-quantitative-investment-competition/first_round_train_market_data.csv")
return_data = pd.read_csv("/kaggle/input/hku-qids-2023-quantitative-investment-competition/first_round_train_return_data.csv")

In [4]:
def merge_data(market,fundamental):
    # make your prediction Y here and replace the following four rows
    market['date'] = market.apply(lambda x: x['date_time'].split('p')[0], axis=1)
    fundamental.rename(columns={"date_time": "date"}, inplace=True)
    market = pd.merge(market, fundamental, on='date')
    market = market.drop("date", axis='columns')
    market['stock'] = market.apply(lambda x: x['date_time'].split('d')[0].split('s')[1], axis=1)
    market['day'] = market.apply(lambda x: x['date_time'].split('p')[0], axis=1)
    market['time'] = market.apply(lambda x: x['date_time'].split('p')[1], axis=1)
    market['days'] = market.apply(lambda x: x['day'].split('d')[1], axis=1)
    market = market.drop("day", axis='columns')
    market['stock'] = market['stock'].astype('int')
    market['days'] = market['days'].astype('int')
    market['time'] = market['time'].astype('int')    
    market = market.sort_values(by=['stock', 'days','time'], ascending=[True, True,True])
    return market

def subtract_from_previous_50(row,feature,full_data):
    if int(row.time) % 50 == 0 and int(row.time) >= 50:
        return row[feature] - full_data.iloc[row.name - 49][feature]
    else:
        return row[feature]
def preprocess(full_data):
    full_data['stock_day'] = full_data.apply(lambda x: x['date_time'].split('p')[0], axis=1)
    grouped = full_data.groupby('stock') 
    result = pd.DataFrame()
    for stock, group in grouped:    
        features = ['money','volume','close']
        # Finding the diff between p1 and p50
        for feature in features:
            group[feature+'_rsi'] = ta.RSI(group[feature], 50)
            group[feature+'_sma'] = ta.SMA(group[feature], 50)
            group[feature+'_slope'] = ta.LINEARREG_SLOPE(group[feature], timeperiod = 50)
        group['ADX'] = ta.ADX(group['high'], group['low'], group['close'], 50)
        slowk, slowd = ta.STOCH(group['high'], group['low'], group['close'], 
                           fastk_period=5, slowk_period=14, slowk_matype=0, slowd_period=14, slowd_matype=0)

        group['slowk'] = slowk
        group['slowd'] = slowd
        macd, signal, hist = ta.MACD(group['close'], fastperiod=10, slowperiod=20, signalperiod=5)
        group['macd'] = macd
        group['signal'] = signal
        group['hist'] = hist
        group['dema'] = ta.DEMA(group['close'],timeperiod = 50)
        #group['slope'] = ta.LINEARREG_SLOPE(group['slope'], timeperiod = 50)
        group['cci'] = ta.CCI(group['high'], group['low'], group['close'], 50)
        
        group['DX'] = ta.DX(group['high'], group['low'], group['close'], 50)
        
        group['plus_di'] = ta.PLUS_DI(group['high'], group['low'], group['close'], 50)
        
        group['plus_dm'] = ta.PLUS_DM(group['high'], group['low'], 50)
        features = ['money','volume','low','high','close','open']
        # Finding the gap between p1 and p50
        group = group.reset_index(drop=True)
        for feature in features:
            group[feature+'_gap'] = group.apply(lambda x: subtract_from_previous_50(x,feature,group), axis=1)
        group = group.loc[group['time'] == 50]
        result = result.append(group, ignore_index=True)
    
    result['EPS'] = result['close']/result['pe']
    
    return result

def split_complex_number(z):
    return pd.Series({'real': np.real(z), 'imag': np.imag(z)})
def filter_return(train_return):
    grouped = train_return.groupby('stock') 
    result = pd.DataFrame()
    for stock, group in grouped:
        returns = np.log(group['close']/group['close'].shift(1))
        
        N = len(group)
        
        yf = fft(returns.fillna(0).values)
        kf = pykalman.KalmanFilter(initial_state_mean=0, n_dim_obs=1,
                                   n_dim_state=1)
        kf_state_means, _ = kf.filter(returns.values)
    
        group['fourier_returns'] = yf
        group['kalman_returns'] = pd.Series(kf_state_means.flatten(), index=group.index)
        result = result.append(group, ignore_index=True)
    

    # Apply the function to the 'complex_numbers' column
    result[['fourier_real', 'fourier_ima']] = result['fourier_returns'].apply(split_complex_number)
    result = result.drop("fourier_returns", axis='columns')
    
    return result

In [5]:
fundamental_df = pd.read_csv('/kaggle/input/hku-qids-2023-quantitative-investment-competition/qids_package/first_round_test_fundamental_data.csv')
market_df = pd.read_csv('/kaggle/input/hku-qids-2023-quantitative-investment-competition/qids_package/first_round_test_market_data.csv')

# append the correlated dataset to train dataset
fun_data = fun_data.append(fundamental_df)
mark_data = mark_data.append(market_df)

# Merging the required dataset
full_data = merge_data(mark_data,fun_data)
# Starting feature engineering
full_data_return = preprocess(full_data)
# Spliting training and testing data
full_data_return = filter_return(full_data_return)
full_data_return_train = full_data_return.loc[full_data_return.days < 1001]
full_data_return_predict = full_data_return.loc[full_data_return.days >= 1001]

full_data_return_train = full_data_return_train.drop("date_time", axis='columns')
full_data_return_train.rename(columns={"stock_day": "date_time"}, inplace=True)

In [6]:
train_return = pd.merge(full_data_return_train, return_data, on='date_time')
train_return = train_return.set_index('date_time')

In [7]:
xtrain = train_return.loc[:, train_return.columns != 'return']
ytrain = train_return['return']

params = {
      'objective': 'rmse',  
      'boosting_type': 'dart',
      'tree_learner': 'feature',
      'n_jobs': -1,
      'verbose': -1
    }
train_dataset = lgb.Dataset(xtrain, ytrain)
model = lgb.train(params = params,  
                  train_set = train_dataset,
                          num_boost_round = 200, 
                          verbose_eval = False
                          )

full_data_return_predict = full_data_return_predict.drop("date_time", axis='columns')
full_data_return_predict.rename(columns={"stock_day": "date_time"}, inplace=True)
test = full_data_return_predict.set_index('date_time')
predict = model.predict(test)
test['return'] = predict
test = test.reset_index()
test[['date_time', 'return']].to_csv('submission.csv',index = False)  # upload your predicted Y



In [8]:
import pandas as pd
print(pd.read_csv('/kaggle/working/submission.csv'))

      date_time    return
0       s0d1001 -0.008123
1       s0d1002 -0.006532
2       s0d1003 -0.007853
3       s0d1004 -0.009205
4       s0d1005 -0.007505
...         ...       ...
37795  s53d1696 -0.005409
37796  s53d1697 -0.004042
37797  s53d1698 -0.005365
37798  s53d1699 -0.002419
37799  s53d1700 -0.001851

[37800 rows x 2 columns]


In [9]:
print(dir(ta))

['ACOS', 'AD', 'ADD', 'ADOSC', 'ADX', 'ADXR', 'APO', 'AROON', 'AROONOSC', 'ASIN', 'ATAN', 'ATR', 'AVGPRICE', 'BBANDS', 'BETA', 'BOP', 'CCI', 'CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK', 'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR', 'CDLDRAGONFLYDOJI', 'CDLENGULFING', 'CDLEVENINGDOJISTAR', 'CDLEVENINGSTAR', 'CDLGAPSIDESIDEWHITE', 'CDLGRAVESTONEDOJI', 'CDLHAMMER', 'CDLHANGINGMAN', 'CDLHARAMI', 'CDLHARAMICROSS', 'CDLHIGHWAVE', 'CDLHIKKAKE', 'CDLHIKKAKEMOD', 'CDLHOMINGPIGEON', 'CDLIDENTICAL3CROWS', 'CDLINNECK', 'CDLINVERTEDHAMMER', 'CDLKICKING', 'CDLKICKINGBYLENGTH', 'CDLLADDERBOTTOM', 'CDLLONGLEGGEDDOJI', 'CDLLONGLINE', 'CDLMARUBOZU', 'CDLMATCHINGLOW', 'CDLMATHOLD', 'CDLMORNINGDOJISTAR', 'CDLMORNINGSTAR', 'CDLONNECK', 'CDLPIERCING', 'CDLRICKSHAWMAN', 'CDLRISEFALL3METHODS', 'CDLSEPAR