In [1]:
# !pip install finance-datareader
# !pip install statsmodels==0.12.2

In [2]:
import os
import warnings

import numpy as np
import statsmodels
import pandas as pd
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
from statsmodels.tsa.arima_model import ARIMA

warnings.filterwarnings(action='ignore')

In [3]:
# The ARIMA class imported above from statsmodels.tsa.arima_model is deprecated in
# newer statsmodels releases. Print the installed version so the reader can confirm
# it matches the one pinned in the (commented-out) install cell: 0.12.2.
print(statsmodels.__version__)

0.12.2


In [4]:
class StockPredictor:
    def __init__(self):
        self.submission_path = './sample_submission.csv'
        self.stock_list_path = './stock_list.csv'
        self.final_submission_path = './final_submission_last.csv'
        self.figure_path = './figures'

    def load_file(self):
        self.submission = pd.read_csv(self.submission_path)
        self.stock_list = pd.read_csv(self.stock_list_path)
        self.stock_list['종목코드'] = self.stock_list['종목코드'].apply(lambda x: str(x).zfill(6))

        self.codes = np.sort(self.stock_list['종목코드'].values)      

        self.public_submission = self.submission.iloc[:5]
        self.public_submission_form = self.public_submission.copy()
        self.private_submission = self.submission.iloc[5:]        
        
    def load_data(self, start_date='2021-01-01', end_date='2021-11-26'):
        # price information
        dfs = []
        for code in tqdm(self.codes):
            df = fdr.DataReader(code, start=start_date, end=end_date)
            df['Code'] = code
            dfs.append(df)
        self.df_marcap_backup = pd.concat(dfs)
        
        # sector information
        krx = fdr.StockListing('KRX')
        krx = krx[krx['Symbol'].isin(self.codes)]
        krx = krx[['Symbol', 'Sector']]
        krx['HighSector'] = krx['Sector'].str.split(' ').str[-1]
        krx = krx.reset_index(drop=True)
        krx.loc[krx['HighSector'].isna(), 'HighSector'] = 'Others'
        krx.loc[krx['HighSector'] == '제외', 'HighSector'] = '제조업'
        self.krx = krx
        self.sectors = self.krx.HighSector.unique()
        
    def set_dates(self, training_date='2021-10-29', start_date='2021-11-1', end_date='2021-11-5', public=True):
        self.training_date = training_date
        self.start_date = start_date
        self.end_date = end_date
        
        self.df_marcap = self.df_marcap_backup.loc[:self.training_date]
        
        if public:
            self.public_test()
        
    def public_test(self):
        self.public_answer = self.public_submission_form.copy()
        public_pivot = self.df_marcap_backup.loc[self.start_date:self.end_date]
        public_pivot = public_pivot.reset_index().pivot('Date', 'Code', 'Close')
        
        for code in self.codes:
            if code in public_pivot.columns:
                self.public_answer.loc[:,code] = public_pivot.loc[:,code].values

        # validation check
        # self.score((self.public_answer.set_index('Day')*0.5).reset_index())
        self.baseline_submission = self.public_submission_form.copy()

        baseline_pivot = self.df_marcap.loc[self.training_date].reset_index().pivot('Date', 'Code', 'Close')
        for code in self.codes:
            if code in baseline_pivot.columns:
                 self.baseline_submission.loc[:,code] = baseline_pivot.loc[:,code].values[0]
                    
    def score(self, df_preds):
        trues = self.public_answer.set_index('Day').replace(0, np.nan).values
        preds = df_preds.set_index('Day').values

        return np.nanmean(np.abs(trues - preds) / trues, axis=1)
    
    def predict_ratio_arima(self, log_means):
        model = ARIMA(log_means, order=(0,1,1))
        model_fit = model.fit(trend='nc',full_output=True, disp=1)
        fore = model_fit.forecast(steps=5)
        return fore[0] - log_means.values[-1]
        
    def compare_to_baseline(self):
        public_scores = self.score(self.public_submission)
        baseline_scores = self.score(self.baseline_submission)
        
        print(f'public_scores: {np.mean(public_scores)}, {public_scores}')
        print(f'baseline_scores: {np.mean(baseline_scores)}. {baseline_scores}')       
        return np.mean(public_scores), np.mean(baseline_scores)
        
    def train(self, global_ratio=0.1, sector_ratio=0.3, company_ratio=0.6, public=True):
        pivot = self.df_marcap.reset_index().pivot('Date', 'Code', 'Close').fillna(method='bfill')
        vals = pivot.values
        nor_pivot = pivot.copy()
        nor_pivot.iloc[:,:] = vals / vals[0]

        # global
        print('train globally...')
        log_means_global = np.log(nor_pivot.mean(axis=1))
        forecasts_global = self.predict_ratio_arima(log_means_global)

        dict_global_corr = dict()
        for code in self.codes:
            corr = log_means_global.corr(nor_pivot.loc[:,code])
            dict_global_corr[code] = corr

        # by sector
        print('train by sector...')
        forecasts_sector = []
        dict_sector_corr = dict()
        for sector in self.sectors:
            symbols = self.krx.loc[self.krx['HighSector'] == sector, 'Symbol'].values

            df = nor_pivot.loc[:,symbols]
            log_means = np.log(df.mean(axis=1))
            pred = self.predict_ratio_arima(log_means)               
            forecasts_sector.append(pred)

            for code in symbols:
                corr = log_means.corr(nor_pivot.loc[:,code])
                dict_sector_corr[code] = corr
     
        # by company
        print('train by company...')
        dict_company_pred = dict()
        for code in tqdm(self.codes):
            if code in self.krx['Symbol'].unique():
                log_means = np.log(nor_pivot[code])
                pred = self.predict_ratio_arima(log_means)             
                dict_company_pred[code] = pred

        self.ratios = dict()
        for code in self.codes:
            if code in self.krx['Symbol'].unique():
                sector = self.krx.loc[self.krx['Symbol'] == code, 'HighSector'].values[0]
                if sector == 'Others':
                    ratio = np.exp(global_ratio*dict_global_corr[code]*forecasts_global 
                                          + (company_ratio+sector_ratio)*dict_company_pred[code])
                    self.ratios[code] = ratio
                else:
                    sector_idx = np.where(sector==self.sectors)[0][0]
                    ratio = np.exp(global_ratio*dict_global_corr[code]*forecasts_global 
                                          + sector_ratio*dict_sector_corr[code]*forecasts_sector[sector_idx] 
                                          + company_ratio*dict_company_pred[code])
                    self.ratios[code] = ratio
        
        if public:
            submission = self.public_submission
        else:
            submission = self.private_submission
            
        for code in self.codes:
            if code in self.ratios.keys():
                submission[code] = pivot.iloc[-1][code] * self.ratios[code]
            else:
                submission[code] = pivot.iloc[-1][code]
 
    def postprocessing(self):
        self.private_submission.loc[:,'017670'] /= 5
    
    def save(self):  
        self.submission.iloc[:5,1:] = self.public_submission.iloc[:,1:]
        self.submission.iloc[5:,1:] = self.private_submission.iloc[:,1:]
        
        self.submission = self.submission.fillna(0)
        self.submission.to_csv(self.final_submission_path, index=False)
    
    def save_figures(self):
        """Save one PNG per code with the private forecast and recent closes.

        Each figure plots the code's column of the private submission (indexed by
        ``Day``) together with the last 100 ``Close`` rows of that code from
        ``self.df_marcap``, and is written to ``<figure_path>/<code>.png``.
        The date-indexed copy of the private submission is kept in
        ``self.private_submission_copy``.
        """
        os.makedirs(self.figure_path, exist_ok=True)

        forecast = self.private_submission.copy()
        forecast['Day'] = pd.to_datetime(forecast['Day'], format='%Y-%m-%d')
        self.private_submission_copy = forecast.set_index('Day')

        for code in tqdm(self.codes):
            fig, ax = plt.subplots()
            try:
                self.private_submission_copy[code].plot(ax=ax)
                self.df_marcap.loc[self.df_marcap['Code'] == code].iloc[-100:,]['Close'].plot(ax=ax)
                ax.set_title(str(code) + ' ' + self.stock_list[self.stock_list['종목코드'] == code]['종목명'].values[0])
                fig.savefig(os.path.join(self.figure_path, str(code) + '.png'))
            finally:
                # Close every figure: one is opened per code, and pyplot keeps
                # them all alive until they are closed explicitly.
                plt.close(fig)
        
    def run(self):
        self.load_file()
        self.load_data(start_date='2021-01-01', end_date='2021-11-26')
        self.set_dates(training_date='2021-10-29', start_date='2021-11-1', end_date='2021-11-5', public=True)
        self.train(public=True)
        self.set_dates(training_date='2021-11-26', start_date='2021-11-29', end_date='2021-12-3', public=False)
        self.train(public=False)
        # self.postprocessing()
        self.save()

In [5]:
# Run the full pipeline: load the inputs, train on the public and the private
# window, and write the final submission CSV.
predictor = StockPredictor()
predictor.run()

100%|██████████| 370/370 [00:43<00:00,  8.51it/s]


train globally...
train by sector...


  2%|▏         | 8/370 [00:00<00:04, 74.94it/s]

train by company...


100%|██████████| 370/370 [00:04<00:00, 87.22it/s] 


train globally...
train by sector...


  2%|▏         | 9/370 [00:00<00:04, 83.28it/s]

train by company...


100%|██████████| 370/370 [00:04<00:00, 91.67it/s] 
