In [2]:
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import warnings
from sklearn import model_selection
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [3]:
def read_data():
    """Load the raw ADP CSV into a DataFrame.

    Returns:
        pandas.DataFrame: contents of ``../input/raw/ADP.csv`` (path is
        resolved relative to the current working directory).
    """
    raw_csv_path = '../input/raw/ADP.csv'
    return pd.read_csv(raw_csv_path)

def save_data(df, perc_):
    """Split ``df`` by row position and write the two parts to CSV.

    Args:
        df: DataFrame to split.
        perc_: fraction of rows (from the top) that goes to the train file;
            the row count is truncated with ``int``.

    Writes ``../input/train.csv`` (leading rows) and ``../input/test.csv``
    (remaining rows). The frame's index is written as the first CSV column.
    """
    split_at = int(df.shape[0] * perc_)

    df[:split_at].to_csv('../input/train.csv')
    df[split_at:].to_csv('../input/test.csv')
    
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    For every column whose dtype is one of the listed integer/float dtypes,
    the column's min and max are compared against the representable range of
    progressively wider dtypes and the column is cast to the first one that
    contains both. Bounds are inclusive, so a column whose extreme value sits
    exactly on a dtype limit (e.g. max == 127) still gets the narrow dtype.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when True (default) print the memory usage after
            optimisation and the percentage saved; when False stay silent.

    Returns:
        The same DataFrame object, with downcast columns.

    NOTE(review): float columns whose range fits are cast to float16, which
    keeps only about 3 significant decimal digits. That is lossy for price
    data - confirm the precision loss is acceptable for downstream use.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # non-numeric (and unlisted numeric) columns are left as-is

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: comparisons
                # fail, so fall back to the widest float.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Load the raw CSV and immediately downcast its numeric columns.
data = reduce_mem_usage(read_data())

Memory usage after optimization is: 1.24 MB
Decreased by 75.1%


In [4]:
# data['Date'] = pd.to_datetime(data['Date'])
# NOTE: `df` is bound to the same object as `data` (no copy is made), so any
# column added through `df` is also visible through `data`.
df = data

In [None]:
# df['day'] = df['Date'].dt.day.astype("int16")
# df['month'] = df['Date'].dt.month.astype("int16")
# df['year'] = df['Date'].dt.year.astype("int16")
# df['weekday'] = df['Date'].dt.weekday.astype("int16")
# df['quarter'] = df['Date'].dt.quarter.astype("int16")
# # df['is_holiday'] = df['Date'].isin(holidays).astype("int16")

# Align each row with the Close of the following row; the last row gets NaN.
df['shifted_close'] = df['Close'].shift(-1)
df['shifted_close'].head()

In [None]:
# Overlay Close and its one-row-ahead copy. `shifted_close` was added through
# `df`, which is the same object as `data`.
data[['Close', 'shifted_close']].plot(figsize=(16,6))

In [None]:
# Plot the Close series on its own.
data['Close'].plot(figsize=(16,6))

In [None]:
def calc_roll_stats(s, windows=[5, 30]):
    '''
    # https://www.kaggle.com/hirayukis/lightgbm-keras-and-4-kfold
    Calculates rolling stats like mean, std, min, max...

    Args:
        s: pandas Series to summarise.
        windows: iterable of window lengths (in rows). The default list is
            only read, never mutated.

    Returns:
        DataFrame indexed like ``s`` with, for every window ``w``:
        trailing mean/std/min/max/range, the same statistics shifted by
        ``-w`` rows (``*_s_*``), statistics over a ``2*w`` window shifted by
        ``-w`` rows (min-abs, quantiles), and ``mean_abs_chg<w>``, the
        trailing mean of ``|s[t] - s[t-1]|``. Remaining NaNs are replaced
        with 0.

    NOTE(review): every ``.shift(-w)`` column places values computed from
    rows *after* the current one onto the current row. Confirm that this
    look-ahead is intended before using these columns as model inputs.
    '''
    quantile_levels = (('q10', 0.10), ('q25', 0.25), ('q50', 0.50),
                       ('q75', 0.75), ('q90', 0.90))

    roll_stats = pd.DataFrame()
    for w in windows:
        tag = str(w)
        trailing = s.rolling(window=w, min_periods=1)
        double_width = s.rolling(window=2*w, min_periods=1)

        roll_stats['roll_mean_' + tag] = trailing.mean()
        roll_stats['roll_std_' + tag] = trailing.std()
        roll_stats['roll_min_' + tag] = trailing.min()
        roll_stats['roll_max_' + tag] = trailing.max()
        roll_stats['roll_range_' + tag] = roll_stats['roll_max_' + tag] - roll_stats['roll_min_' + tag]

        roll_stats['roll_mean_s_' + tag] = trailing.mean().shift(-w)
        roll_stats['roll_std_s_' + tag] = trailing.std().shift(-w)
        roll_stats['roll_min_s_' + tag] = trailing.min().shift(-w)
        roll_stats['roll_max_s_' + tag] = trailing.max().shift(-w)
        roll_stats['roll_range_s_' + tag] = roll_stats['roll_max_s_' + tag] - roll_stats['roll_min_s_' + tag]

        roll_stats['roll_min_abs_' + tag] = double_width.min().abs().shift(-w)
        # NOTE(review): roll_min_abs_ is already shifted by -w, so this
        # subtracts a value shifted by -2w in total. Kept as in the original.
        roll_stats['roll_range_sbs_' + tag] = roll_stats['roll_max_' + tag] - roll_stats['roll_min_abs_' + tag].shift(-w)

        for label, level in quantile_levels:
            roll_stats['roll_' + label + '_' + tag] = double_width.quantile(level).shift(-w)

        # The previous implementation applied the reduction column-wise to
        # roll_stats, which yields a Series indexed by column names; assigning
        # it to a row-indexed frame produced an all-NaN (then all-zero) column.
        # Compute the rolling mean absolute change of the input series instead.
        roll_stats['mean_abs_chg' + tag] = s.diff().abs().rolling(window=w, min_periods=1).mean()

    # add zeros when na values (std)
    roll_stats = roll_stats.fillna(value=0)

    return roll_stats

#########################################################################################################################

def calc_ewm(s, windows=[5]):
    """Exponentially weighted mean and std of ``s`` for each span.

    Args:
        s: pandas Series to smooth.
        windows: iterable of ``span`` values passed to ``Series.ewm``.

    Returns:
        DataFrame with columns ``ewm_mean_<w>`` and ``ewm_std_<w>`` per span;
        NaNs (e.g. the std of the first observation) are replaced with 0.
    """
    frame = pd.DataFrame()
    for span in windows:
        weighted = s.ewm(span=span, min_periods=1)
        frame[f'ewm_mean_{span}'] = weighted.mean()
        frame[f'ewm_std_{span}'] = weighted.std()

    # std is undefined where there is a single observation; report it as 0
    return frame.fillna(value=0)

In [None]:
# Build the EWM and rolling-window feature frames from the Close column,
# using each helper's default window list.
ewms = calc_ewm(data['Close'])
rollstats = calc_roll_stats(data['Close'])

In [None]:
# Attach both feature frames. Joining each one to `data` separately and
# rebinding `df` twice would keep only the last join and lose the EWM columns.
df = data.join(ewms).join(rollstats)

In [None]:
# Visual check of the 30-row trailing minimum feature.
df['roll_min_30'].plot(figsize=(16,6))
# df.head()

In [None]:
# Visual check of the 10% quantile feature for window 5.
df['roll_q10_5'].plot(figsize=(16,6))

In [None]:
# Parse the Date column into datetimes so min()/max() below are real dates.
df['Date'] = pd.to_datetime(df['Date'])

# US federal holidays that fall inside the date range covered by the data.
# NOTE(review): `holidays` is only referenced by a commented-out line in the
# cells visible here - confirm whether it is still needed.
cal = calendar()
holidays = cal.holidays(start=df['Date'].min(), end=df['Date'].max())

In [None]:
# Removes the Date column from `df` in place. Not re-runnable: a second run
# raises KeyError because the column is already gone.
df.drop(['Date'], axis=1, inplace=True)

In [None]:
def return_class(x):
    """Label a return value: 1 = fell, 2 = stagnant, 3 = rise.

    Anything that is neither negative nor equal to zero (including NaN,
    for which both comparisons are False) is labelled 3.
    """
    return 1 if x < 0 else (2 if x == 0 else 3)

# Label every row's `return` value with its class (1 fell / 2 stagnant / 3 rise).
df['return_class'] = df['return'].map(return_class)
df['return_class'].head()

In [5]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Currency,rsi,weekday,...,H-PC,L-PC,TR,ATR,Upper Basic,Lower Basic,Upper Band,Lower Band,SuperTrend,week_ratio_compare
0,0,2000-01-03,42.15625,42.5,40.9375,40.96875,2140400,USD,,3,...,,,1.570312,,,,,,,
1,1,2000-01-04,40.6875,41.75,40.59375,40.96875,2491200,USD,,3,...,0.779785,0.399902,1.179688,,,,,,,
2,2,2000-01-05,40.9375,40.9375,39.78125,40.59375,2386200,USD,,3,...,0.049988,1.19043,1.19043,,,,,,,
3,3,2000-01-06,40.875,41.71875,40.4375,41.125,1682700,USD,,3,...,1.129883,0.150024,1.280273,,,,,,,
4,4,2000-01-07,42.0625,42.40625,41.375,42.0625,1500800,USD,,3,...,1.280273,0.25,1.280273,,,,,,,


In [6]:
# Write the first 85% of rows to train.csv and the remaining rows to test.csv.
save_data(df, 0.85)

### Feature Engineering

1. shift price features

In [None]:
# colscorr = ['Open', 'High', 'Low', 'Close', 'Volume', 'rsi', 'weekday', 'middle_bb',
#         'lower_bb', 'upper_bb', 'macd', 'pos_directional_indicator', 'neg_directional_indicator',
#         'adx', 'cci', 'aroon_down', 'aroon_up', 'aroon_indicator', 'day', 'month', 'year',
#         'quarter', 'is_holiday', 'return']

# Correlate numeric/boolean columns only: the frame also holds string columns
# (e.g. Currency), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them.
cor = df.select_dtypes(include=['number', 'bool']).corr()
# sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# plt.show()

In [None]:
# Absolute correlation of every feature with the output variable. This
# assignment was commented out, which left `cor_target` undefined below.
cor_target = cor["return_class"].abs()

# Selecting highly correlated features (|correlation| above 0.1)
relevant_features = cor_target[cor_target>0.1]
relevant_features

**Return class of the following day's price: 1 = fell, 2 = stagnant, 3 = rise**

In [None]:
# True where Close is higher than on the previous row.
data['Close'].diff().gt(0)

In [None]:
def get_feature_trend(ft):
    """Classify the row-to-row change of column ``ft`` of the global ``data``.

    Args:
        ft: column name in ``data``.

    Returns:
        list with one label per row: 1 when the value fell versus the
        previous row, 2 when it is unchanged, 3 otherwise. The first row has
        no previous value (NaN difference) and therefore gets 3.
    """
    deltas = data[ft].diff(periods=1)
    return [1 if step < 0 else 2 if step == 0 else 3 for step in list(deltas)]

In [None]:
# get_feature_trend reads the global `data` (not `df`) and returns a plain
# list, so the assignment is positional and the two frames must have the
# same number of rows.
df['Close_trend'] = get_feature_trend('Close')
df['Close_trend']

In [8]:
# NOTE: numpy and pandas are already imported in the notebook's first cell;
# repeating them here is redundant on a top-to-bottom run.
import numpy as np
import pandas as pd

# Load the saved inference check file and preview its first rows.
dt = pd.read_csv('../output/v1_test_inference_perc_check.csv')
dt.head()

Unnamed: 0,SVM,LGB,close_tomorrow,close,SVM %change,LGB %change,close_tomorrow %change
0,100.529609,102.590493,101.6,102.5,-1.922333,0.088286,-0.878049
1,100.569757,101.705902,101.4,101.6,-1.014019,0.104234,-0.19685
2,100.516347,101.508096,101.7,101.4,-0.871452,0.106604,0.295858
3,100.540744,101.897125,101.3,101.7,-1.139878,0.19383,-0.393314
4,100.48096,101.4588,102.75,101.3,-0.808529,0.156762,1.431392


In [11]:
# Rows where both models predict a sizeable rise (SVM > 2 %, LGB > 1.5 %).
dt.loc[dt['SVM %change'].gt(2) & dt['LGB %change'].gt(1.5)]

Unnamed: 0,SVM,LGB,close_tomorrow,close,SVM %change,LGB %change,close_tomorrow %change


In [37]:
def calc_perc_change(arg):
    """Percentage change of the first value relative to the second.

    Args:
        arg: sequence-like whose first two items are ``(value, reference)``.
            A list, tuple or pandas Series row (as passed by
            ``DataFrame.apply(..., axis=1)``) all work.

    Returns:
        ``((value - reference) / reference) * 100``.
    """
    # Unpack by position instead of arg[0]/arg[1]: integer keys on a Series
    # with string labels rely on deprecated positional fallback in pandas.
    x, y = list(arg)[:2]
    ans = ((x - y) / y) * 100
    return ans


# Percentage change of each prediction (and of the real next close) relative
# to today's close. Same formula as calc_perc_change, applied to whole
# columns at once instead of row by row.
for col in ['SVM', 'LGB', 'close_tomorrow']:
    dt[f'{col} %change'] = ((dt[col] - dt['close']) / dt['close']) * 100

dt.head()

Unnamed: 0,SVM,LGB,close_tomorrow,close,SVM %change,LGB %change,close_tomorrow %change
0,15.812698,15.584647,15.19,15.22,3.894205,2.395843,-0.197109
1,15.672398,15.467332,15.72,15.19,3.175758,1.825753,3.489138
2,16.313847,15.585563,15.89,15.72,3.777651,-0.855199,1.081425
3,16.413078,16.005225,15.78,15.89,3.29187,0.725139,-0.692259
4,16.391458,15.97452,15.55,15.78,3.874893,1.232702,-1.457541


In [None]:
"""
My logic now is to find out where svm % change and lgb % change match the dirks' condition
"""