This notebook contains the logic for creating the feature-set file 'scaled_feature_set_AXISBANK.csv'.
In this file all the features are in min-max normalised form, while the target column (PCT_price) is in raw form.

In [1]:
import technical_indicators as ti
import pandas as pd
import numpy as np
import lightgbm as lgb
from IPython.display import set_matplotlib_formats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline

In [2]:
# Read the raw AXISBANK.NS price history (Date plus OHLC, Adj Close and Volume columns).
data = pd.read_csv(filepath_or_buffer="./AXISBANK.NS.csv")

In [3]:
# DataFrame.convert_objects is deprecated (hence the warning this cell emitted) and
# was removed in pandas 1.0, so coerce the price/volume columns explicitly instead.
# Entries that cannot be parsed as numbers become NaN; they are forward-filled below.
numeric_columns = [col for col in data.columns if col != 'Date']
data = data.copy()
data[numeric_columns] = (
    data[numeric_columns].apply(pd.to_numeric, errors='coerce').astype('float64')
)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [49]:
# Look-back windows (in rows, i.e. trading days) used for every windowed indicator below.
n1, n2, n3 = 30, 40, 50

In [50]:
# Check dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232 entries, 0 to 1231
Data columns (total 7 columns):
Date         1232 non-null object
Open         1232 non-null float64
High         1232 non-null float64
Low          1232 non-null float64
Close        1232 non-null float64
Adj Close    1232 non-null float64
Volume       1232 non-null float64
dtypes: float64(6), object(1)
memory usage: 67.4+ KB


In [51]:
# (rows, columns) of the raw data.
data.shape

(1232, 7)

In [52]:
# Forward-fill gaps: every missing value takes the last valid value above it.
data = data.ffill()

In [53]:
# Preview the first rows after forward-filling.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0
1,2013-04-05,248.089996,249.869995,243.440002,245.910004,210.853775,6484050.0
2,2013-04-08,245.300003,247.600006,239.600006,240.639999,206.335022,4864420.0
3,2013-04-09,242.070007,247.960007,239.759995,240.639999,206.335022,7490495.0
4,2013-04-10,242.960007,248.570007,240.0,247.860001,212.525757,10046765.0


In [54]:
# Parse the ISO-formatted Date strings into timestamps. `data` itself is not
# modified, so its Date column stays the original string column.
dataset_indices = pd.to_datetime(data['Date'], format='%Y-%m-%d')

In [55]:
# Same preview as before: the parsed dates live only in dataset_indices, not in `data`.
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0
1,2013-04-05,248.089996,249.869995,243.440002,245.910004,210.853775,6484050.0
2,2013-04-08,245.300003,247.600006,239.600006,240.639999,206.335022,4864420.0
3,2013-04-09,242.070007,247.960007,239.759995,240.639999,206.335022,7490495.0
4,2013-04-10,242.960007,248.570007,240.0,247.860001,212.525757,10046765.0


In [56]:
# Work on an independent copy so the indicator helpers cannot modify `data`
# through a shared reference, and preview only the first rows instead of
# rendering the whole frame.
df = data.copy()
df.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-04-04,253.199997,254.000000,248.020004,248.770004,213.306046,6417705.0
1,2013-04-05,248.089996,249.869995,243.440002,245.910004,210.853775,6484050.0
2,2013-04-08,245.300003,247.600006,239.600006,240.639999,206.335022,4864420.0
3,2013-04-09,242.070007,247.960007,239.759995,240.639999,206.335022,7490495.0
4,2013-04-10,242.960007,248.570007,240.000000,247.860001,212.525757,10046765.0
5,2013-04-11,249.309998,250.979996,246.199997,250.429993,214.729401,9714180.0
6,2013-04-12,248.000000,255.399994,246.899994,252.860001,216.812988,7911225.0
7,2013-04-15,251.199997,262.269989,250.800003,258.839996,221.940491,7689700.0
8,2013-04-16,257.299988,273.799988,256.549988,272.660004,233.790359,11987860.0
9,2013-04-17,274.359985,290.000000,273.230011,275.440002,236.174042,9186545.0


#### Appending calculated Technical indicators columns

In [57]:
# Append the moving-average columns MA_30, MA_40 and MA_50. Each call is fed the
# previous call's result so the columns accumulate on one frame.
ma_frames = [df]
for window in (n1, n2, n3):
    ma_frames.append(ti.moving_average(ma_frames[-1], window))
df11, df12, df13 = ma_frames[1:]

In [58]:
# First row only: confirms the MA_* columns exist (NaN while the window warms up).
df13.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,


In [59]:
# Append the exponential moving-average columns EMA_30, EMA_40 and EMA_50,
# chaining each call on the previous result.
ema_frames = [df13]
for window in (n1, n2, n3):
    ema_frames.append(ti.exponential_moving_average(ema_frames[-1], window))
df21, df22, df23 = ema_frames[1:]

In [60]:
# First row only: confirms the EMA_* columns were appended.
df23.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,EMA_40,EMA_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,,,


In [61]:
# Append the MACD_12_26, MACDsign_12_26 and MACDdiff_12_26 columns.
# NOTE(review): 12 and 26 are presumably the fast and slow EMA spans — confirm
# against technical_indicators.macd.
df3 = ti.macd(df23,12,26)

In [62]:
# Preview: the three MACD columns are present (NaN in the first rows).
df3.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,EMA_40,EMA_50,MACD_12_26,MACDsign_12_26,MACDdiff_12_26
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,,,,,,
1,2013-04-05,248.089996,249.869995,243.440002,245.910004,210.853775,6484050.0,,,,,,,,,
2,2013-04-08,245.300003,247.600006,239.600006,240.639999,206.335022,4864420.0,,,,,,,,,
3,2013-04-09,242.070007,247.960007,239.759995,240.639999,206.335022,7490495.0,,,,,,,,,
4,2013-04-10,242.960007,248.570007,240.0,247.860001,212.525757,10046765.0,,,,,,,,,


In [63]:
# Append the RSI_30, RSI_40 and RSI_50 columns, chaining each call on the
# previous result.
rsi_frames = [df3]
for window in (n1, n2, n3):
    rsi_frames.append(ti.relative_strength_index(rsi_frames[-1], window))
df41, df42, df43 = rsi_frames[1:]

In [64]:
# First row only: confirms the RSI_* columns were appended.
df43.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,EMA_40,EMA_50,MACD_12_26,MACDsign_12_26,MACDdiff_12_26,RSI_30,RSI_40,RSI_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,,,,,,,,,


In [65]:
# Append a BollingerB_<n> / Bollinger%b_<n> column pair for each of the three
# windows, chaining each call on the previous result.
bollinger_frames = [df43]
for window in (n1, n2, n3):
    bollinger_frames.append(ti.bollinger_bands(bollinger_frames[-1], window))
df51, df52, df53 = bollinger_frames[1:]

In [66]:
# First row only: confirms the Bollinger columns were appended.
df53.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,MACDdiff_12_26,RSI_30,RSI_40,RSI_50,BollingerB_30,Bollinger%b_30,BollingerB_40,Bollinger%b_40,BollingerB_50,Bollinger%b_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,


In [67]:
# Append the SO%d_30, SO%d_40 and SO%d_50 columns, chaining each call on the
# previous result.
stochastic_d_frames = [df53]
for window in (n1, n2, n3):
    stochastic_d_frames.append(ti.stochastic_oscillator_d(stochastic_d_frames[-1], window))
df61, df62, df63 = stochastic_d_frames[1:]

In [68]:
# First row only: confirms the SO%d_* columns were appended.
df63.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,RSI_50,BollingerB_30,Bollinger%b_30,BollingerB_40,Bollinger%b_40,BollingerB_50,Bollinger%b_50,SO%d_30,SO%d_40,SO%d_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,


In [69]:
# Append the window-less SO%k column.
# NOTE(review): the helper is spelled `stochastic_tor_k` here — confirm that is the
# real function name in technical_indicators and not a typo for stochastic_oscillator_k.
df7 = ti.stochastic_tor_k(df63)

In [70]:
# Append the Momentum_30, Momentum_40 and Momentum_50 columns, chaining each call
# on the previous result.
momentum_frames = [df7]
for window in (n1, n2, n3):
    momentum_frames.append(ti.momentum(momentum_frames[-1], window))
df81, df82, df83 = momentum_frames[1:]

In [71]:
# First row only: SO%k already has a value in row 0, the Momentum_* columns are still NaN.
df83.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,Bollinger%b_40,BollingerB_50,Bollinger%b_50,SO%d_30,SO%d_40,SO%d_50,SO%k,Momentum_30,Momentum_40,Momentum_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,0.125418,,,


In [72]:
# Append the CCI_30, CCI_40 and CCI_50 columns, chaining each call on the
# previous result.
cci_frames = [df83]
for window in (n1, n2, n3):
    cci_frames.append(ti.commodity_channel_index(cci_frames[-1], window))
df91, df92, df93 = cci_frames[1:]

In [73]:
# First row only: confirms the CCI_* columns were appended.
df93.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,SO%d_30,SO%d_40,SO%d_50,SO%k,Momentum_30,Momentum_40,Momentum_50,CCI_30,CCI_40,CCI_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,0.125418,,,,,,


In [74]:
# Append the single, window-less Chaikin column.
df10 = ti.chaikin_oscillator(df93)

In [75]:
# First row only: confirms the Chaikin column was appended.
df10.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,SO%d_40,SO%d_50,SO%k,Momentum_30,Momentum_40,Momentum_50,CCI_30,CCI_40,CCI_50,Chaikin
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,0.125418,,,,,,,


In [76]:
# Append the Donchian_30, Donchian_40 and Donchian_50 columns, chaining each call
# on the previous result.
donchian_frames = [df10]
for window in (n1, n2, n3):
    donchian_frames.append(ti.donchian_channel(donchian_frames[-1], window))
df101, df102, df103 = donchian_frames[1:]

In [77]:
# Preview: the Donchian_* columns are present (NaN in the first rows).
df103.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,Momentum_30,Momentum_40,Momentum_50,CCI_30,CCI_40,CCI_50,Chaikin,Donchian_30,Donchian_40,Donchian_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,
1,2013-04-05,248.089996,249.869995,243.440002,245.910004,210.853775,6484050.0,,,,...,,,,,,,,,,
2,2013-04-08,245.300003,247.600006,239.600006,240.639999,206.335022,4864420.0,,,,...,,,,,,,,,,
3,2013-04-09,242.070007,247.960007,239.759995,240.639999,206.335022,7490495.0,,,,...,,,,,,,,,,
4,2013-04-10,242.960007,248.570007,240.0,247.860001,212.525757,10046765.0,,,,...,,,,,,,,,,


In [78]:
# Append the ROC_30, ROC_40 and ROC_50 columns, chaining each call on the
# previous result.
roc_frames = [df103]
for window in (n1, n2, n3):
    roc_frames.append(ti.rate_of_change(roc_frames[-1], window))
df111, df112, df113 = roc_frames[1:]

In [79]:
# First row only: confirms the ROC_* columns were appended.
df113.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,CCI_30,CCI_40,CCI_50,Chaikin,Donchian_30,Donchian_40,Donchian_50,ROC_30,ROC_40,ROC_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,


In [80]:
# Append the Ultimate_Osc column.
# NOTE(review): this re-binds df113 to its own transformed value, so running the
# cell a second time feeds an already-augmented frame back into the helper.
df113 = ti.ultimate_oscillator(df113)

In [81]:
# Append the WilliamsR_30, WilliamsR_40 and WilliamsR_50 columns, chaining each
# call on the previous result.
williams_frames = [df113]
for window in (n1, n2, n3):
    williams_frames.append(ti.williamsR(williams_frames[-1], window))
df121, df122, df123 = williams_frames[1:]

In [82]:
# First row only: confirms the Ultimate_Osc and WilliamsR_* columns were appended.
df123.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,Donchian_30,Donchian_40,Donchian_50,ROC_30,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,


In [83]:
# Append the AO_30, AO_40 and AO_50 columns, chaining each call on the previous
# result. df133 is the frame holding every indicator.
aroon_frames = [df123]
for window in (n1, n2, n3):
    aroon_frames.append(ti.aroon_oscillator(aroon_frames[-1], window))
df131, df132, df133 = aroon_frames[1:]

In [84]:
# First row only: confirms the AO_* columns were appended.
df133.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,ROC_30,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,


In [85]:
# (rows, columns) after all indicator columns have been added.
df133.shape

(1232, 49)

In [86]:
# Rows 48-52: the point where the 50-row windows start producing values.
df133[48:].head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,ROC_30,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50
48,2013-06-14,256.839996,262.079987,256.359985,259.600006,222.592148,9145290.0,287.645667,288.94025,,...,-0.110807,-0.057508,,2.781913,-86.119645,-86.119645,,-60.0,-45.0,
49,2013-06-17,260.420013,264.209991,253.809998,263.209991,225.687515,9148855.0,286.687666,288.6345,281.5886,...,-0.124938,-0.079299,0.058046,2.889545,-79.948732,-79.948732,-66.453562,-60.0,-45.0,
50,2013-06-18,262.350006,262.940002,256.799988,257.320007,220.637161,7215030.0,285.238666,287.9205,281.7596,...,-0.136336,-0.107241,0.046399,2.854352,-90.017078,-90.017078,-74.822393,-60.0,-45.0,54.0
51,2013-06-19,257.390015,257.899994,253.399994,257.01001,220.371384,7633795.0,283.874333,287.14,281.9816,...,-0.116288,-0.110569,0.068027,3.158435,-90.546987,-90.546987,-75.262855,-60.0,-45.0,54.0
52,2013-06-20,252.800003,253.199997,246.399994,247.960007,212.611511,9339930.0,282.445334,286.115,282.128,...,-0.156541,-0.175336,0.030419,2.989314,-97.546378,-97.546378,-88.322401,-76.666667,-57.5,54.0


#### Computing percentage change/ percentage return price for target column

In [87]:
# Source of the target: the 30-row percentage change of Close, i.e.
# Close[t] / Close[t-30] - 1. On its own this looks backwards in time; it is turned
# into a forward-looking return further down, where its first 30 values are dropped
# before it is joined back onto the features.
df133 = df133.assign(PCT_price=df133['Close'].pct_change(periods=30))

In [88]:
# PCT_price is NaN in the first rows because there is no Close value 30 rows earlier.
df133.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
0,2013-04-04,253.199997,254.0,248.020004,248.770004,213.306046,6417705.0,,,,...,,,,,,,,,,
1,2013-04-05,248.089996,249.869995,243.440002,245.910004,210.853775,6484050.0,,,,...,,,,,,,,,,
2,2013-04-08,245.300003,247.600006,239.600006,240.639999,206.335022,4864420.0,,,,...,,,,,,,,,,


In [89]:
# Skip the first 49 rows, where the longest (50-row) windows are still warming up.
# NOTE(review): row 49 is kept although AO_50 is still NaN there (it is mean-filled
# further down); slicing from 50 would remove it but shifts every later row count.
df_raw_features = df133[49:]                             # avoid NaN rows

Stock prices grow at a compounded rate and are therefore commonly modelled with a lognormal distribution.
The lognormal distribution is a useful model for stock prices as long as the logarithm of the growth factor is assumed to be normally distributed.



In [90]:
# Keep a handle on the column labels (not referenced again in the cells shown here).
column_names = df_raw_features.columns

In [91]:
# First rows of the trimmed frame; the original row labels (49, 50, ...) are kept.
df_raw_features.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
49,2013-06-17,260.420013,264.209991,253.809998,263.209991,225.687515,9148855.0,286.687666,288.6345,281.5886,...,-0.079299,0.058046,2.889545,-79.948732,-79.948732,-66.453562,-60.0,-45.0,,-0.098442
50,2013-06-18,262.350006,262.940002,256.799988,257.320007,220.637161,7215030.0,285.238666,287.9205,281.7596,...,-0.107241,0.046399,2.854352,-90.017078,-90.017078,-74.822393,-60.0,-45.0,54.0,-0.144519
51,2013-06-19,257.390015,257.899994,253.399994,257.01001,220.371384,7633795.0,283.874333,287.14,281.9816,...,-0.110569,0.068027,3.158435,-90.546987,-90.546987,-75.262855,-60.0,-45.0,54.0,-0.137377
52,2013-06-20,252.800003,253.199997,246.399994,247.960007,212.611511,9339930.0,282.445334,286.115,282.128,...,-0.175336,0.030419,2.989314,-97.546378,-97.546378,-88.322401,-76.666667,-57.5,54.0,-0.147406
53,2013-06-21,245.809998,253.889999,244.789993,251.940002,216.024139,11567800.0,281.044,284.8965,282.354,...,-0.152488,0.016461,3.481562,-89.032049,-89.032049,-82.937982,-80.0,-60.0,52.0,-0.143003


In [92]:
# Last rows of the trimmed frame.
df_raw_features.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
1227,2018-03-27,512.299988,522.549988,509.049988,512.200012,512.200012,6423140.0,527.780002,539.9825,549.881999,...,-0.154157,-0.077283,3.200959,-76.648348,-86.475737,-87.160117,-93.333333,-95.0,-82.0,-0.081338
1228,2018-03-28,512.900024,512.900024,501.600006,510.5,510.5,10980960.0,525.990001,537.606251,548.989999,...,-0.140211,-0.089287,3.411686,-78.343955,-86.672483,-88.444115,-93.333333,-95.0,-82.0,-0.095179
1229,2018-04-02,510.5,510.5,492.200012,499.049988,499.049988,7120498.0,524.495,535.23875,547.759999,...,-0.159282,-0.113903,3.217038,-89.057542,-94.185079,-94.940932,-100.0,-95.0,-88.0,-0.08246
1230,2018-04-03,494.0,506.0,493.700012,501.549988,501.549988,6053789.0,523.089999,532.937501,546.526999,...,-0.153931,-0.142943,3.260615,-84.967887,-92.062838,-93.094551,-96.666667,-95.0,-88.0,-0.077524
1231,2018-04-04,504.5,504.5,490.25,492.25,492.25,6047813.0,521.57,530.423751,544.667999,...,-0.128607,-0.158259,3.085564,-96.882308,-97.949769,-98.543866,-80.0,-100.0,-92.0,-0.084782


Calculating technical indicator features from lagged values results in zero values in the initial rows, up to the specified lag period. Infinite and zero values are therefore replaced with NaN.

In [93]:
# Keep only the numeric columns (Date removed) and mark +/-inf and exact zeros as
# missing so that the next step can impute them.
# NOTE(review): this also blanks legitimate zeros (an oscillator sitting at exactly
# 0, a 0% PCT_price) — confirm that is intended.
numeric_features = df_raw_features.drop('Date', axis=1)
df_to_fillna = numeric_features.replace([np.inf, -np.inf, 0], np.nan)

In [94]:
# First row before imputation (AO_50 is still NaN here).
df_to_fillna.head(1)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
49,260.420013,264.209991,253.809998,263.209991,225.687515,9148855.0,286.687666,288.6345,281.5886,279.149545,...,-0.079299,0.058046,2.889545,-79.948732,-79.948732,-66.453562,-60.0,-45.0,,-0.098442


Filling NaN values with the column average value.

In [95]:
# Impute every remaining NaN with the mean of its own column. The means are taken
# over all rows of the frame, and PCT_price is imputed the same way.
column_means = df_to_fillna.mean()
df_to_normalize = df_to_fillna.fillna(column_means)

In [96]:
# AO_50 in the first row now holds the column mean instead of NaN.
df_to_normalize.head(2)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
49,260.420013,264.209991,253.809998,263.209991,225.687515,9148855.0,286.687666,288.6345,281.5886,279.149545,...,-0.079299,0.058046,2.889545,-79.948732,-79.948732,-66.453562,-60.0,-45.0,15.79357,-0.098442
50,262.350006,262.940002,256.799988,257.320007,220.637161,7215030.0,285.238666,287.9205,281.7596,277.692626,...,-0.107241,0.046399,2.854352,-90.017078,-90.017078,-74.822393,-60.0,-45.0,54.0,-0.144519


In [97]:
# Summary statistics after imputation; every column should report the full row count.
df_to_normalize.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
count,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,...,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0,1183.0
mean,449.128183,454.966578,442.921732,448.808014,438.766974,8992257.0,445.933125,444.926688,443.806229,445.832184,...,0.028487,0.037989,3.458178,-45.009572,-44.287175,-43.946529,10.814314,13.901099,15.79357,0.021391
std,115.296031,116.183285,114.007365,115.007147,119.532178,7965878.0,114.770176,114.581246,114.346329,113.954563,...,0.1308,0.145678,0.613085,30.095548,30.781354,30.991041,62.324195,63.383578,63.169297,0.111731
min,156.559998,163.399994,152.679993,156.570007,144.533203,1238680.0,195.272666,200.37075,204.0462,198.085098,...,-0.374995,-0.370218,1.534229,-99.021402,-99.17609,-99.17609,-100.0,-100.0,-100.0,-0.359806
25%,389.300003,395.425003,383.845001,390.074997,379.239059,4973424.0,390.293503,387.906002,385.049601,388.573434,...,-0.057982,-0.055036,3.00288,-71.716268,-71.459744,-72.150772,-43.333333,-47.5,-52.0,-0.04925
50%,489.700012,495.700012,483.5,488.5,481.719971,7003729.0,492.994999,491.899998,489.478002,492.156369,...,0.028776,0.030649,3.464519,-41.827662,-39.652874,-39.62963,26.666667,37.5,34.0,0.021837
75%,534.0,539.400024,525.524994,533.275024,528.763733,10407170.0,533.168335,531.754375,529.986502,533.150345,...,0.107033,0.135459,3.903337,-17.399931,-15.263862,-14.648627,66.666667,67.5,68.0,0.086853
max,650.900024,654.900024,628.700012,647.549988,630.240906,120541900.0,593.855001,583.685001,577.908999,590.544317,...,0.561857,0.456026,5.007196,-0.806018,-0.676061,-0.65678,100.0,100.0,100.0,0.476911


Scaling the features to bring their values into the range [0, 1].

In [98]:
# Drop the first 30 target values so that, once renumbered from 0, position i
# holds the return observed 30 rows after feature row i.
target_price = df_to_normalize['PCT_price'].iloc[30:]

In [99]:
# Input features only: everything except the target column.
training_set = df_to_normalize.drop(columns=['PCT_price'])

In [100]:
# Renumber from 0. The old row labels are kept as an extra 'index' column, which is
# dropped after the concat further down.
# NOTE(review): re-running this cell on its own resets an already reset frame.
target_price = target_price.reset_index()

In [101]:
# Confirm the target is now numbered from 0 and see how many labelled rows there are.
target_price.index

RangeIndex(start=0, stop=1153, step=1)

In [102]:
# Renumber the features from 0 as well (old labels move to an 'index' column) so
# they line up positionally with the shifted target.
training_set = training_set.reset_index()

In [103]:
# Join features and target side by side on the fresh 0-based index. The target is
# 30 rows shorter, so PCT_price is NaN for the last 30 feature rows.
frames_to_join = [training_set, target_price]
final_raw_features = pd.concat(frames_to_join, axis=1)

In [104]:
# Discard the leftover 'index' columns (one came from each side of the concat).
final_raw_features = final_raw_features.drop(columns=['index'])

In [105]:
# Preview the first rows only; rendering the whole frame bloats the notebook.
final_raw_features.head(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
0,260.420013,264.209991,253.809998,263.209991,225.687515,9148855.0,286.687666,288.634500,281.588600,279.149545,...,-0.079299,0.058046,2.889545,-79.948732,-79.948732,-66.453562,-60.000000,-45.0,15.79357,-0.165495
1,262.350006,262.940002,256.799988,257.320007,220.637161,7215030.0,285.238666,287.920500,281.759600,277.692626,...,-0.107241,0.046399,2.854352,-90.017078,-90.017078,-74.822393,-60.000000,-45.0,54.00000,-0.150474
2,257.390015,257.899994,253.399994,257.010010,220.371384,7633795.0,283.874333,287.140000,281.981600,276.315318,...,-0.110569,0.068027,3.158435,-90.546987,-90.546987,-75.262855,-60.000000,-45.0,54.00000,-0.194934
3,252.800003,253.199997,246.399994,247.960007,212.611511,9339930.0,282.445334,286.115000,282.128000,274.430979,...,-0.175336,0.030419,2.989314,-97.546378,-97.546378,-88.322401,-76.666667,-57.5,54.00000,-0.124214
4,245.809998,253.889999,244.789993,251.940002,216.024139,11567800.0,281.044000,284.896500,282.354000,272.939243,...,-0.152488,0.016461,3.481562,-89.032049,-89.032049,-82.937982,-80.000000,-60.0,52.00000,-0.124236
5,249.860001,251.759995,246.869995,248.610001,213.168839,8864770.0,279.651000,283.680001,282.369000,271.328497,...,-0.157483,-0.007267,3.515964,-94.140195,-94.140195,-94.140195,-80.000000,-60.0,50.00000,-0.095491
6,250.199997,250.800003,241.580002,246.289993,211.179581,17111600.0,278.077334,282.460251,282.286200,269.673591,...,-0.175018,-0.025983,3.400959,-93.114049,-93.114049,-93.114049,-86.666667,-65.0,-52.00000,-0.114986
7,247.220001,252.839996,247.220001,249.679993,214.086288,13302520.0,276.342000,281.238750,282.222600,268.354207,...,-0.176517,-0.035389,3.010592,-88.157909,-88.157909,-88.157909,-86.666667,-65.0,-52.00000,-0.143904
8,252.800003,257.000000,252.740005,255.009995,218.656479,11566105.0,274.636334,280.034000,282.146000,267.474915,...,-0.135442,-0.064733,3.448100,-80.365510,-80.365510,-80.365510,-86.666667,-65.0,-52.00000,-0.148582
9,258.399994,266.399994,257.500000,265.040009,227.256653,7250955.0,273.236001,279.286000,281.993600,267.314692,...,-0.092173,-0.037758,3.790214,-65.701749,-65.701749,-65.701749,-86.666667,-65.0,-52.00000,-0.192235


In [106]:
# Column order of the raw feature set; PCT_price is the last column.
final_raw_features.columns

Index([u'Open', u'High', u'Low', u'Close', u'Adj Close', u'Volume', u'MA_30',
       u'MA_40', u'MA_50', u'EMA_30', u'EMA_40', u'EMA_50', u'MACD_12_26',
       u'MACDsign_12_26', u'MACDdiff_12_26', u'RSI_30', u'RSI_40', u'RSI_50',
       u'BollingerB_30', u'Bollinger%b_30', u'BollingerB_40',
       u'Bollinger%b_40', u'BollingerB_50', u'Bollinger%b_50', u'SO%d_30',
       u'SO%d_40', u'SO%d_50', u'SO%k', u'Momentum_30', u'Momentum_40',
       u'Momentum_50', u'CCI_30', u'CCI_40', u'CCI_50', u'Chaikin',
       u'Donchian_30', u'Donchian_40', u'Donchian_50', u'ROC_30', u'ROC_40',
       u'ROC_50', u'Ultimate_Osc', u'WilliamsR_30', u'WilliamsR_40',
       u'WilliamsR_50', u'AO_30', u'AO_40', u'AO_50', u'PCT_price'],
      dtype='object')

In [107]:
# Scaler for the input features; (0, 1) is MinMaxScaler's default output range.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [108]:
# Scale every input column to [0, 1]; the raw target column is left out.
# The scaler is fitted on all rows, including the last 30 that have no target.
raw_inputs = final_raw_features.drop('PCT_price', axis=1)
scaled_inp_features = min_max_scaler.fit_transform(raw_inputs)

In [109]:
# Wrap the scaled array in a frame and attach the unscaled target as the last column.
scaled_frame = pd.DataFrame(scaled_inp_features)
combined_df = scaled_frame.join(final_raw_features['PCT_price'])

In [110]:
# Restore the column labels lost when the scaler returned a bare array. This relies
# on PCT_price being the last column in both frames.
combined_df.columns = final_raw_features.columns

In [111]:
# Put the Date column back in front of the scaled features, matching rows by
# their 0-based position.
date_column = df_raw_features['Date'].reset_index(drop=True)
combined_df1 = date_column.to_frame().join(combined_df)

In [112]:
# Preview the final frame: Date, scaled features, raw PCT_price.
combined_df1.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
0,2013-06-17,0.210098,0.205107,0.212449,0.217198,0.167085,0.066303,0.22935,0.230265,0.207409,...,0.315627,0.518326,0.390247,0.194192,0.195202,0.332143,0.2,0.275,0.578968,-0.165495
1,2013-06-18,0.214003,0.202523,0.21873,0.205202,0.156687,0.050094,0.225715,0.228402,0.207866,...,0.285802,0.50423,0.380114,0.091679,0.092985,0.247197,0.2,0.275,0.77,-0.150474
2,2013-06-19,0.203969,0.192269,0.211588,0.20457,0.15614,0.053604,0.222292,0.226366,0.20846,...,0.28225,0.530406,0.467671,0.086284,0.087605,0.242726,0.2,0.275,0.77,-0.194934
3,2013-06-20,0.194684,0.182706,0.196882,0.186138,0.140163,0.067905,0.218707,0.223692,0.208851,...,0.213117,0.48489,0.418975,0.015018,0.016545,0.110168,0.116667,0.2125,0.77,-0.124214
4,2013-06-21,0.180544,0.18411,0.1935,0.194244,0.147189,0.086579,0.215191,0.220513,0.209456,...,0.237505,0.467996,0.560711,0.101709,0.102985,0.164822,0.1,0.2,0.76,-0.124236


In [113]:
# Persist the feature set without writing the positional row index.
combined_df1.to_csv(path_or_buf='scaled_feature_set_AXISBANK.csv', index=False)

In [117]:
# Reload the saved feature set with Date as a DatetimeIndex. parse_dates=True only
# parses the index, so without index_col it did nothing in the original call and
# the dates had to be converted and the column dropped by hand.
feature_set = pd.read_csv("scaled_feature_set_AXISBANK.csv", index_col='Date', parse_dates=True)

In [118]:
# (rows, columns) of the reloaded feature set; Date is now the index, not a column.
feature_set.shape

(1183, 49)

In [119]:
# Rows from position 1152 on: the last labelled row, then rows whose PCT_price is NaN.
feature_set[1152:].head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,...,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50,PCT_price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-16,0.789922,0.783723,0.80316,0.776569,0.809781,0.022395,0.950671,0.963561,0.963166,0.939784,...,0.36822,0.464842,0.330482,0.058796,0.060196,0.081805,0.25,0.3125,0.82,-0.084782
2018-02-19,0.771311,0.776297,0.787404,0.782069,0.81534,0.025255,0.949053,0.962651,0.963703,0.93545,...,0.383938,0.462804,0.447403,0.122775,0.12399,0.123966,0.216667,0.2875,0.33,
2018-02-20,0.779706,0.782604,0.791395,0.764043,0.797119,0.018076,0.946414,0.961526,0.963578,0.929941,...,0.358046,0.423979,0.378158,0.032757,0.034233,0.034226,0.216667,0.2875,0.33,
2018-02-21,0.763523,0.763174,0.780892,0.772394,0.80556,0.028496,0.943935,0.960365,0.963219,0.925462,...,0.363345,0.427425,0.43984,0.102508,0.103782,0.103762,0.183333,0.2625,0.31,
2018-02-22,0.7615,0.75351,0.787404,0.76109,0.794134,0.055067,0.941003,0.958751,0.962425,0.920359,...,0.353406,0.42931,0.416433,0.047752,0.049184,0.049175,0.183333,0.2625,0.31,


In [120]:
# Inputs without the target column. Training uses the first 1153 rows (the ones
# that have a target); the test set is every row dated 2018-04-04 or later.
feature_columns = feature_set.drop('PCT_price', axis=1)
x_train = feature_columns.iloc[:1153]
x_test = feature_columns.loc['2018-04-04':]
x_test

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MA_30,MA_40,MA_50,EMA_30,...,ROC_30,ROC_40,ROC_50,Ultimate_Osc,WilliamsR_30,WilliamsR_40,WilliamsR_50,AO_30,AO_40,AO_50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-04,0.703848,0.693998,0.709151,0.683694,0.715897,0.04031,0.818645,0.861051,0.911088,0.826419,...,0.326902,0.262996,0.256533,0.446689,0.02178,0.01245,0.006417,0.1,0.0,0.04


In [121]:
# A second scaler, this time for the target. It re-uses (and therefore replaces)
# the name of the feature scaler created earlier.
min_max_scaler = MinMaxScaler()
train_target = feature_set[['PCT_price']].iloc[:1153]
min_max_scaler.fit(train_target)                  # fitted on the training rows' target only

MinMaxScaler(copy=True, feature_range=(0, 1))

In [122]:
# Scale the training target with the fitted scaler and flatten the resulting
# single-column array into a 1-D Series.
y = min_max_scaler.transform(feature_set[['PCT_price']].iloc[:1153])
y_train = pd.Series(y.ravel())
y_train.head()

0    0.232230
1    0.250182
2    0.197046
3    0.281567
4    0.281541
dtype: float64

Using LightGBM and treating this as a regression problem.

In [123]:
# Bundle the training inputs and the scaled target into LightGBM's dataset format.
Train_data = lgb.Dataset(data=x_train, label=y_train)

In [124]:
# LightGBM settings for a gradient-boosted regression on the min-max scaled target.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # 'auc' is a classification metric and has no meaning for a continuous target,
    # and a set literal has no stable order; report squared error only.
    'metric': 'l2',
    'max_depth':5,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Number of boosting iterations per model.
num_round=60

#### Performing Bagging

In [125]:
PredictionsList = []

for i in np.arange(2,22):
    params['feature_fraction_seed'] = i
    %time lgbm=lgb.train(params,Train_data,num_round,verbose_eval=False)

    lgb_predictions = lgbm.predict(x_test)
    p_len = len(lgb_predictions)
    predictions = min_max_scaler.inverse_transform(lgb_predictions.reshape(len(lgb_predictions),1)).reshape(p_len,)
    

    PredictionsList.append(predictions)

Wall time: 62 ms
Wall time: 50 ms
Wall time: 32 ms
Wall time: 16 ms
Wall time: 34 ms
Wall time: 51 ms
Wall time: 67 ms
Wall time: 31 ms
Wall time: 32 ms
Wall time: 32 ms
Wall time: 31 ms
Wall time: 31 ms
Wall time: 15 ms
Wall time: 29 ms
Wall time: 16 ms
Wall time: 31 ms
Wall time: 32 ms
Wall time: 31 ms
Wall time: 31 ms
Wall time: 41 ms


In [126]:
# Average the models' predictions element-wise to obtain the ensemble forecast.
Predictions_agg = np.mean(PredictionsList, axis=0).tolist()
# The values are predicted PCT_price (a 30-row percentage change), not a closing
# price, so the series is labelled accordingly.
aggr_series = pd.Series(Predictions_agg, name='Predicted PCT_price', index=x_test.index)

In [127]:
# Show the aggregated forecast for the test date(s).
aggr_series

Date
2018-04-04   -0.004086
Name: Predicted Close, dtype: float64