In [8]:
import pandas_datareader.data as web
import pandas as pd
import numpy as np
from talib import RSI, BBANDS,WILLR,WMA,SMA,EMA,TEMA,KAMA,CCI,CMO,MACD,PPO,ROC,APO,DX,MOM,STOCH,AROON,ADX,OBV
import matplotlib.pyplot as plt
import requests
import lxml
import datetime as dt
import os


## Retrieve stock ticker data

### Cyclic tickers

In [14]:
# Download the price history from Yahoo for every ticker listed in cyclic_tickers.txt

def get_data_from_yahoo():
    """Download price history for every cyclic ticker and cache it as CSV.

    Ticker symbols are read one per line from ``cyclic_tickers.txt``. For each
    symbol, the frame returned by ``web.DataReader(ticker, 'yahoo', start, end)``
    for 2000-01-01 through 2020-01-01 is written to
    ``data/cyclic_tickers/initial/<ticker>.csv``. Symbols whose CSV already
    exists are skipped, so re-running the cell only fetches what is missing.
    """
    out_dir = 'data/cyclic_tickers/initial'

    with open('cyclic_tickers.txt') as f:
        # Skip blank lines: an empty symbol would otherwise be requested from
        # the reader and saved as ".csv".
        tickers = [line.strip() for line in f if line.strip()]

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    start = dt.datetime(2000, 1, 1)
    end = dt.datetime(2020, 1, 1)

    for ticker in tickers:
        print(ticker)
        csv_path = '{}/{}.csv'.format(out_dir, ticker)
        if os.path.exists(csv_path):
            print('Already have {}'.format(ticker))
            continue
        df = web.DataReader(ticker, 'yahoo', start, end)
        df.to_csv(csv_path)
            
get_data_from_yahoo()

C6L.SI
Already have C6L.SI
JPM
Already have JPM
BA
Already have BA
O39.SI
Already have O39.SI
600104.SS
0883.HK
1398.HK


### Non-Cyclic tickers

In [9]:
# Download the price history from Yahoo for every ticker listed in noncyclic_tickers.txt

def get_data_from_yahoo():
    """Download price history for every non-cyclic ticker and cache it as CSV.

    Ticker symbols are read one per line from ``noncyclic_tickers.txt``. For
    each symbol, the frame returned by
    ``web.DataReader(ticker, 'yahoo', start, end)`` for 2000-01-01 through
    2020-01-01 is written to ``data/noncyclic_tickers/initial/<ticker>.csv``.
    Symbols whose CSV already exists are skipped, so re-running the cell only
    fetches what is missing.

    NOTE(review): this definition reuses the name of the cyclic downloader, so
    whichever cell ran last decides what ``get_data_from_yahoo()`` does.
    """
    out_dir = 'data/noncyclic_tickers/initial'

    with open('noncyclic_tickers.txt') as f:
        # Skip blank lines: an empty symbol would otherwise be requested from
        # the reader and saved as ".csv".
        tickers = [line.strip() for line in f if line.strip()]

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    start = dt.datetime(2000, 1, 1)
    end = dt.datetime(2020, 1, 1)

    for ticker in tickers:
        print(ticker)
        csv_path = '{}/{}.csv'.format(out_dir, ticker)
        if os.path.exists(csv_path):
            print('Already have {}'.format(ticker))
            continue
        df = web.DataReader(ticker, 'yahoo', start, end)
        df.to_csv(csv_path)
            
get_data_from_yahoo()

600519.SS
Already have 600519.SS
COST
Already have COST
PG
Already have PG
000858.SZ
Already have 000858.SZ
WMT
Already have WMT
INO
Already have INO
NVAX
Already have NVAX
2319.HK
Already have 2319.HK
000333.SZ
Already have 000333.SZ


## Function to create labels

In [16]:
import os
import re
from operator import itemgetter

import pandas as pd
import pickle
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.utils import compute_class_weight
from tqdm.auto import tqdm

def create_labels(df, col_name, window_size=15):
        """
        Label each row as BUY / SELL / HOLD with a sliding window.
        Label code : BUY => 1, SELL => 0, HOLD => 2

        For every window of ``window_size`` consecutive rows, the row in the
        middle of the window is labelled:
            0 (SELL) if it holds the first maximum of the window,
            1 (BUY)  if it holds the first minimum of the window,
            2 (HOLD) otherwise.
        The maximum is checked first, so a row that is both (e.g. a flat
        window of size 1) is labelled 0.

        params :
            df => Dataframe with data
            col_name => name of column which should be used to determine strategy
            window_size => number of rows in each window
        returns : numpy float array with one entry per row of df; rows that
                  are never the middle of a full window (the leading and
                  trailing edges) stay NaN
        """
        total_rows = len(df)
        labels = np.full(total_rows, np.nan)
        # Extract the column once; looking it up via df.iloc[i][col_name]
        # builds a whole row Series for every single price read.
        prices = df[col_name].to_numpy()
        print("Calculating labels")
        pbar = tqdm(total=total_rows)

        try:
            for window_end in range(total_rows):
                if window_end >= window_size - 1:
                    window_begin = window_end - (window_size - 1)
                    window_middle = (window_begin + window_end) // 2

                    # Strict comparisons keep the FIRST occurrence of the
                    # extreme and skip NaN prices (comparisons with NaN are
                    # False), so -1 survives only if nothing was comparable.
                    min_ = np.inf
                    min_index = -1
                    max_ = -np.inf
                    max_index = -1
                    for i in range(window_begin, window_end + 1):
                        price = prices[i]
                        if price < min_:
                            min_ = price
                            min_index = i
                        if price > max_:
                            max_ = price
                            max_index = i

                    if max_index == window_middle:
                        labels[window_middle] = 0
                    elif min_index == window_middle:
                        labels[window_middle] = 1
                    else:
                        labels[window_middle] = 2

                pbar.update(1)
        finally:
            # Always release the progress bar, even if a lookup above fails.
            pbar.close()

        return labels

## Generate technical indicators

### Cyclic indicators

In [17]:
def compile_data():
    """Build the per-ticker feature/label CSVs for the cyclic tickers.

    For every symbol in ``cyclic_tickers.txt`` this reads
    ``data/cyclic_tickers/initial/<ticker>.csv``, derives the features
    ``previous_1d``, ``EMA_7``, ``willr_7`` and ``OBV_7`` from values shifted
    by one row, attaches the labels from ``create_labels`` (computed on
    ``Close`` with a 15-row window), drops ``Close`` plus every row that still
    contains a NaN, and writes the result to
    ``data/cyclic_tickers/initial_indicators/<ticker>_data.csv``.
    """
    out_dir = 'data/cyclic_tickers/initial_indicators'
    # Nothing else in the notebook creates this directory, and to_csv does
    # not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    with open('cyclic_tickers.txt') as f:
        tickers = [line.strip() for line in f if line.strip()]

    for ticker in tickers:
        df = pd.read_csv('data/cyclic_tickers/initial/{}.csv'.format(ticker))
        df = df.dropna()

        # Shift by one row so every feature on a given date is computed only
        # from rows before it.
        high = df['High'].shift(1).values
        low = df['Low'].shift(1).values
        close = df['Close'].shift(1).values
        volume = df['Volume'].shift(1).values

        # columns= replaces the positional axis argument, which newer pandas
        # releases no longer accept.
        df = df.drop(columns=['High', 'Low', 'Open', 'Volume', 'Adj Close'])
        print(df.head())

        df['previous_1d'] = df['Close'].shift(1)
        df['EMA_7'] = EMA(close, timeperiod=7)
        df['willr_7'] = WILLR(high, low, close, timeperiod=7)
        # OBV is called without a period; the "_7" suffix is kept only so the
        # CSV column names stay unchanged.
        df['OBV_7'] = OBV(close, volume)

        df = df.set_index('Date')

        print(df.head())
        # info() prints its report itself and returns None, so it is not
        # wrapped in print().
        df.info()

        df['labels'] = create_labels(df, 'Close', 15)
        # Drop Close, then drop the indicator warm-up rows and the unlabeled
        # window edges, which are still NaN.
        df = df.drop(columns=['Close']).dropna()

        print(df.head())
        df.to_csv('{}/{}_data.csv'.format(out_dir, ticker))
    
compile_data()

         Date      Close
0  2000-01-03  20.000000
1  2000-01-04  20.600000
2  2000-01-05  19.200001
3  2000-01-06  18.299999
4  2000-01-07  19.299999
                Close  previous_1d  EMA_7  willr_7      OBV_7
Date                                                         
2000-01-03  20.000000          NaN    NaN      NaN        NaN
2000-01-04  20.600000    20.000000    NaN      NaN  1828000.0
2000-01-05  19.200001    20.600000    NaN      NaN  5264000.0
2000-01-06  18.299999    19.200001    NaN      NaN  2367000.0
2000-01-07  19.299999    18.299999    NaN      NaN -1617000.0
<class 'pandas.core.frame.DataFrame'>
Index: 5012 entries, 2000-01-03 to 2019-12-31
Data columns (total 5 columns):
Close          5012 non-null float64
previous_1d    5011 non-null float64
EMA_7          5005 non-null float64
willr_7        5005 non-null float64
OBV_7          5011 non-null float64
dtypes: float64(5)
memory usage: 234.9+ KB
None
Calculating labels


HBox(children=(FloatProgress(value=0.0, max=5012.0), HTML(value='')))


            previous_1d      EMA_7    willr_7      OBV_7  labels
Date                                                            
2000-01-12    18.900000  19.414286 -39.583332  2301000.0     2.0
2000-01-13    19.700001  19.485714 -22.916639  5756000.0     2.0
2000-01-14    19.600000  19.514286 -12.195121  4109000.0     2.0
2000-01-17    19.600000  19.535715 -20.833337  4109000.0     2.0
2000-01-18    19.000000  19.401786 -64.705876  2972000.0     2.0
         Date      Close
0  1999-12-31  51.791668
1  2000-01-03  48.583332
2  2000-01-04  47.250000
3  2000-01-05  46.958332
4  2000-01-06  47.625000
                Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                          
1999-12-31  51.791668          NaN    NaN      NaN         NaN
2000-01-03  48.583332    51.791668    NaN      NaN   1160400.0
2000-01-04  47.250000    48.583332    NaN      NaN -10858800.0
2000-01-05  46.958332    47.250000    NaN      NaN -22582200.0
2000-01-06  47.6

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d      EMA_7    willr_7       OBV_7  labels
Date                                                             
2000-01-11    47.666668  48.339286 -73.333313 -21079050.0     1.0
2000-01-12    46.541668  47.889881 -78.070149 -29484600.0     2.0
2000-01-13    46.833332  47.625744 -61.904798 -22212750.0     2.0
2000-01-14    47.541668  47.604725 -41.666630 -15293850.0     2.0
2000-01-18    49.250000  48.016044 -25.000000  -5562000.0     2.0
         Date    Close
0  1999-12-31  41.4375
1  2000-01-03  40.1875
2  2000-01-04  40.1250
3  2000-01-05  42.6250
4  2000-01-06  43.0625
              Close  previous_1d  EMA_7  willr_7      OBV_7
Date                                                       
1999-12-31  41.4375          NaN    NaN      NaN        NaN
2000-01-03  40.1875      41.4375    NaN      NaN  1111200.0
2000-01-04  40.1250      40.1875    NaN      NaN -1527000.0
2000-01-05  42.6250      40.1250    NaN      NaN -5119100.0
2000-01-06  43.0625      42.6250    NaN

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d      EMA_7    willr_7       OBV_7  labels
Date                                                             
2000-01-11      43.6875  42.205357 -23.170732  11039600.0     2.0
2000-01-12      42.8750  42.372768 -39.024390   8589600.0     2.0
2000-01-13      43.0625  42.545201 -35.365854  10917400.0     2.0
2000-01-14      42.3750  42.502651 -66.666667   7882300.0     2.0
2000-01-18      44.0000  42.876988 -23.333333  11717200.0     2.0
         Date     Close
0  2000-01-03  7.667275
1  2000-01-04  7.376112
2  2000-01-05  6.939369
3  2000-01-06  6.842315
4  2000-01-07  6.987896
               Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                         
2000-01-03  7.667275          NaN    NaN      NaN         NaN
2000-01-04  7.376112     7.667275    NaN      NaN   4669559.0
2000-01-05  6.939369     7.376112    NaN      NaN    366805.0
2000-01-06  6.842315     6.939369    NaN      NaN  -7344356.0
2000-01-07  6.987896 

HBox(children=(FloatProgress(value=0.0, max=5076.0), HTML(value='')))


            previous_1d     EMA_7    willr_7       OBV_7  labels
Date                                                            
2000-01-12     6.939369  7.126545 -78.947385 -14734048.0     2.0
2000-01-13     7.084950  7.116146 -58.823594  -5328986.0     2.0
2000-01-14     7.230531  7.144742 -16.666667   6182117.0     0.0
2000-01-17     7.279058  7.178321  -8.333374  14297177.0     2.0
2000-01-18     7.036423  7.142847 -63.636347   8931099.0     2.0
         Date     Close
0  2000-01-04  3.205140
1  2000-01-05  3.157302
2  2000-01-06  3.339086
3  2000-01-07  3.422005
4  2000-01-10  3.488978
               Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                         
2000-01-04  3.205140          NaN    NaN      NaN         NaN
2000-01-05  3.157302     3.205140    NaN      NaN  12127538.0
2000-01-06  3.339086     3.157302    NaN      NaN   2381429.0
2000-01-07  3.422005     3.339086    NaN      NaN  16288228.0
2000-01-10  3.488978     3.4

HBox(children=(FloatProgress(value=0.0, max=4797.0), HTML(value='')))


            previous_1d     EMA_7    willr_7       OBV_7  labels
Date                                                            
2000-01-13     3.189194  3.295804 -69.696967  68412822.0     2.0
2000-01-14     3.179627  3.266760 -84.892114  64666767.0     1.0
2000-01-17     3.147735  3.237004 -92.086330  60502205.0     2.0
2000-01-18     3.192383  3.225849 -83.823604  63818976.0     2.0
2000-01-19     3.176437  3.213496 -87.500110  59733801.0     2.0
         Date  Close
0  2004-03-17   3.37
1  2004-03-18   3.34
2  2004-03-19   3.35
3  2004-03-22   3.35
4  2004-03-23   3.34
            Close  previous_1d  EMA_7  willr_7  OBV_7
Date                                                 
2004-03-17   3.37          NaN    NaN      NaN    NaN
2004-03-18   3.34         3.37    NaN      NaN    0.0
2004-03-19   3.35         3.34    NaN      NaN    0.0
2004-03-22   3.35         3.35    NaN      NaN    0.0
2004-03-23   3.34         3.35    NaN      NaN    0.0
<class 'pandas.core.frame.DataFrame'>
In

HBox(children=(FloatProgress(value=0.0, max=3871.0), HTML(value='')))


            previous_1d     EMA_7     willr_7       OBV_7  labels
Date                                                             
2004-03-26        3.240  3.322857 -100.000000         0.0     2.0
2004-03-29        3.270  3.309643  -72.727273         0.0     2.0
2004-03-30        3.170  3.274732 -100.000000         0.0     1.0
2004-03-31        3.160  3.246049 -100.000000         0.0     2.0
2004-04-01        3.325  3.265787   -8.333267  17474000.0     2.0
         Date     Close
0  2006-10-27  3.453485
1  2006-10-31  3.414241
2  2006-11-01  3.424052
3  2006-11-02  3.433863
4  2006-11-03  3.512352
               Close  previous_1d  EMA_7  willr_7         OBV_7
Date                                                           
2006-10-27  3.453485          NaN    NaN      NaN           NaN
2006-10-31  3.414241     3.453485    NaN      NaN  1.083537e+10
2006-11-01  3.424052     3.414241    NaN      NaN  8.538212e+09
2006-11-02  3.433863     3.424052    NaN      NaN  9.541413e+09
2006-11-0

HBox(children=(FloatProgress(value=0.0, max=3228.0), HTML(value='')))


            previous_1d     EMA_7    willr_7         OBV_7  labels
Date                                                              
2006-11-08     3.541785  3.475911 -19.999878  1.260817e+10     2.0
2006-11-09     3.492730  3.480115 -44.999818  1.209017e+10     2.0
2006-11-10     3.561407  3.500438 -11.110961  1.293841e+10     2.0
2006-11-13     3.581029  3.520586 -10.526309  1.401377e+10     2.0
2006-11-14     3.698761  3.565130  -3.571388  1.523433e+10     2.0


### Non-cyclic Indicators

In [18]:
def compile_data():
    """Build the per-ticker feature/label CSVs for the non-cyclic tickers.

    For every symbol in ``noncyclic_tickers.txt`` this reads
    ``data/noncyclic_tickers/initial/<ticker>.csv``, derives the features
    ``previous_1d``, ``EMA_7``, ``willr_7`` and ``OBV_7`` from values shifted
    by one row, attaches the labels from ``create_labels`` (computed on
    ``Close`` with a 15-row window), drops ``Close`` plus every row that still
    contains a NaN, and writes the result to
    ``data/noncyclic_tickers/initial_indicators/<ticker>_data.csv``.

    NOTE(review): this definition reuses the name of the cyclic variant, so
    whichever cell ran last decides what ``compile_data()`` does.
    """
    out_dir = 'data/noncyclic_tickers/initial_indicators'
    # Nothing else in the notebook creates this directory, and to_csv does
    # not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    with open('noncyclic_tickers.txt') as f:
        tickers = [line.strip() for line in f if line.strip()]

    for ticker in tickers:
        df = pd.read_csv('data/noncyclic_tickers/initial/{}.csv'.format(ticker))
        df = df.dropna()

        # Shift by one row so every feature on a given date is computed only
        # from rows before it.
        high = df['High'].shift(1).values
        low = df['Low'].shift(1).values
        close = df['Close'].shift(1).values
        volume = df['Volume'].shift(1).values

        # columns= replaces the positional axis argument, which newer pandas
        # releases no longer accept.
        df = df.drop(columns=['High', 'Low', 'Open', 'Volume', 'Adj Close'])
        print(df.head())

        df['previous_1d'] = df['Close'].shift(1)
        df['EMA_7'] = EMA(close, timeperiod=7)
        df['willr_7'] = WILLR(high, low, close, timeperiod=7)
        # OBV is called without a period; the "_7" suffix is kept only so the
        # CSV column names stay unchanged.
        df['OBV_7'] = OBV(close, volume)

        df = df.set_index('Date')

        print(df.head())
        # info() prints its report itself and returns None, so it is not
        # wrapped in print().
        df.info()

        df['labels'] = create_labels(df, 'Close', 15)
        # Drop Close, then drop the indicator warm-up rows and the unlabeled
        # window edges, which are still NaN.
        df = df.drop(columns=['Close']).dropna()

        print(df.head())
        df.to_csv('{}/{}_data.csv'.format(out_dir, ticker))
    
compile_data()

         Date     Close
0  2001-08-27  7.074920
1  2001-08-28  7.335628
2  2001-08-29  7.240101
3  2001-08-30  7.383391
4  2001-08-31  7.365480
               Close  previous_1d  EMA_7  willr_7        OBV_7
Date                                                          
2001-08-27  7.074920          NaN    NaN      NaN          NaN
2001-08-28  7.335628     7.074920    NaN      NaN  204166311.0
2001-08-29  7.240101     7.335628    NaN      NaN  269311618.0
2001-08-30  7.383391     7.240101    NaN      NaN  242553224.0
2001-08-31  7.365480     7.383391    NaN      NaN  266678784.0
<class 'pandas.core.frame.DataFrame'>
Index: 4383 entries, 2001-08-27 to 2019-12-31
Data columns (total 5 columns):
Close          4383 non-null float64
previous_1d    4382 non-null float64
EMA_7          4376 non-null float64
willr_7        4376 non-null float64
OBV_7          4382 non-null float64
dtypes: float64(5)
memory usage: 205.5+ KB
None
Calculating labels


HBox(children=(FloatProgress(value=0.0, max=4383.0), HTML(value='')))


            previous_1d     EMA_7    willr_7        OBV_7  labels
Date                                                             
2001-09-05     7.455036  7.316579 -11.854647  262489717.0     2.0
2001-09-06     7.451055  7.350198 -18.443866  249391619.0     2.0
2001-09-07     7.303786  7.338595 -66.346048  234821217.0     1.0
2001-09-10     7.100792  7.279144 -98.765514  218966871.0     2.0
2001-09-11     7.303786  7.285305 -44.951100  228453119.0     2.0
         Date      Close
0  1999-12-31  45.625000
1  2000-01-03  44.500000
2  2000-01-04  42.062500
3  2000-01-05  42.781250
4  2000-01-06  43.640625
                Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                          
1999-12-31  45.625000          NaN    NaN      NaN         NaN
2000-01-03  44.500000     45.62500    NaN      NaN   1163000.0
2000-01-04  42.062500     44.50000    NaN      NaN  -4894400.0
2000-01-05  42.781250     42.06250    NaN      NaN -10617200.0
2000-01-0

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d      EMA_7    willr_7       OBV_7  labels
Date                                                             
2000-01-11     47.50000  44.662946  -0.549451  13949400.0     2.0
2000-01-12     45.81250  44.950335 -30.601093  10994400.0     2.0
2000-01-13     45.75000  45.150251 -31.693989   6307000.0     2.0
2000-01-14     47.46875  45.729876  -9.547739  12057800.0     2.0
2000-01-18     49.06250  46.563032 -16.964286  16628800.0     2.0
         Date     Close
0  1999-12-31  54.78125
1  2000-01-03  53.59375
2  2000-01-04  52.56250
3  2000-01-05  51.56250
4  2000-01-06  53.93750
               Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                         
1999-12-31  54.78125          NaN    NaN      NaN         NaN
2000-01-03  53.59375     54.78125    NaN      NaN    614200.0
2000-01-04  52.56250     53.59375    NaN      NaN  -3660800.0
2000-01-05  51.56250     52.56250    NaN      NaN  -7931600.0
2000-01-06  53.93750 

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d      EMA_7    willr_7       OBV_7  labels
Date                                                             
2000-01-11     57.96875  54.665179  -9.282700  -3535800.0     0.0
2000-01-12     58.87500  55.717634  -0.000000    886000.0     2.0
2000-01-13     58.50000  56.413225  -8.661417  -5345600.0     2.0
2000-01-14     57.50000  56.684919 -21.259843 -10630800.0     2.0
2000-01-18     58.50000  57.138689 -10.328638  -5726400.0     2.0
         Date     Close
0  2000-01-04  4.053544
1  2000-01-05  4.091251
2  2000-01-06  4.255548
3  2000-01-07  4.405031
4  2000-01-10  4.418498
               Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                         
2000-01-04  4.053544          NaN    NaN      NaN         NaN
2000-01-05  4.091251     4.053544    NaN      NaN   5211144.0
2000-01-06  4.255548     4.091251    NaN      NaN  16146401.0
2000-01-07  4.405031     4.255548    NaN      NaN  29138906.0
2000-01-10  4.418498 

HBox(children=(FloatProgress(value=0.0, max=4775.0), HTML(value='')))


            previous_1d     EMA_7    willr_7       OBV_7  labels
Date                                                            
2000-01-13     4.161279  4.231308 -61.904786  59618179.0     2.0
2000-01-14     4.107412  4.200334 -88.235251  53883196.0     1.0
2000-01-17     4.041424  4.160606 -99.147792  49125696.0     2.0
2000-01-18     4.128959  4.152694 -73.007591  52650011.0     2.0
2000-01-19     4.194947  4.163257 -60.411243  56720576.0     2.0
         Date    Close
0  1999-12-31  69.1250
1  2000-01-03  66.8125
2  2000-01-04  64.3125
3  2000-01-05  63.0000
4  2000-01-06  63.6875
              Close  previous_1d  EMA_7  willr_7       OBV_7
Date                                                        
1999-12-31  69.1250          NaN    NaN      NaN         NaN
2000-01-03  66.8125      69.1250    NaN      NaN   2112700.0
2000-01-04  64.3125      66.8125    NaN      NaN  -6257200.0
2000-01-05  63.0000      64.3125    NaN      NaN -13002300.0
2000-01-06  63.6875      63.0000    NaN 

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d      EMA_7    willr_7       OBV_7  labels
Date                                                             
2000-01-11      67.2500  66.098214 -37.209302 -12213900.0     2.0
2000-01-12      66.2500  66.136161 -40.366972 -17157200.0     2.0
2000-01-13      65.0625  65.867746 -57.407407 -21242200.0     2.0
2000-01-14      65.1250  65.682059 -56.481481 -16221200.0     2.0
2000-01-18      64.5000  65.386544 -71.000000 -22533400.0     2.0
         Date  Close
0  1999-12-31   52.0
1  2000-01-03   54.0
2  2000-01-04   50.0
3  2000-01-05   49.0
4  2000-01-06   48.0
            Close  previous_1d  EMA_7  willr_7   OBV_7
Date                                                  
1999-12-31   52.0          NaN    NaN      NaN     NaN
2000-01-03   54.0         52.0    NaN      NaN    50.0
2000-01-04   50.0         54.0    NaN      NaN  1306.0
2000-01-05   49.0         50.0    NaN      NaN -1794.0
2000-01-06   48.0         49.0    NaN      NaN -6013.0
<class 'pandas.core.frame.

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d      EMA_7    willr_7    OBV_7  labels
Date                                                          
2000-01-11        53.00  51.000000 -14.285714  -2588.0     0.0
2000-01-12        66.00  54.750000  -9.523810  13118.0     2.0
2000-01-13        61.25  56.375000 -32.142857   6962.0     2.0
2000-01-14        65.00  58.531250 -14.285714  13462.0     2.0
2000-01-18        62.00  59.398438 -30.000000   7837.0     2.0
         Date  Close
0  1999-12-31  112.5
1  2000-01-03  115.0
2  2000-01-04  110.0
3  2000-01-05  107.5
4  2000-01-06  112.5
            Close  previous_1d  EMA_7  willr_7   OBV_7
Date                                                  
1999-12-31  112.5          NaN    NaN      NaN     NaN
2000-01-03  115.0        112.5    NaN      NaN   115.0
2000-01-04  110.0        115.0    NaN      NaN  1240.0
2000-01-05  107.5        110.0    NaN      NaN   630.0
2000-01-06  112.5        107.5    NaN      NaN   175.0
<class 'pandas.core.frame.DataFrame'>
Index: 50

HBox(children=(FloatProgress(value=0.0, max=5032.0), HTML(value='')))


            previous_1d       EMA_7    willr_7   OBV_7  labels
Date                                                          
2000-01-11       110.00  111.071429 -50.000000 -2165.0     2.0
2000-01-12       107.50  110.178571 -62.500000 -3480.0     2.0
2000-01-13       107.50  109.508929 -42.857143 -3480.0     2.0
2000-01-14       106.25  108.694196 -41.666667 -4650.0     1.0
2000-01-18       103.75  107.458147 -50.000000 -8790.0     2.0
         Date   Close
0  2004-06-10  2.4375
1  2004-06-11  2.3250
2  2004-06-14  2.3125
3  2004-06-15  2.2750
4  2004-06-16  2.3000
             Close  previous_1d  EMA_7  willr_7        OBV_7
Date                                                        
2004-06-10  2.4375          NaN    NaN      NaN          NaN
2004-06-11  2.3250       2.4375    NaN      NaN  741780948.0
2004-06-14  2.3125       2.3250    NaN      NaN  496140948.0
2004-06-15  2.2750       2.3125    NaN      NaN  369726948.0
2004-06-16  2.3000       2.2750    NaN      NaN  312265896.0

HBox(children=(FloatProgress(value=0.0, max=3834.0), HTML(value='')))


            previous_1d     EMA_7    willr_7        OBV_7  labels
Date                                                             
2004-06-21       2.2875  2.319643 -59.259299  341437896.0     2.0
2004-06-22       2.3625  2.330357 -26.666641  380547896.0     2.0
2004-06-23       2.3625  2.338393 -15.384661  380547896.0     2.0
2004-06-24       2.5500  2.391295  -0.000000  455715896.0     2.0
2004-06-25       2.5500  2.430971 -18.750007  455715896.0     0.0
         Date      Close
0  2013-09-18  11.264000
1  2013-09-23  12.320000
2  2013-09-24  11.896000
3  2013-09-25  12.013333
4  2013-09-26  11.466666
                Close  previous_1d  EMA_7  willr_7        OBV_7
Date                                                           
2013-09-18  11.264000          NaN    NaN      NaN          NaN
2013-09-23  12.320000    11.264000    NaN      NaN  145836577.0
2013-09-24  11.896000    12.320000    NaN      NaN  265280099.0
2013-09-25  12.013333    11.896000    NaN      NaN  190263172.0
201

HBox(children=(FloatProgress(value=0.0, max=1530.0), HTML(value='')))


            previous_1d      EMA_7    willr_7        OBV_7  labels
Date                                                              
2013-10-08    11.530666  11.708190 -53.719010  228335449.0     2.0
2013-10-09    11.997333  11.780476 -33.230320  270009112.0     2.0
2013-10-10    11.914666  11.814023 -52.452039  238863795.0     2.0
2013-10-11    12.189333  11.907851 -30.490408  297460023.0     2.0
2013-10-14    12.141333  11.966221 -21.227643  267573401.0     2.0


## Normalize indicators

In [2]:
import pandas as pd
from sklearn import preprocessing

In [3]:
# Min-max normalise every feature column of each cyclic ticker; the labels
# column is passed through unscaled.
import os  # repeated here so this cell also runs on its own after a kernel restart

with open('cyclic_tickers.txt') as f:
    tickers = [line.strip() for line in f if line.strip()]

# to_csv does not create missing directories and no earlier cell creates this one.
os.makedirs('data/cyclic_tickers/normalized', exist_ok=True)

for ticker in tickers:
    df = pd.read_csv('data/cyclic_tickers/initial_indicators/{}_data.csv'.format(ticker))
    df = df.dropna()
    df = df.set_index('Date')

    # Split the target off so the scaler only ever sees feature columns.
    df_target = df[['labels']]
    df = df.drop(columns=['labels'])

    # NOTE(review): the scaler is fitted on the ticker's full history; if the
    # data is later split chronologically, the min/max come from the whole
    # range. Confirm this is intended.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(df.values)
    df = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

    df_normalized = pd.concat([df, df_target], axis=1)
    print(df_normalized.head())

    df_normalized.to_csv('data/cyclic_tickers/normalized/{}_data.csv'.format(ticker))

            previous_1d     EMA_7   willr_7     OBV_7  labels
Date                                                         
2000-01-12     0.898039  0.958972  0.604167  0.653426     2.0
2000-01-13     0.960784  0.965065  0.770834  0.672066     2.0
2000-01-14     0.952941  0.967502  0.878049  0.663181     2.0
2000-01-17     0.952941  0.969329  0.791667  0.663181     2.0
2000-01-18     0.905882  0.957906  0.352941  0.657046     2.0
            previous_1d     EMA_7   willr_7     OBV_7  labels
Date                                                         
2000-01-11     0.262500  0.263239  0.266667  0.969440     1.0
2000-01-12     0.253334  0.259496  0.219299  0.966588     2.0
2000-01-13     0.255710  0.257295  0.380952  0.969055     2.0
2000-01-14     0.261482  0.257120  0.583334  0.971402     2.0
2000-01-18     0.275401  0.260547  0.750000  0.974704     2.0
            previous_1d     EMA_7   willr_7     OBV_7  labels
Date                                                         
2000-01-

In [6]:
# Min-max normalise every feature column of each non-cyclic ticker; the labels
# column is passed through unscaled.
import os  # repeated here so this cell also runs on its own after a kernel restart

with open('noncyclic_tickers.txt') as f:
    tickers = [line.strip() for line in f if line.strip()]

# to_csv does not create missing directories and no earlier cell creates this one.
os.makedirs('data/noncyclic_tickers/normalized', exist_ok=True)

for ticker in tickers:
    df = pd.read_csv('data/noncyclic_tickers/initial_indicators/{}_data.csv'.format(ticker))
    df = df.dropna()
    df = df.set_index('Date')

    # Split the target off so the scaler only ever sees feature columns.
    df_target = df[['labels']]
    df = df.drop(columns=['labels'])

    # NOTE(review): the scaler is fitted on the ticker's full history; if the
    # data is later split chronologically, the min/max come from the whole
    # range. Confirm this is intended.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(df.values)
    df = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

    df_normalized = pd.concat([df, df_target], axis=1)
    print(df_normalized.head())

    df_normalized.to_csv('data/noncyclic_tickers/normalized/{}_data.csv'.format(ticker))

            previous_1d     EMA_7   willr_7     OBV_7  labels
Date                                                         
2001-09-05     0.001975  0.001848  0.881454  0.077492     2.0
2001-09-06     0.001972  0.001875  0.815561  0.066254     2.0
2001-09-07     0.001852  0.001866  0.336540  0.053753     1.0
2001-09-10     0.001687  0.001817  0.012345  0.040150     2.0
2001-09-11     0.001852  0.001822  0.550489  0.048289     2.0
            previous_1d     EMA_7   willr_7     OBV_7  labels
Date                                                         
2000-01-11     0.072886  0.061669  0.994505  0.291234     2.0
2000-01-12     0.066815  0.062713  0.693989  0.283493     2.0
2000-01-13     0.066590  0.063439  0.683060  0.271214     2.0
2000-01-14     0.072773  0.065544  0.904523  0.286279     2.0
2000-01-18     0.078507  0.068570  0.830357  0.298253     2.0
            previous_1d     EMA_7   willr_7     OBV_7  labels
Date                                                         
2000-01-