In [51]:
import pandas as pd

# STEP 1: Get the dataframe

In [52]:
df = pd.read_csv('/Users/omarhamdi/code/Stanislas-Motte/MLStocks4Everyone/raw_data/AEX.csv')

In [53]:
df

Unnamed: 0,DATETIME,price
0,2009-08-18 23:00:00,121.97
1,2009-08-19 23:00:00,121.67
2,2009-08-20 23:00:00,125.77
3,2009-08-21 23:00:00,132.17
4,2009-08-24 23:00:00,135.37
...,...,...
63132,2021-03-08 17:00:00,667.00
63133,2021-03-08 17:35:00,666.60
63134,2021-03-08 18:00:00,663.70
63135,2021-03-08 19:00:00,663.65


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63137 entries, 0 to 63136
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DATETIME  63137 non-null  object 
 1   price     60928 non-null  float64
dtypes: float64(1), object(1)
memory usage: 986.6+ KB


In [55]:
df['DATETIME'] = pd.to_datetime(df['DATETIME'])

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63137 entries, 0 to 63136
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DATETIME  63137 non-null  datetime64[ns]
 1   price     60928 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 986.6 KB


In [59]:
X = pd.DataFrame(df['DATETIME'])
y = pd.DataFrame(df['price'])

In [60]:
X

Unnamed: 0,DATETIME
0,2009-08-18 23:00:00
1,2009-08-19 23:00:00
2,2009-08-20 23:00:00
3,2009-08-21 23:00:00
4,2009-08-24 23:00:00
...,...
63132,2021-03-08 17:00:00
63133,2021-03-08 17:35:00
63134,2021-03-08 18:00:00
63135,2021-03-08 19:00:00


In [61]:
y

Unnamed: 0,price
0,121.97
1,121.67
2,125.77
3,132.17
4,135.37
...,...
63132,667.00
63133,666.60
63134,663.70
63135,663.65


In [62]:
split_index = int(0.7*len(df))

In [70]:
X_train = df.iloc[:split_index]
X_test = df.iloc[split_index:]

In [71]:
y_train = X_train['price']
y_test = X_test['price']

In [72]:
X_train = X_train.drop(columns=['price'])
X_test = X_test.drop(columns=['price'])

In [42]:
date = pd.date_range(start='2009-08-18', periods=100)

In [43]:
prices = pd.Series(range(100), index=date, name='price')

# STEP 2: Create the Features

In [52]:
# Create a column with the average of the last three values of price
df['AVG_PRICE'] = df['price'].rolling(window=3).mean()

In [60]:
train = df[df['DATETIME'] < '2020-01-01']
test = df[df['DATETIME'] >= '2020-01-01']

# Compute standard deviation of price over past 20 days and add to price

for i in range(1,11):
    df[f'BB_SellSignal {i} %'] = df['price'] > (df['price'] + df['price'].rolling(window=20).std() * i/10)
    df[f'BB_BuySignal {i} %'] = df['price'] < (df['price'] - df['price'].rolling(window=20).std() * i/10)

In [61]:
df['HowMuchAbove3dAverage'] = df['price'] - df['AVG_PRICE']

In [62]:
df

Unnamed: 0,DATETIME,price,AVG_PRICE,BB_SellSignal 1 %,BB_BuySignal 1 %,BB_SellSignal 2 %,BB_BuySignal 2 %,BB_SellSignal 3 %,BB_BuySignal 3 %,BB_SellSignal 4 %,...,BB_BuySignal 6 %,BB_SellSignal 7 %,BB_BuySignal 7 %,BB_SellSignal 8 %,BB_BuySignal 8 %,BB_SellSignal 9 %,BB_BuySignal 9 %,HowMuchAbove3dAverage,BB_SellSignal 10 %,BB_BuySignal 10 %
0,2009-08-18 23:00:00,121.97,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,,False,False
1,2009-08-19 23:00:00,121.67,,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,,False,False
2,2009-08-20 23:00:00,125.77,123.136667,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,2.633333,False,False
3,2009-08-21 23:00:00,132.17,126.536667,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,5.633333,False,False
4,2009-08-24 23:00:00,135.37,131.103333,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,4.266667,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63132,2021-03-08 17:00:00,667.00,666.050000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0.950000,False,False
63133,2021-03-08 17:35:00,666.60,666.216667,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0.383333,False,False
63134,2021-03-08 18:00:00,663.70,665.766667,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,-2.066667,False,False
63135,2021-03-08 19:00:00,663.65,664.650000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,-1.000000,False,False


# Step 3: Create your target variable : what you want to predict --> Future returns %

In [None]:
#Make target the daily returns
df['target'] = ((df['Price'] / df['price'].shift(-1)) - 1)*100

In [71]:
df['price']

0        121.97
1        121.67
2        125.77
3        132.17
4        135.37
          ...  
63132    667.00
63133    666.60
63134    663.70
63135    663.65
63136    662.80
Name: price, Length: 63137, dtype: float64

In [73]:
df['price'].shift(-1)

0        121.67
1        125.77
2        132.17
3        135.37
4        137.57
          ...  
63132    666.60
63133    663.70
63134    663.65
63135    662.80
63136       NaN
Name: price, Length: 63137, dtype: float64

In [None]:
for i in range(1,30):
    df['target'] = ((df['Price'] / df['price'].shift(-i)) - 1)*100

In [None]:
# Install pandas ta with pip install pandas_ta

In [75]:
import pandas_ta as ta

In [77]:
# Create a DataFrame so 'ta' can be used.
df = pd.DataFrame()

# Help about this, 'ta', extension
#help(df.ta)

# List of all indicators
df.ta.indicators()

Pandas TA - Technical Analysis Indicators - v0.3.14b0
Total Indicators & Utilities: 205
Abbreviations:
    aberration, above, above_value, accbands, ad, adosc, adx, alma, amat, ao, aobv, apo, aroon, atr, bbands, below, below_value, bias, bop, brar, cci, cdl_pattern, cdl_z, cfo, cg, chop, cksp, cmf, cmo, coppock, cross, cross_value, cti, decay, decreasing, dema, dm, donchian, dpo, ebsw, efi, ema, entropy, eom, er, eri, fisher, fwma, ha, hilo, hl2, hlc3, hma, hwc, hwma, ichimoku, increasing, inertia, jma, kama, kc, kdj, kst, kurtosis, kvo, linreg, log_return, long_run, macd, mad, massi, mcgd, median, mfi, midpoint, midprice, mom, natr, nvi, obv, ohlc4, pdist, percent_return, pgo, ppo, psar, psl, pvi, pvo, pvol, pvr, pvt, pwma, qqe, qstick, quantile, rma, roc, rsi, rsx, rvgi, rvi, short_run, sinwma, skew, slope, sma, smi, squeeze, squeeze_pro, ssf, stc, stdev, stoch, stochrsi, supertrend, swma, t3, td_seq, tema, thermo, tos_stdevall, trima, trix, true_range, tsi, tsignals, ttm_trend, ui, 

In [78]:
# Help about an indicator such as bbands
help(ta.accbands)

Help on function accbands in module pandas_ta.volatility.accbands:

accbands(high, low, close, length=None, c=None, drift=None, mamode=None, offset=None, **kwargs)
    Acceleration Bands (ACCBANDS)
    
    Acceleration Bands created by Price Headley plots upper and lower envelope
    bands around a simple moving average.
    
    Sources:
        https://www.tradingtechnologies.com/help/x-study/technical-indicator-definitions/acceleration-bands-abands/
    
    Calculation:
        Default Inputs:
            length=10, c=4
        EMA = Exponential Moving Average
        SMA = Simple Moving Average
        HL_RATIO = c * (high - low) / (high + low)
        LOW = low * (1 - HL_RATIO)
        HIGH = high * (1 + HL_RATIO)
    
        if 'ema':
            LOWER = EMA(LOW, length)
            MID = EMA(close, length)
            UPPER = EMA(HIGH, length)
        else:
            LOWER = SMA(LOW, length)
            MID = SMA(close, length)
            UPPER = SMA(HIGH, length)
    
   