## GOAL OF PROJECT

To implement the following paper "Predicting the direction of stock market prices using random forest" - Luckyson Khaidem, Snehanshu Saha, Sudeepa Roy Dey

### IMPORTING THE LIBRARIES REQUIRED FOR THE TASK 

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

## The model of choice for training and prediction purposes is the Random Forest Classifier

### Importing required libraries for training and testing the model

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, confusion_matrix, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import TimeSeriesSplit

### In order to use the technical financial indicators required in the paper, we utilise a pre-defined library as follows:

## https://github.com/Crypto-toolbox/pandas-technical-indicators/blob/master/technical_indicators.py

#### Importing the above library

In [3]:
import pandas_techinal_indicators as ta 

## Data

For our purpose of implementing the paper, we are using closing stock price of CIPLA which has been obtained from 
NSE dataset

In [4]:
raw_data = pd.read_csv('CIPLA.csv')
raw_data2 = pd.read_csv('CIPLA.csv')
del(raw_data['Series'])
del(raw_data['Date'])
del(raw_data['Symbol'])
del(raw_data['N'])


## Exponential Smoothing 

As indicated by the authors of the paper, this is done for the purpose of putting more importance on recent data and exponentially decreasing weightage to past data 

In [5]:
# Function for exponentially smoothing

def exp_smoothing(df, alpha):
    es_data = df.ewm(alpha=alpha).mean()    
    return es_data

In [6]:
# For current testing purposes, value of alpha used is 0.9

sdata = exp_smoothing(raw_data, 0.9)

# Let us visualise the data

sdata.head() 

Unnamed: 0,Prev Close,Open,High,Low,Last,Close,Average,Volume,Turnover,No. of Trades,Deliverable Qty,% Dly Qt to Traded Qty
0,626.4,626.5,634.8,626.5,629.95,628.4,630.72,596787.0,376407400.0,9333.0,201185.0,33.71
1,628.218182,632.409091,637.3,629.090909,630.904545,629.990909,632.756364,680616.1,430680000.0,10921.181818,320182.272727,46.882727
2,629.958559,626.635135,637.930631,626.306306,632.251802,632.701802,633.417297,1223210.0,774840900.0,21567.864865,652569.414414,53.042432
3,632.69613,630.563906,632.592529,612.079208,615.778533,616.183528,621.1245,1576297.0,978597200.0,53017.131413,956747.259226,60.483987
4,616.184448,615.971245,619.27912,605.257853,616.067856,612.373315,612.462363,890165.3,545805300.0,36130.144271,429642.354874,47.079265


## Technical indicators used for Feature Extraction


In [7]:

def feature_extraction(data):
    for x in [5, 14, 26, 44, 66]:
        data = ta.relative_strength_index(data, n=x)
        data = ta.stochastic_oscillator_d(data, n=x)
        data = ta.accumulation_distribution(data, n=x)
        data = ta.average_true_range(data, n=x)
        data = ta.momentum(data, n=x)
        data = ta.money_flow_index(data, n=x)
        data = ta.rate_of_change(data, n=x)
        data = ta.on_balance_volume(data, n=x)
        data = ta.commodity_channel_index(data, n=x)
        data = ta.ease_of_movement(data, n=x)
        data = ta.trix(data, n=x)
        data = ta.vortex_indicator(data, n=x)
        data = ta.moving_average(data, n=x)
        data = ta.standard_deviation(data, n=x) 
        data = ta.keltner_channel(data, n=x)
        data = ta.coppock_curve(data, n=x)
        data = ta.force_index(data, n=x)
        data = ta.bollinger_bands(data, n=x)
        data = ta.exponential_moving_average(data, n=x)
    
    data = ta.ppsr(data)
    data = ta.stochastic_oscillator_k(data)
    data = ta.mass_index(data)
    data = ta.ultimate_oscillator(data)
    data['ema50'] = data['Close'] / data['Close'].ewm(50).mean()
    data['ema21'] = data['Close'] / data['Close'].ewm(21).mean()
    data['ema14'] = data['Close'] / data['Close'].ewm(14).mean()
    data['ema5'] = data['Close'] / data['Close'].ewm(5).mean()
    data = ta.chaikin_oscillator(data)    
  
    data = ta.macd(data, n_fast=12, n_slow=26)
    
    del(data['Open'])
    del(data['Prev Close'])
    del(data['High'])
    del(data['Low'])
    del(data['Volume'])
    del(data['Last'])
    del(data['Average'])
    del(data['Turnover'])
    del(data['No. of Trades'])
    del(data['Deliverable Qty'])
    del(data['% Dly Qt to Traded Qty'])
    
    return data
   
def compute_prediction_int(df, n):
    pred = (df.shift(-n)['Close'] >= df['Close'])
    pred = pred.iloc[:-n]
    return pred.astype(int)

def prepare_data(df, horizon):
    data = feature_extraction(df).dropna().iloc[:-horizon]
    data['pred'] = compute_prediction_int(data, n=horizon)
    del(data['Close'])
    return data.dropna()

## Preparation of training data and labels.

### Assume a prediction horizon of 1 day

In [8]:
data = prepare_data(sdata, 1)

## Identifying and extracting the label
y = data['pred']

## Extracting the input features and creating the input feature matrix
input_feature = [x for x in data.columns if x not in ['gain', 'pred']]
X = data[input_feature]
data

Unnamed: 0,RSI_5,SO%d_5,Acc/Dist_ROC_5,ATR_5,Momentum_5,MFI_5,ROC_5,OBV_5,CCI_5,EoM_5,...,Ultimate_Osc,ema50,ema21,ema14,ema5,Chaikin,MACD_12_26,MACDsign_12_26,MACDdiff_12_26,pred
196,0.436350,0.517157,-1.677269,15.492787,-8.340445,0.4,-0.024447,4.047872e+05,-0.821078,-0.000014,...,3.995107,1.011413,1.011880,1.012153,1.003726,-2.403691e+05,4.629808,0.653490,3.976318,1.0
197,0.436350,0.618265,-2.493487,14.278453,-9.924045,0.4,0.001633,3.940154e+05,-0.233535,-0.000023,...,3.906250,1.021371,1.021254,1.021028,1.011682,1.978582e+05,5.104750,1.543742,3.561008,1.0
198,0.720106,0.710163,5.054578,13.723961,9.897596,0.6,0.012624,1.033682e+06,1.627030,-0.000010,...,4.235651,1.033886,1.032863,1.031897,1.020570,5.134208e+05,6.119960,2.458986,3.660974,0.0
199,0.288671,0.640388,-0.991637,18.884807,-3.915240,0.6,-0.006159,-2.647630e+05,-0.410050,-0.000010,...,4.049310,1.014802,1.013455,1.012268,1.001695,6.591250e+04,5.849583,3.137105,2.712478,1.0
200,0.288671,0.620287,-1.526733,16.533421,-1.471524,0.8,0.008822,-4.144720e+04,-0.085541,0.000014,...,4.320980,1.018494,1.016716,1.015229,1.004761,-7.029353e+03,5.787382,3.667160,2.120221,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327,0.513883,0.194431,0.772932,20.773047,-21.442333,0.4,-0.027894,-3.895424e+06,-0.763298,-0.000009,...,1.516716,1.143221,1.064560,1.025469,0.983922,-1.426973e+06,18.301308,27.863590,-9.562282,1.0
1328,0.816296,0.407041,-3.395735,24.827812,10.815767,0.6,0.044830,1.657070e+06,1.554971,-0.000004,...,2.226822,1.193083,1.109484,1.069046,1.025550,5.537347e+06,18.382145,25.967301,-7.585156,0.0
1329,0.854976,0.388101,-0.613557,23.229786,23.401577,0.8,0.043549,9.576282e+05,1.337165,0.000007,...,2.287043,1.184246,1.100072,1.060453,1.018033,2.084229e+06,18.057936,24.385428,-6.327492,1.0
1330,0.922523,0.537042,-2.991880,24.839315,47.745158,0.8,0.084754,5.668300e+06,1.371174,0.000008,...,2.767795,1.224305,1.135062,1.093881,1.047228,4.466501e+06,19.425460,23.393435,-3.967974,0.0


### Scaling the data into range (-1,1) for pre-processing  

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_new = scaler.fit_transform(X)

print("This is X_new: ")
print(X_new)

This is X_new: 
[[0.43757946 0.58298818 0.17150661 ... 0.44859092 0.39947093 0.41797646]
 [0.43757946 0.7368325  0.17139336 ... 0.45453082 0.41199949 0.40560487]
 [0.72245626 0.87666243 0.17244067 ... 0.46722761 0.42487975 0.40858274]
 ...
 [0.85785864 0.38661711 0.1716542  ... 0.61653084 0.73345156 0.11103819]
 [0.92567263 0.61324432 0.1713242  ... 0.63363389 0.71949119 0.18132542]
 [0.93743254 0.53902674 0.17163659 ... 0.64291393 0.71041137 0.2226486 ]]


## HERE, instead of splitting into training and testing radnomly (as is done usually), we are using the TimeSeriesSplit method to split the data into training and testing data

### This is because, since we are using time series data, random splitting may lead to data leakage

In [10]:
tscv = TimeSeriesSplit()

for train_index, test_index in tscv.split(X_new):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_new[train_index, :], X_new[test_index,:]
    
for train_index, test_index in tscv.split(y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

## Printing sizes to verify correctness
print('len X_train', len(X_train))
print('len y_train', len(y_train))
print('len X_test', len(X_test))
print('len y_test', len(y_test))

len X_train 947
len y_train 947
len X_test 189
len y_test 189


## MODEL - Random Forests 

In [11]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=80, random_state=42)


## Training and testing the model

In [12]:
model.fit(X_train, y_train.values.ravel());

prediction = model.predict(X_test)

accuracy = accuracy_score(y_pred=prediction, y_true=y_test)

print('Accuracy: {0:1.2f}'.format(accuracy))


confusion = confusion_matrix(y_pred=prediction, y_true=y_test)
print('Confusion Matrix')
print(confusion)


precision = precision_score(y_pred=prediction, y_true=y_test)
recall = recall_score(y_pred=prediction, y_true=y_test)
f1 = f1_score(y_pred=prediction, y_true=y_test)
print('Precision: {0:1.2f}, Recall: {1:1.2f}, f1: {2:1.2f}'.format(precision, recall, f1))


Accuracy: 0.52
Confusion Matrix
[[41 61]
 [29 58]]
Precision: 0.49, Recall: 0.67, f1: 0.56
