In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.backend import sigmoid
import matplotlib.pyplot as plt
from math import sqrt
import threading
from multiprocessing.pool import ThreadPool
import time
import multiprocessing
import traceback

import tensorflow as tf

import prettytable
from prettytable import PrettyTable

import statsmodels.api as sm

from sklearn.model_selection import GridSearchCV

from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.metrics import precision_score, make_scorer
from keras.backend import sigmoid

In [3]:
# Silence every warning for the rest of the session (this also hides deprecation notices).
import warnings; warnings.simplefilter('ignore')

In [4]:
def swish(x, beta = 1):
    """
    Swish activation: the input multiplied by sigmoid(beta * input).

    Parameters
    ----------
    x : value accepted by keras.backend.sigmoid
        input of the activation.

    beta : numeric
        factor applied to x before the sigmoid (default 1).

    Returns
    -------
    the product of x and sigmoid(beta * x).
    """
    gate = sigmoid(beta * x)
    return x * gate

In [5]:
from keras.utils.generic_utils import get_custom_objects
from keras.layers import Activation
# Register swish under the string key 'swish' in Keras' custom-object registry,
# which is the name the model builders pass as activation='swish'.
get_custom_objects().update({'swish': Activation(swish)})

In [6]:
# Working directory of the kernel; later cells build the data-file locations from it.
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

In [7]:
from tune_sklearn import TuneGridSearchCV

# Pre-process the data

In [8]:
def pre_process_data(data,null_threshold):
    """
    Drops Date and Unix Date columns from the data.
    Drops the columns which have more null values than the specified null_threshold.
    Replaces infinite values with NaN.
    Converts every remaining column to numeric (unparseable entries become NaN).
    Drops the rows which have null values.

    The input dataframe is left untouched; all steps work on new frames, so the
    function can safely be re-run on the same raw data.

    Parameters
    ----------
    data : dataframe
        must contain the columns 'Unix Date' and 'Date' (KeyError otherwise).

    null_threshold : numeric
        maximum percentage (0-100) of null values a column may contain
        before it is dropped.

    Returns
    -------
    data : dataframe
        a new dataframe after performing all the operations; the original
        row index labels of the surviving rows are kept.
    """
    cleaned = data.drop(columns=['Unix Date', 'Date'])

    # The null threshold is evaluated on the raw nulls only, i.e. before
    # infinities and non-numeric entries are turned into NaN below.
    max_nulls = null_threshold * cleaned.shape[0] / 100
    null_counts = cleaned.isnull().sum()
    cleaned = cleaned.loc[:, null_counts <= max_nulls]

    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.apply(pd.to_numeric, errors='coerce')
    return cleaned.dropna(axis=0)

In [9]:
def dependent_column(data,column):
    """
    Keeps only the Growth Rate columns (name ends with "gr", case-insensitive)
    that are not Next Day columns (name does not contain "next").
    Appends the predictor column as the last column.

    The predictor column is excluded from the feature list before it is
    appended, so it appears exactly once even if its own name ends with "gr"
    and does not contain "next".

    Parameters
    ----------
    data : dataframe

    column : string
        name of the predictor column

    Returns
    -------
    data : dataframe
        the selected feature columns followed by the predictor column.
    column : string
        name of the predictor column
    """
    feature_cols = [
        col for col in data.columns
        if col != column
        and "next" not in col.lower()
        and col.lower().endswith("gr")
    ]
    data = data[feature_cols + [column]]
    return (data,column)

# Reshaping the data

In [10]:
def reshape_data_r(x_train, x_test, y_train, y_test):
    """
    Inserts a length-1 middle axis into both feature sets and turns the
    test targets into a single-column array.

    Parameters
    ----------
    x_train, x_test : 2-D array-like
        converted to arrays of shape (rows, 1, columns).

    y_train : any
        returned exactly as passed in.

    y_test : array-like
        converted to an array of shape (rows, 1).

    Returns
    -------
    (x_train, x_test, y_train, y_test) : tuple
    """
    def _with_unit_axis(samples):
        matrix = np.array(samples)
        rows, cols = matrix.shape[0], matrix.shape[1]
        return matrix.reshape((rows, 1, cols))

    train_features = _with_unit_axis(x_train)
    test_features = _with_unit_axis(x_test)

    test_targets = np.array(y_test)
    test_targets = test_targets.reshape((test_targets.shape[0], 1))

    return (train_features, test_features, y_train, test_targets)

# Feature Selection Methods

In [11]:
def forward_selection(data, target, significance_level=0.05):
    """
    Greedy forward feature selection based on OLS p-values.

    In every round each not-yet-selected column is added, one at a time, to
    the already selected columns, an OLS model with a constant is fitted, and
    the p-value of the added column is recorded. The column with the smallest
    p-value is selected if that p-value is below significance_level;
    otherwise the search stops.

    Parameters
    ----------
    data : dataframe
        candidate feature columns.

    target : array-like
        dependent variable handed to sm.OLS.

    significance_level : numeric
        p-value a column must stay below to be selected (default 0.05).

    Returns
    -------
    best_features : list
        selected column names in the order they were chosen.
    """
    candidates = data.columns.tolist()
    best_features = []
    while len(best_features) < len(candidates):
        # Keep the dataframe's column order so ties are resolved reproducibly
        # (a set difference would give an arbitrary order).
        remaining_features = [col for col in candidates if col not in best_features]
        # Explicit float dtype: without it newer pandas creates an object
        # Series, on which idxmin may not be supported.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features + [new_column]]).astype(float)
            model = sm.OLS(target, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            # Also reached when no candidate is left or every p-value is NaN.
            break
    return best_features

In [12]:
def backward_elimination(data, target,significance_level = 0.05):
    """
    Backward feature elimination based on OLS p-values.

    Starting from all columns, an OLS model with a constant is fitted and the
    column with the largest p-value is removed, as long as that p-value is at
    least significance_level.

    Parameters
    ----------
    data : dataframe
        candidate feature columns.

    target : array-like
        dependent variable handed to sm.OLS.

    significance_level : numeric
        p-value at or above which a column is eliminated (default 0.05).

    Returns
    -------
    list
        names of the columns that were kept.
    """
    kept = data.columns.tolist()
    while kept:
        design = sm.add_constant(data[kept]).astype(float)
        fitted = sm.OLS(target, design).fit()
        # Skip the first p-value (presumably the constant term prepended by add_constant).
        feature_pvalues = fitted.pvalues[1:]
        worst = feature_pvalues.max()
        if not (worst >= significance_level):
            break
        kept.remove(feature_pvalues.idxmax())
    return kept

# RNN Methods

In [13]:
def recurrent_neural_networks(X,Y):
    """
    Trains the LSTM regressor on a chronological 70/30 split and reports its
    error metrics and directional hit rate.

    Steps: split_dataset (first 70% for training), reshape_data (windows of
    60 rows), best_parameters_rnn (grid search for epochs and batch size on
    the training windows), fit of build_lstm through KerasRegressor, and
    prediction on the test windows.

    Parameters
    ----------
    X : dataframe
        feature columns.

    Y : series
        target values aligned with X.

    Returns
    -------
    myres : dict
        the entries of error_metrics followed by "epochs", "batch_size",
        "layers" (number of LSTM layers in the fitted network) and
        "Percentage" (share of test samples where prediction * actual >= 0).
    """
    x_train, x_test, y_train, y_test = split_dataset(X,Y,0.70)
    x_train, x_test, y_train, y_test = reshape_data(x_train, x_test, y_train, y_test,units = 60)

    epochs, batch_size = best_parameters_rnn(x_train, y_train)

    input_dim = (x_train.shape[1], x_train.shape[2])
    print(input_dim, "ind")

    model = KerasRegressor(build_fn = build_lstm, input_shape=input_dim,verbose=2)
    # shuffle=False keeps the windows in their original order during training.
    model.fit(x_train,y_train,epochs = epochs,batch_size = batch_size, validation_data = (x_test,y_test),shuffle = False,verbose=0)

    y_pred = np.reshape(np.array(model.predict(x_test)), (-1, 1))

    myres = {}
    myres.update(error_metrics(y_test, y_pred))

    # A prediction counts as correct when it has the same sign as the actual
    # value (a product of exactly zero also counts).
    direction = float(np.mean((y_pred * y_test) >= 0))

    # `layers` was referenced without ever being assigned; report the number
    # of LSTM layers of the network that was actually fitted.
    layers = sum(isinstance(layer, LSTM) for layer in model.model.layers)

    myres.update({"epochs" : epochs, "batch_size" : batch_size, "layers" : layers, "Percentage":direction})
    print("done")
    return myres

In [14]:
def build_lstm(input_shape):
    """
    Builds and compiles the stacked LSTM regressor.

    The network has three LSTM layers of 32 units, each followed by a
    Dropout of 0.2, and a single-unit Dense output with the 'swish'
    activation. It is compiled with the 'mse' loss and the Adam optimizer.

    Parameters
    ----------
    input_shape : tuple
        shape of one input sample, passed to the first LSTM layer.

    Returns
    -------
    the compiled Sequential model.
    """
    network = Sequential([
        LSTM(units=32,return_sequences=True,input_shape=input_shape),
        Dropout(0.2),
        LSTM(units=32,return_sequences=True),
        Dropout(0.2),
        LSTM(units=32),
        Dropout(0.2),
        Dense(units = 1,activation = 'swish'),
    ])
    network.compile(loss='mse', optimizer="Adam", metrics = ['accuracy'])
    return network

In [15]:
def keras_model(input_dim,units = 8):
    """
    Builds and compiles a small fully connected regressor.

    The network has one hidden Dense layer ('relu', 'uniform' initializer)
    followed by Dropout of 0.2 and a single-unit Dense output with the
    'swish' activation. It is compiled with the mean squared error loss and
    the 'adam' optimizer.

    Parameters
    ----------
    input_dim : int
        number of input features of the hidden layer.

    units : int
        number of units in the hidden layer (default 8).

    Returns
    -------
    the compiled Sequential model.
    """
    # Only settings that are actually wired into the model are kept here; the
    # former weight-constraint, learning-rate and momentum locals were never used.
    activation = 'relu'
    dropout_rate = 0.2
    init_mode = 'uniform'
    optimizer = 'adam'

    model = Sequential()
    model.add(Dense(units = units, input_dim = input_dim, kernel_initializer=init_mode,activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer=init_mode, activation='swish'))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

    return model

In [16]:
def best_parameters_rnn(X,Y):
    """
    Grid-searches the number of epochs and the batch size for the LSTM model.

    Parameters
    ----------
    X : 3-D array
        training windows; dimensions 1 and 2 give the model's input shape.

    Y : array
        training targets.

    Returns
    -------
    (epoch, batch_size) : tuple
        the best parameter values reported by GridSearchCV.
    """
    search_space = {
        "epochs": [25,50,75,100],
        "batch_size": [25,32,48,64,100],
    }
    sample_shape = (X.shape[1], X.shape[2])
    regressor = KerasRegressor(build_fn = build_lstm, input_shape=sample_shape,verbose = 2)

    search = GridSearchCV(estimator=regressor, param_grid=search_space, n_jobs=-1,verbose=2)
    best = search.fit(X,Y).best_params_

    return (best['epochs'], best['batch_size'])

In [17]:
def split_dataset(X,Y,t):
    """
    Splits X and Y by position into a leading and a trailing part.

    Parameters
    ----------
    X, Y : sliceable sequences
        the cut positions are derived from the length of X and applied to both.

    t : numeric
        fraction of len(X) that goes into the leading (training) part.

    Returns
    -------
    (x_train, x_test, y_train, y_test) : tuple
    """
    cut = int(len(X)*t)
    stop = len(X)
    return (X[:cut], X[cut:stop], Y[:cut], Y[cut:stop])

In [18]:
def reshape_data(x_train, x_test, y_train, y_test,units = 60):
    """
    Turns the train and test sets into sliding windows.

    Window number i holds rows i .. i+units-1 of the features and is paired
    with the target found at row i+units, so a set with n rows yields
    n - units windows.

    Parameters
    ----------
    x_train, x_test : dataframe
        feature rows (accessed through .iloc).

    y_train, y_test : series
        targets aligned with the feature rows (accessed through .iloc).

    units : int
        number of consecutive rows per window (default 60).

    Returns
    -------
    (x_train, x_test, y_train, y_test) : tuple of arrays
        features with shape (windows, units, columns) and targets with
        shape (windows, 1).
    """
    def _windowed(features, targets):
        n_windows = features.shape[0] - units
        starts = range(n_windows)
        x_seq = [features.iloc[start:start + units, :] for start in starts]
        y_seq = [targets.iloc[start + units,] for start in starts]

        x_seq = np.array(x_seq)
        x_seq = np.reshape(x_seq, (x_seq.shape[0], x_seq.shape[1], x_seq.shape[2]))

        y_seq = np.array(y_seq)
        y_seq = np.reshape(y_seq, (y_seq.shape[0], 1))
        return x_seq, y_seq

    train_x, train_y = _windowed(x_train, y_train)
    test_x, test_y = _windowed(x_test, y_test)

    return (train_x, test_x, train_y, test_y)

In [19]:
def error_metrics(y_true, y_pred):
    """
    Computes the regression error metrics of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        actual values.

    y_pred : array-like
        predicted values.

    Returns
    -------
    dict
        "RMSE", "MAE", "MSE" and "rsquared_adj", in that order. Note that
        "rsquared_adj" holds the plain r2_score value, not an adjusted one.
    """
    mse = metrics.mean_squared_error(y_true, y_pred)
    return {
        "RMSE": sqrt(mse),
        "MAE": metrics.mean_absolute_error(y_true, y_pred),
        "MSE": mse,
        "rsquared_adj": metrics.r2_score(y_true, y_pred),
    }

# Finding results from each set of important features

In [20]:
def fit_RNN(df, column, method, name, results):
    """
    Fits the RNN on one feature set and records its outcome.

    Parameters
    ----------
    df : dataframe
        every column except the last one is used as a feature.

    column : string
        name of the target column in df.

    method : string
        label of the feature selection method; used in messages and keys.

    name : string
        data file name, forwarded to create_pretty_table.

    results : dict
        updated in place: the directional hit rate is stored under
        "RNN_Regression_FI_" + method.
    """
    print("RNN Model fitted using columns obtained from feature importance using " + method + " : ")
    feature_frame = df[df.columns[:-1]]
    target_series = df[column]

    model_result = recurrent_neural_networks(feature_frame, target_series)
    direction_share = model_result["Percentage"]

    results["RNN_Regression_FI_" + method] = direction_share
    print("Maximum percentage of correct direction : ", direction_share)

    create_pretty_table(name , "RNN_Regression", method , model_result)
    

In [21]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    """
    Selects features with forward_selection and, if any were selected,
    fits the RNN on them through fit_RNN.

    Parameters
    ----------
    df1 : dataframe
        every column except the last one is a candidate feature.

    name : string
        data file name, forwarded to fit_RNN.

    column : string
        name of the target column in df1.

    results : dict
        forwarded to fit_RNN, which stores the outcome in it.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using Forward Selection Method")
    print(banner)

    candidate_frame = df1[df1.columns[:-1]]
    target_values = df1[column].values
    forward_features = forward_selection(candidate_frame, target_values)

    print("Features obtained from Forward Selection method : ") 
    print("--------------------------------------")
    print(forward_features)

    if forward_features:
        # The target goes last because fit_RNN treats the last column as the target slot.
        forward_features.append(column)
        fit_RNN(df1[forward_features], column, "ForwardSelection", name, results)
    print(banner)

In [22]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    """
    Selects features with backward_elimination and, if any were kept,
    fits the RNN on them through fit_RNN.

    Parameters
    ----------
    df1 : dataframe
        every column except the last one is a candidate feature.

    name : string
        data file name, forwarded to fit_RNN.

    column : string
        name of the target column in df1.

    results : dict
        forwarded to fit_RNN, which stores the outcome in it.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using Backward Elimination Method")
    print(banner)

    candidate_frame = df1[df1.columns[:-1]]
    target_values = df1[column].values
    backward_features = backward_elimination(candidate_frame, target_values)

    print("Features obtained from Backward Elimination method : ") 
    print("--------------------------------------")
    print(backward_features)

    if backward_features:
        # The target goes last because fit_RNN treats the last column as the target slot.
        backward_features.append(column)
        fit_RNN(df1[backward_features], column, "BackwardElimination", name, results)
    print(banner)

In [23]:
def get_results_from_FI_AllFeatures(df1, name, column, results):
    """
    Fits the RNN on all feature columns (no feature selection) through fit_RNN.

    Parameters
    ----------
    df1 : dataframe
        every column except the last one is used as a feature.

    name : string
        data file name, forwarded to fit_RNN.

    column : string
        name of the target column in df1.

    results : dict
        forwarded to fit_RNN, which stores the outcome in it.
    """
    print("All Features are considered : ")
    print("*****************************************************************************************")
    method = "AllFeaturesConsideration"
    all_features = list(df1.columns[:-1])
    print("All Features are --->>") 
    print("--------------------------------------")
    print(all_features)
    if (len(all_features) != 0):
        # The target goes last because fit_RNN treats the last column as the target slot.
        all_features.append(column)
        df_all = df1[all_features]
        fit_RNN(df_all, column, method, name, results)
    print("*****************************************************************************************")

In [24]:
def get_results_from_each_set(data, name, final_df):
    """
    Runs the three feature-set variants (forward selection, backward
    elimination, all features) for one stock and appends the best one to
    final_df.

    Parameters
    ----------
    data : dataframe
        raw stock data; cleaned with pre_process_data using a 60% null threshold.

    name : string
        data file name; characters 2 to 7 are used as the key into `companies`.

    final_df : dataframe
        results frame to extend.

    Returns
    -------
    final_df : dataframe
        final_df with one extra row (Company, Model, Method, Percentage) for
        the method with the highest directional hit rate. Returned unchanged
        if no method produced a result.
    """
    df = pre_process_data(data, 60)
    column = "Next Day Close Price GR"
    (df1, column) = dependent_column(df, column)
    results = {}
    get_results_from_FI_ForwardSelection(df1, name, column, results)
    get_results_from_FI_BackwardElimination(df1, name, column, results)
    get_results_from_FI_AllFeatures(df1, name, column, results)

    if not results:
        # No feature set could be fitted, so there is no best method to record.
        print("No results were produced for {}.".format(name))
        return final_df

    # Ascending stable sort: the last entry is the best method (on ties, the
    # one that was recorded last).
    sorted_results = sorted(results.items(), key=lambda item: item[1])
    best_method, best_percentage = sorted_results[-1]
    company_code = name[2 : 8]
    max_row = {'Company' : company_code + "-" + companies[company_code], 'Model' : 'RNN-Regression', 'Method' : best_method, 'Percentage' : best_percentage}
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
    final_df = pd.concat([final_df, pd.DataFrame([max_row])], ignore_index = True)
    print("Maximum correct direction values are obtained for {} with a percentage of {}.".format(best_method, best_percentage))
    return final_df

# Process of getting results

In [25]:
def create_pretty_table(name, model, method, result):
    """
    Adds one result row to the PrettyTable of the given model.

    Parameters
    ----------
    name : string
        data file name; characters 2 to 7 are used as the key into `companies`.

    model : string
        key into `tables`; also used as the table title.

    method : string
        label of the feature selection method.

    result : dict
        its values follow the company and method cells, in dict order;
        non-string values are rounded to 6 decimals.
    """
    code = name[2 : 8 ]
    row = [code + "-" + companies[code], method]
    for value in result.values():
        row.append(value if isinstance(value, str) else round(value, 6))
    target_table = tables[model]
    target_table.add_row(row)
    target_table.title = model

In [26]:
# Header of the result tables. The order has to match the rows built in
# create_pretty_table: company, method, then the values of the result dict.
columns =['Company','Method','RMSE','MAE','MSE','rsquared_adj','epochs','batch_size','layers','Percentage']

In [27]:
# Maps the six-character code taken from a data file name (name[2:8]) to the company name.
companies = {"500112" : "SBIN" ,
"500325" : "RELIANCE INDUSTRIES LTD",
"532540" : "TATA CONSULTANCY SERVICES LTD" ,
"500209" : "INFOSYS LTD", 
"532174" : "ICICI BANK LTD", 
"507685" : "WIPRO LTD", 
"530965" : "INDIAN OIL CORPORATION LTD", 
"500182" : "HERO MOTOCORP LTD", 
"532210" : "CITY UNION BANK LTD", 
"500180" : "HDFC Bank Ltd",
"500680" : "PFIZER LTD", 
"506395" : "COROMANDEL iNTERNATIONAL LTD",
"500770" : "TATA CHEMICALS LTD", 
"500085" : "CHAMBAL FERTILISERS & CHEMICALS LTD", 
"501425" : "BOMBAY BURMAH TRADING CORP.LTD", 
"532899" : "KAVERI SEED COMPANY LTD", 
"537291" : "NATH BIO-GENES (INDIA) LTD", 
"500790" : "NESTLE INDIA LTD", 
"500825" : "BRITANNIA INDUSTRIES LTD", 
"533155" : "JUBILANT FOODWORKS LTD", 
"533287" : "ZEE LEARN LTD", 
"533260" : "CAREER POINT LTD", 
"539921" : "SHANTI EDUCATIONAL INITIATIVES LTD", 
"542602" : "EMBASSY OFFICE PARKS REIT", 
"543217" : "MINDSPACE BUSINESS PARKS REIT", 
"543261" : "BROOKFIELD INDIA REAL ESTATE TRUST REIT", 
"532538" : "ULTRATECH CEMENT LTD", 
"500387" : "SHREE CEMENT LTD", 
"500425" : "AMBUJA CEMENTS LTD", 
"532689" : "PVR LTD", 
"532706" : "INOX LEISURE LTD", 
"532163" : "SAREGAMA INDIA LTD", 
"524715" : "SUN PHARMACEUTICAL INDUSTRIES LTD", 
"532488" : "DIVI'S LABORATORIES LTD",
"500124" : "DR.REDDY'S LABORATORIES LTD"}

In [28]:
# One PrettyTable per model name; create_pretty_table looks tables up by these keys.
models = ["RNN_Regression"]
tables = {model:PrettyTable() for model in models}
# Give every table the shared header defined in `columns`.
for name,table in tables.items():
    table.field_names = columns

In [29]:
# Load the previously saved results from the working directory held in `path`
# instead of a machine-specific absolute location.
final_df = pd.read_csv(os.path.join(path, "Final_Results_df.csv"))
# final_df.drop('Unnamed: 0', inplace = True, axis = 'columns')
final_df

Unnamed: 0,Company,Model,Method,Percentage
0,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,GNN-Regression,GNN_Regression_FI_AllFeaturesConsideration_std...,0.654034
1,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,KNN-Regression,KNN_Regression_FI_BackwardElimination_,0.540342
2,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Ridge Regression,RidgeFIFValue1,0.506112
3,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Linear Regression,LinearFIBackwardElimination,0.501222
4,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Lasso Regression,LassoFIFValue10,0.497555
...,...,...,...,...
219,542602-EMBASSY OFFICE PARKS REIT,Linear Regression,LinearFICoefficients0.1,0.538462
220,542602-EMBASSY OFFICE PARKS REIT,Ridge Regression,RidgeFICoefficients0.1,0.538462
221,542602-EMBASSY OFFICE PARKS REIT,KNN-Regression,KNN_Regression_FI_AllFeaturesConsideration_,0.530769
222,542602-EMBASSY OFFICE PARKS REIT,GNN-Regression,GNN_Regression_FI_ForwardSelection_stds_0.9,0.507692


In [None]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("gr500085"):
        df_rnn = pd.read_csv(os.path.join(path,"Data\Stock\\" + filename))
        name = os.path.join(path, "Data\Stock\\" + filename).split("\\")[-1]
        stock = name[2 : 8]
        fd_df = pd.DataFrame(columns = final_df.columns)
        print("For stock : ", stock)
        print("#################################################################################################################")
        f_df = get_results_from_each_set(df_rnn, name, fd_df)
        final_df = final_df.append(f_df, ignore_index = True)
        print("#################################################################################################################")
        break
        
# final_df = final_df.sort_values(by = ['Company', 'Percentage'], ascending = [True, False])
# final_df.to_csv('C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Final_Results_df.csv', index = False)

For stock :  500085
#################################################################################################################
Features Importance using Forward Selection Method
*****************************************************************************************
Features obtained from Forward Selection method : 
--------------------------------------
['Beta GR']
RNN Model fitted using columns obtained from feature importance using ForwardSelection : 
Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Print the result table of every model.
for name,table in tables.items():
    print(table)

In [None]:
# Display the results frame, including any row added by the run above.
final_df