In [1]:
# %pip install neupy

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import warnings; warnings.simplefilter('ignore')

In [4]:
import numpy as np
import pandas as pd
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from neupy import algorithms
from sklearn import metrics
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from multiprocessing.pool import ThreadPool
from prettytable import PrettyTable

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm



# Pre-process the data

In [5]:
def pre_process_data(data,null_threshold):
    """
    Drops Date and Unix Date columns from the data.
    Drops the columns which has null values more than specified null_threshold.
    Replaces infinite values with NAN.
    Drops the rows which has null values.

    Parameters
    ----------
    data : dataframe

    null_threshold : numeric
        numeric value describing the amount of null values that can be present.

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    """
    
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    total = data.shape[0]
    for col in data.columns:
        if null_threshold * total / 100 < data[col].isnull().sum():
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data = data.apply(pd.to_numeric,errors='coerce')
    data.dropna(axis=0,inplace=True)
    return data

# Removing columns based on the dependent column

In [6]:
def dependent_column(data,column):
    """
    Removes all the Next Day columns.
    Removes all the non Growth Rate Columns (GR)
    add the predictor column to list of columns.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the predictor column 

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    column : string
        name of the predictor column
    """
    cols = [col for col in data.columns if "next" not in col.lower() and col.lower().endswith("gr")]
    cols.append(column)
    data = data[cols]
    return (data,column)

# Feature Selection Methods

In [7]:
def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]]).astype(float)).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [8]:
def backward_elimination(data, target,significance_level = 0.05):
    features = data.columns.tolist()
    while(len(features)>0):
        features_with_constant = sm.add_constant(data[features]).astype(float)
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        max_p_value = p_values.max()
        if(max_p_value >= significance_level):
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            break 
    return features

# GNN and its best parameters function

In [9]:
# np.arange(1e-1, 1, 1e-2)
# np.linspace(0, 1, 11)
# a = np.array(1)
# type(a)

In [10]:
def bestParameters(X,Y,std):
#     model = RandomizedSearchCV(algorithms.GRNN(std=std, verbose=False), param_distributions={'std': np.arange(1e-2, 1, 1e-3)}, scoring='neg_mean_squared_error',)
#     model = GridSearchCV(algorithms.GRNN(std = 0.1, verbose=False), param_grid={'std': std}, scoring='neg_mean_squared_error',verbose = 2)
    model = GridSearchCV(algorithms.GRNN(std = std, verbose = False), param_grid={'std': np.arange(1e-2, 1, 1e-3)}, scoring='neg_mean_squared_error',verbose = 1)
#     model = GridSearchCV(algorithms.GRNN(std = std, verbose = False), param_grid={'std': np.arange(1e-1, 1, 1e-2)}, scoring='neg_mean_squared_error',verbose = 1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    model.fit(X_train,y_train)
    return model.best_params_

In [11]:
# def generalized_neural_network(X,Y,std=0.1):
#     best = bestParameters(X,Y,std)
#     x_train, x_test, y_train, y_test = train_test_split(preprocessing.minmax_scale(X), preprocessing.minmax_scale(Y), test_size=0.3, random_state=0)
#     model = algorithms.GRNN(std=best['std'])
#     model.train(x_train,y_train)
#     y_pred = model.predict(x_test)
#     rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))
#     mae = metrics.mean_absolute_error(y_test, y_pred)
#     mse = metrics.mean_squared_error(y_test, y_pred)
#     r2 = metrics.r2_score(y_test, y_pred)
#     c = 0
#     for a,b in zip(y_test, y_pred):
#         if a*b >= 0:
#             c += 1
#     direction = c/len(y_test)
#     myres =  {"RMSE":rmse,"MAE":mae,"MSE":mse,"rsquared_adj":r2,"std":std, "best-std":best['std'],"Percentage":direction}
#     print("done")
#     return myres

In [12]:
def generalized_neural_network(X,Y,name, method, std):
    best = bestParameters(X,Y,std)
    print(best)
    
    # preprocessing.minmax_scale(X) is done to avoid Nan values in the predicted values 
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    model = algorithms.GRNN(std=best['std'])
    model.train(x_train,y_train)
    y_pred = model.predict(x_test)
    
#     print(list(zip(y_test, y_pred)))
    pred_actual = pd.DataFrame(list(zip(y_pred, y_test)), 
                   columns =['Predicted Values', 'Actual Values'])
#     pred_actual['Predicted Values'] = pred_actual['Predicted Values'].astype(str)
#     pred_actual['Actual Values'] = pred_actual['Actual Values'].astype(str)
#     print(list(zip(pred_actual['Predicted Values'], pred_actual['Actual Values'])))

#     preds = []
#     for i in y_pred:
#         preds.append(i[0])

#     pred_actual = pd.DataFrame()
#     pred_actual['Predicted Values'] = preds
#     pred_actual['Actual Values'] = y_test
    pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\" + name[2:8] + "_GRNN_FI_" + method + "_std_" + str(round(best['std'], 6)) +".csv" , index=False) 
    
    y_pred = np.nan_to_num(y_pred)
    
    rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)
    c = 0
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@")
    for a,b in zip(y_test, y_pred):
        if (a > 0 and b[0] > 0) or (a < 0 and b[0] < 0):
            print(a, b[0], a * b[0])
            c += 1
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@")
#     print("Direction : ")
    print(c, len(y_test), c / len(y_test))
    
    direction = c / len(y_test)
    myres =  {"RMSE":rmse,"MAE":mae,"MSE":mse,"rsquared_adj":r2,"std":std, "best-std":best['std'],"Percentage":direction}
#     print("done")
    return myres

# Finding results from each set of important features

In [13]:
def fit_GNN(df, column, method, name, results):
    print("GRNN Model fitted using columns obtained from feature importance using " + method + " : ")
    X = df[df.columns[:-1]]
    Y = df[column].values
    
    std = 0.1
    model_result = generalized_neural_network(X, Y, name, method, std)
        
    
    results["GRNN_FI_" + method + "_stds_" + str(round(model_result['best-std'], 6))] = model_result["Percentage"]
    
    create_pretty_table(name , "GRNN", method + "-" + str(round(round(model_result['best-std'], 6))), model_result)
    print("Maximum percentage of correct direction : ", model_result["Percentage"])

In [14]:
# def fit_GNN(df, column, method, name, results):
#     print("GRNN Model fitted using columns obtained from feature importance using " + method + " : ")
#     X = df[df.columns[:-1]]
#     Y = df[column].values
    
#     data = list()
#     stds = np.linspace(0,1,11)
#     arguments = list()
#     max_percentage = 0
#     for s in stds:
#         data = [X,Y,s]
#         arguments.append(data)
#         model_result = generalized_neural_network(X, Y, name, method, s)
#         if (max_percentage < model_result["Percentage"]):
#             max_percentage = model_result["Percentage"]
# #     threads = ThreadPool(4)
# #     model_result = threads.starmap(generalized_neural_network,arguments)
    
# #         print(model_result)
    
# #         print("Percentage of correct direction : ", model_result["Percentage"])
        
    
#         results["GRNN_Regression_FI_" + method + "_stds_" + str(round(s, 2))] = model_result["Percentage"]
    
#         create_pretty_table(name , "GRNN_Regression", method + "-" + str(round(s, 2)), model_result)
#     print("Maximum percentage of correct direction : ", max_percentage)

In [15]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    print("Features Importance using Forward Selection Method")
    print("*****************************************************************************************")
    method = "ForwardSelection"
    X = df1[df1.columns[:-1]]
    Y = df1[column].values
    forward_features = forward_selection(X,Y)
    print("Features obtained from Forward Selection method : ") 
    print("--------------------------------------")
    print(forward_features)
    if (len(forward_features) != 0):
        forward_features.append(column)
        df_fs = df1[forward_features]
        fit_GNN(df_fs, column, method, name, results)
    print("*****************************************************************************************")

In [16]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    print("Features Importance using Backward Elimination Method")
    print("*****************************************************************************************")
    method = "BackwardElimination"
    X = df1[df1.columns[:-1]]
    Y = df1[column].values
    backward_features = backward_elimination(X,Y)
    print("Features obtained from Backward Elimination method : ") 
    print("--------------------------------------")
    print(backward_features)
    if (len(backward_features) != 0):
        backward_features.append(column)
        df_be = df1[backward_features]
        fit_GNN(df_be, column, method, name, results)
    print("*****************************************************************************************")

In [17]:
def get_results_from_FI_AllFeatures(df1, name, column, results):
    print("All Features are considered : ")
    print("*****************************************************************************************")
    method = "AllFeaturesConsideration"
    X = df1[df1.columns[:-1]]
    Y = df1[column].values
    all_features = list(X.columns)
    print("All Features are --->>") 
    print("--------------------------------------")
    print(all_features)
    if (len(all_features) != 0):
        all_features.append(column)
        df_all = df1[all_features]
        fit_GNN(df_all, column, method, name, results)
    print("*****************************************************************************************")

In [18]:
def get_results_from_each_set(data, name, final_df):
    df = pre_process_data(data, 60)
    column = "Next Day Close Price GR"
    (df1, column) = dependent_column(df, column)
    results = {}
    get_results_from_FI_ForwardSelection(df1, name, column, results)
    get_results_from_FI_BackwardElimination(df1, name, column, results)
    get_results_from_FI_AllFeatures(df1, name, column, results)
    sorted_results = sorted(results.items(), key=lambda item: item[1])
    max_row = {'Company' : name[2 : 8] + "-" + companies[name[2 : 8]], 'Model' : 'GRNN', 'Method' : sorted_results[-1][0], 'Percentage' : sorted_results[-1][1]}
    final_df = final_df.append(max_row, ignore_index = True)
    print("Maximum correct direction values are obtained for {} with a percentage of {}.".format(sorted_results[-1][0], sorted_results[-1][1]))
    return final_df

# Process of getting results

In [19]:
def create_pretty_table(name, model, method, result):
    values = [name[2 : 8 ] + "-" + companies[name[2 : 8]], method] + [round(v, 6) if (isinstance(v, float)) else v for k,v in result.items()]
    tables[model].add_row(values)
    tables[model].title = model

In [20]:
columns =['Company','Method', 'RMSE','MAE','MSE','rsquared_adj','std','best-std','Percentage']

In [21]:
companies = {"500112" : "SBIN" ,
"500325" : "RELIANCE INDUSTRIES LTD",
"532540" : "TATA CONSULTANCY SERVICES LTD" ,
"500209" : "INFOSYS LTD", 
"532174" : "ICICI BANK LTD", 
"507685" : "WIPRO LTD", 
"530965" : "INDIAN OIL CORPORATION LTD", 
"500182" : "HERO MOTOCORP LTD", 
"532210" : "CITY UNION BANK LTD", 
"500180" : "HDFC Bank Ltd",
"500680" : "PFIZER LTD", 
"506395" : "COROMANDEL iNTERNATIONAL LTD",
"500770" : "TATA CHEMICALS LTD", 
"500085" : "CHAMBAL FERTILISERS & CHEMICALS LTD", 
"501425" : "BOMBAY BURMAH TRADING CORP.LTD", 
"532899" : "KAVERI SEED COMPANY LTD", 
"537291" : "NATH BIO-GENES (INDIA) LTD", 
"500790" : "NESTLE INDIA LTD", 
"500825" : "BRITANNIA INDUSTRIES LTD", 
"533155" : "JUBILANT FOODWORKS LTD", 
"533287" : "ZEE LEARN LTD", 
"533260" : "CAREER POINT LTD", 
"539921" : "SHANTI EDUCATIONAL INITIATIVES LTD", 
"542602" : "EMBASSY OFFICE PARKS REIT", 
"543217" : "MINDSPACE BUSINESS PARKS REIT", 
"543261" : "BROOKFIELD INDIA REAL ESTATE TRUST REIT", 
"532538" : "ULTRATECH CEMENT LTD", 
"500387" : "SHREE CEMENT LTD", 
"500425" : "AMBUJA CEMENTS LTD", 
"532689" : "PVR LTD", 
"532706" : "INOX LEISURE LTD", 
"532163" : "SAREGAMA INDIA LTD", 
"524715" : "SUN PHARMACEUTICAL INDUSTRIES LTD", 
"532488" : "DIVI'S LABORATORIES LTD",
"500124" : "DR.REDDY'S LABORATORIES LTD"}

In [22]:
models = ["GRNN"]
tables = {model:PrettyTable() for model in models}
for name,table in tables.items():
    table.field_names = columns

In [23]:
final_df = pd.read_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\df_Final_Results.csv")
# final_df.drop('Unnamed: 0', inplace = True, axis = 'columns')
final_df

Unnamed: 0,Company,Model,Method,Percentage
0,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,KNN-Regression,KNN_Regression_FI_BackwardElimination_,0.540342
1,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,RNN-Regression,RNN_Regression_FI_AllFeaturesConsideration,0.513447
2,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Ridge Regression,RidgeFIFValue1,0.506112
3,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Linear Regression,LinearFIBackwardElimination,0.501222
4,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Lasso Regression,LassoFIFValue10,0.497555
...,...,...,...,...
219,542602-EMBASSY OFFICE PARKS REIT,Linear Regression,LinearFICoefficients0.1,0.538462
220,542602-EMBASSY OFFICE PARKS REIT,Ridge Regression,RidgeFICoefficients0.1,0.538462
221,542602-EMBASSY OFFICE PARKS REIT,KNN-Regression,KNN_Regression_FI_AllFeaturesConsideration_,0.530769
222,542602-EMBASSY OFFICE PARKS REIT,RNN-Regression,RNN_Regression_FI_BackwardElimination,0.507692


In [24]:
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

In [None]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("gr500085"):
        df_gnn = pd.read_csv(os.path.join(path,"Data\Stock\\" + filename))
        name = os.path.join(path, "Data\Stock\\" + filename).split("\\")[-1]
        stock = name[2 : 8]
        fd_df = pd.DataFrame(columns = final_df.columns)
        print("For stock : ", stock)
        print("#################################################################################################################")
        f_df = get_results_from_each_set(df_gnn, name, fd_df)
        final_df = final_df.append(f_df, ignore_index = True)
        print("#################################################################################################################")
        break
        
# final_df = final_df.sort_values(by = ['Company', 'Percentage'], ascending = [True, False])
# final_df.to_csv('C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\df_Final_Results_GRNN.csv', index = False)

For stock :  500085
#################################################################################################################
Features Importance using Forward Selection Method
*****************************************************************************************
Features obtained from Forward Selection method : 
--------------------------------------
['Beta GR']
GRNN Model fitted using columns obtained from feature importance using ForwardSelection : 
Fitting 5 folds for each of 990 candidates, totalling 4950 fits
{'std': 0.30599999999999977}
@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@
0 818 0.0
Maximum percentage of correct direction :  0.0
*****************************************************************************************
Features Importance using Backward Elimination Method
*****************************************************************************************
Features obtained from Backward Elimination method : 
--------------------------------------
[

In [None]:
# %%time
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         filepath = os.path.join(dirname, filename)
#         df = pd.read_csv(filepath)
#         df = pre_process_data(df,60)
#         column = "Next Day Close Price GR"
#         (df,column) = dependent_column(df,column)
#         X = df.drop(columns=[column])
#         Y = df[column]
        
#         resultdf = pd.DataFrame(result)
#         resultdf.to_csv(os.path.join(os.getcwd(),str(filename[2:8])+"_gnn"+".csv"),index=None)

In [None]:
for name,table in tables.items():
    print(table)

In [None]:
final_df

In [None]:
# sklearn.metrics.SCORERS.keys()