In [1]:
# %pip install seaborn

In [2]:
import csv
import numpy as np
import pandas as pd
import glob, os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import *
from sklearn.svm import SVR
from sklearn import svm
from sklearn.metrics import mean_absolute_error,mean_squared_error
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import warnings; warnings.simplefilter('ignore')
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score

from prettytable import PrettyTable

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
import statsmodels.api as sm

In [3]:
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

# Pre-processing the Data

In [4]:
def pre_process_data(data,null_threshold):
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    total = data.shape[0]
    for col in data.columns:
        if ((null_threshold * total / 100) < data[col].isnull().sum()):
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0,inplace=True)
    return data

# Removing columns based on the dependent column

In [5]:
def dependent_column(data,column):
    cols = [col for col in data.columns if ("next" not in col.lower() and col.lower().endswith("gr"))]
    cols.append(column)
    data = data[cols]
    return (data,column)

In [6]:
def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]]).astype(float)).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [7]:
def backward_elimination(data, target,significance_level = 0.05):
    features = data.columns.tolist()
    while(len(features)>0):
        features_with_constant = sm.add_constant(data[features]).astype(float)
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        max_p_value = p_values.max()
        if(max_p_value >= significance_level):
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            break 
    return features

# ---

In [8]:
def Model(dataset,y):
    X = dataset.loc[:, dataset.columns != y]
    Y = dataset.loc[:, dataset.columns == y]
    count = 3
    step = 1000
    start = 1
    end = 10000
    while(count >= 0):
#         print("start  ------> " + str(start))
#         print("end    ------> " + str(end))
#         print("step   ------> " + str(step))
        best,step = bestParameters(start,end,step,X,Y)
        for key in best:
            if key == 'C':
                c = best[key]
                print(c)
        if(count == 3):
            start = c - 1000
            end = c + 1000
        if(count == 2):
            start = c - 100
            end = c + 100
        if(count == 1):
            start = c - 10
            end = c + 10
        step = step/10
        count = count - 1 
#         print("count : " , count)
    return c

In [9]:
def bestParameters(start,end,step,X,Y):
    parameters = {'kernel': ['rbf'], 'C': np.arange(start,end,step),  
              'gamma': [0.0001],'epsilon':[0.01]}
#     print("parameters ", parameters)
    svr = svm.SVR(gamma = 'auto')
    clf = GridSearchCV(svr, parameters, scoring = 'neg_mean_squared_error')
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    clf.fit(X_train,y_train)
    best = clf.best_params_
#     print("best ", best)
    return (best,step)

# Finding results from each set of important features

In [10]:
columns = ['Company','Method','Percentage', 'r^2_score', 'Mean_SE', 'Mean_AE', 'Median_AE','EVS']

In [11]:
companies = {"500112" : "SBIN" ,
"500325" : "RELIANCE INDUSTRIES LTD",
"532540" : "TATA CONSULTANCY SERVICES LTD" ,
"500209" : "INFOSYS LTD", 
"532174" : "ICICI BANK LTD", 
"507685" : "WIPRO LTD", 
"530965" : "INDIAN OIL CORPORATION LTD", 
"500182" : "HERO MOTOCORP LTD", 
"532210" : "CITY UNION BANK LTD", 
"500180" : "HDFC Bank Ltd",
"500680" : "PFIZER LTD", 
"506395" : "COROMANDEL iNTERNATIONAL LTD",
"500770" : "TATA CHEMICALS LTD", 
"500085" : "CHAMBAL FERTILISERS & CHEMICALS LTD", 
"501425" : "BOMBAY BURMAH TRADING CORP.LTD", 
"532899" : "KAVERI SEED COMPANY LTD", 
"537291" : "NATH BIO-GENES (INDIA) LTD", 
"500790" : "NESTLE INDIA LTD", 
"500825" : "BRITANNIA INDUSTRIES LTD", 
"533155" : "JUBILANT FOODWORKS LTD", 
"533287" : "ZEE LEARN LTD", 
"533260" : "CAREER POINT LTD", 
"539921" : "SHANTI EDUCATIONAL INITIATIVES LTD", 
"542602" : "EMBASSY OFFICE PARKS REIT", 
"543217" : "MINDSPACE BUSINESS PARKS REIT", 
"543261" : "BROOKFIELD INDIA REAL ESTATE TRUST REIT", 
"532538" : "ULTRATECH CEMENT LTD", 
"500387" : "SHREE CEMENT LTD", 
"500425" : "AMBUJA CEMENTS LTD", 
"532689" : "PVR LTD", 
"532706" : "INOX LEISURE LTD", 
"532163" : "SAREGAMA INDIA LTD", 
"524715" : "SUN PHARMACEUTICAL INDUSTRIES LTD", 
"532488" : "DIVI'S LABORATORIES LTD",
"500124" : "DR.REDDY'S LABORATORIES LTD"}

In [12]:
models = ["SVR"]
tables = {model:PrettyTable() for model in models}
for name,table in tables.items():
    table.field_names = columns

In [13]:
final_df = pd.read_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\df_Final_Results.csv")
final_df.drop('Unnamed: 0', inplace = True, axis = 'columns')
final_df

Unnamed: 0,Company,Model,Method,Percentage
0,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Ridge Regression,RidgeFIFValue1,0.506112
1,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Linear Regression,LinearFIBackwardElimination,0.501222
2,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Lasso Regression,LassoFIFValue10,0.497555
3,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,Elastic Net Regression,ElasticFIFValue10,0.497555
4,500112-SBIN,Linear Regression,LinearFIPValue0.05,0.528953
...,...,...,...,...
123,539921-SHANTI EDUCATIONAL INITIATIVES LTD,Lasso Regression,LassoFIFValue10,0.495726
124,542602-EMBASSY OFFICE PARKS REIT,Lasso Regression,LassoFIPValue0.1,0.569231
125,542602-EMBASSY OFFICE PARKS REIT,Elastic Net Regression,ElasticFIPValue0.1,0.569231
126,542602-EMBASSY OFFICE PARKS REIT,Linear Regression,LinearFICoefficients0.1,0.538462


In [14]:
final_df.columns

Index(['Company', 'Model', 'Method', 'Percentage'], dtype='object')

In [15]:
def create_pretty_table(name, model, result, method, percentage):
    values = [name[2 : 8 ] + "-" + companies[name[2 : 8]], method, round(percentage, 6)] + [round(v, 6) for k,v in result.items() if not isinstance(v,dict)]
    tables[model].add_row(values)
    tables[model].title = model

In [16]:
def fit_SVR(df, column, method, value, name, results):
    print("SVR Model fitted using columns obtained from feature importance using " + method + " : ")
    c = Model(df, column)
#     print(c)
    X = df.loc[:, df.columns != column]
    Y = df.loc[:, df.columns == column]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    svr_rbf = SVR(kernel='rbf', C = 144, gamma = 0.0001, epsilon = 0.01)
    y_rbf = svr_rbf.fit(X_train, y_train).predict(X_test)
    
    y_rbf = y_rbf.astype(float)
    y_test = y_test[column]

    pred_actual = pd.DataFrame(list(zip(y_rbf, y_test)), 
                   columns =['Predicted Values', 'Actual Values'])
    pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\Models_Results\\" + name[2:8] + "_SVR_" + "FI_" + method + "_" + str(value) + ".csv" , index=False)
    
    same_dir = 0
    diff_dir = 0
    for a, b in zip(y_rbf, y_test) :
        if (a > 0 and b > 0) or (a < 0 and b < 0):
            same_dir += 1
        else:
            diff_dir += 1
    percentage = (same_dir / (same_dir + diff_dir))
    print("Values in Same direction -----> ----->", same_dir)
    print("Values in Opposite direction <----- -----> ", diff_dir)
    print("Percentage of correct direction : ", percentage)
    results["SVR_FI_" + method + "_" + str(value)] = percentage
    
    r = r2_score(y_test, y_rbf)
    mse = mean_squared_error(y_test, y_rbf)
    mae = mean_absolute_error(y_test, y_rbf)
    med = median_absolute_error(y_test, y_rbf)
    var = explained_variance_score(y_test, y_rbf)
    
    model_result = {'r' : r, "mse" : mse, "mae" : mae, 'med_ae' : med, 'evs' : var}
    
    create_pretty_table(name , "SVR", model_result, method + " " + value, percentage)

In [17]:
def get_results_from_FI_Coeffiecients(df, name, column, results):
    print("Features Importance using Coefficients")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    model_linear = LinearRegression(fit_intercept=True)
    model_linear.fit(X, Y)
    col_coef = list(df.columns)
    res_coef = [round(i,6) for i in list(model_linear.coef_)]
    rc_coef = list(zip(col_coef, res_coef))
    coef_features = []
    coef = [0.1]
    method = "Coefficients"
    for cf in coef:
        for i in range(len(rc_coef)):
            if ((abs(rc_coef[i][1])) > cf):
                coef_features.append(rc_coef[i][0])
        print("Features obtained from coefficients greater than " + str(cf) + " : ")
        print("--------------------------------------")
        print(coef_features)
        if (len(coef_features) == 0):
            continue
        coef_features.append(column)
        df_fic = df[coef_features]
        fit_SVR(df_fic, column, method, str(cf), name, results)
    print("*****************************************************************************************")

In [18]:
def get_results_from_FI_PValue(df, name, column, results):
    print("Features Importance using p-value")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = np.array(X_train, dtype=float)
    ols_model = sm.OLS(Y_train, X_train).fit()
    col_pval = list(df.columns)
    pvals = list(ols_model.pvalues)
    pvals_cols = list(zip(col_pval, pvals))
    p = [0.02, 0.05, 0.1, 0.2]
    method = "PValue"
    for pv in p:
        pval_features = []
        for i in range(len(pvals_cols)):
            if (pvals_cols[i][1] < pv):
                pval_features.append(pvals_cols[i][0])
        print("Features obtained from p-values less than " + str(pv) + " : ")
        print("-------------------------------------------------")
        print(pval_features)
        if (len(pval_features) == 0):
            continue
        pval_features.append(column)
        df_fip = df[pval_features]
        fit_SVR(df_fip, column, method, str(pv), name, results)
    print("*****************************************************************************************")

In [19]:
def get_results_from_FI_FValues(df, name, column, results):
    print("Features Importance using f-value")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    fval_cols = X.columns
    freg_res = f_regression(X, Y)
#     print(freg_res[0])
    fvals = freg_res[0]
    fc = list(zip(fval_cols, fvals))
    f = [1, 10, 100, 1000]
    method = "FValue"
    for fv in f :
        fval_features = []
        for i in range(len(fc)):
            if ((abs(fc[i][1])) > fv):
                fval_features.append(fc[i][0])
        print("Features obtained from f-values greater than " + str(fv) + " : ")
        print("--------------------------------------")
        print(fval_features)
        if (len(fval_features) == 0):
            continue
        fval_features.append(column)
        df_fif = df[fval_features]
        fit_SVR(df_fif, column, method, str(fv), name, results)
    print("*****************************************************************************************")

In [20]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    print("Features Importance using Forward Selection Method")
    print("*****************************************************************************************")
    method = "ForwardSelection"
    X = df1[df1.columns[:-1]]
    Y = df1[column].values
    forward_features = forward_selection(X,Y)
    print("Features obtained from Forward Selection method : ") 
    print("--------------------------------------")
    print(forward_features)
    if (len(forward_features) != 0):
        forward_features.append(column)
        df_fs = df1[forward_features]
        fit_SVR(df_fs, column, method, '', name, results)
    print("*****************************************************************************************")

In [21]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    print("Features Importance using Backward Elimination Method")
    print("*****************************************************************************************")
    method = "BackwardElimination"
    X = df1[df1.columns[:-1]]
    Y = df1[column].values
    backward_features = backward_elimination(X,Y)
    print("Features obtained from Backward Elimination method : ") 
    print("--------------------------------------")
    print(backward_features)
    if (len(backward_features) != 0):
        backward_features.append(column)
        df_be = df1[backward_features]
        fit_SVR(df_be, column, method, '', name, results)
    print("*****************************************************************************************")

In [22]:
def get_results_from_each_set(data, name, final_df):
    df = pre_process_data(data, 60)
    column = "Next Day Close Price GR"
    (df1, column) = dependent_column(df, column)
    results = {}
    get_results_from_FI_Coeffiecients(df1, name, column, results)
    get_results_from_FI_PValue(df1, name, column, results)
    get_results_from_FI_ForwardSelection(df1, name, column, results)
    get_results_from_FI_BackwardElimination(df1, name, column, results)
    get_results_from_FI_FValues(df1, name, column, results)

    sorted_results = sorted(results.items(), key=lambda item: item[1])
    max_row = {'Company' : name[2 : 8] + "-" + companies[name[2 : 8]], 'Model' : 'SVR', 'Method' : sorted_results[-1][0], 'Percentage' : sorted_results[-1][1]}
    final_df = final_df.append(max_row, ignore_index = True)
    print("Maximum correct direction values are obtained for {} with a percentage of {}.".format(sorted_results[-1][0], sorted_results[-1][1]))
    return final_df

In [23]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("gr"):
        df_svr = pd.read_csv(os.path.join(path,"Data\Stock\\" + filename))
        name = os.path.join(path, "Data\Stock\\" + filename).split("\\")[-1]
        stock = name[2 : 8]
        fd_df = pd.DataFrame(columns = final_df.columns)
        print("For stock : ", stock)
        print("#################################################################################################################")
        f_df = get_results_from_each_set(df_svr, name, fd_df)
        final_df = final_df.append(f_df, ignore_index = True)
        print("#################################################################################################################")

final_df = final_df.sort_values(by = ['Company', 'Percentage'], ascending = [True, False])
final_df.to_csv('C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\df_Final_Results.csv') 

For stock :  500085
#################################################################################################################
Features Importance using Coefficients
*****************************************************************************************
Features obtained from coefficients greater than 0.1 : 
--------------------------------------
[]
*****************************************************************************************
Features Importance using p-value
*****************************************************************************************
Features obtained from p-values less than 0.02 : 
-------------------------------------------------
[]
Features obtained from p-values less than 0.05 : 
-------------------------------------------------
[]
Features obtained from p-values less than 0.1 : 
-------------------------------------------------
[]
Features obtained from p-values less than 0.2 : 
-------------------------------------------------
['Deliverable Quan

1
1.0
41.0
39.0
Values in Same direction -----> -----> 434
Values in Opposite direction <----- ----->  464
Percentage of correct direction :  0.48329621380846327
Features obtained from f-values greater than 10 : 
--------------------------------------
['Open Price GR', 'High Price GR', 'Low Price GR', 'Close Price GR', 'WAP GR', 'Spread High-Low GR', 'Alpha GR', 'Beta GR']
SVR Model fitted using columns obtained from feature importance using FValue : 
1
1.0
51.0
53.0
Values in Same direction -----> -----> 435
Values in Opposite direction <----- ----->  463
Percentage of correct direction :  0.4844097995545657
Features obtained from f-values greater than 100 : 
--------------------------------------
['Open Price GR', 'High Price GR', 'Low Price GR', 'Close Price GR', 'WAP GR', 'Beta GR']
SVR Model fitted using columns obtained from feature importance using FValue : 
1001
401.0
331.0
339.0
Values in Same direction -----> -----> 453
Values in Opposite direction <----- ----->  445
Percenta

1
1.0
1.0
2.0
Values in Same direction -----> -----> 485
Values in Opposite direction <----- ----->  412
Percentage of correct direction :  0.5406911928651059
Features obtained from p-values less than 0.1 : 
-------------------------------------------------
['Low Price GR', 'Alpha GR', 'Beta GR']
SVR Model fitted using columns obtained from feature importance using PValue : 
1
1.0
1.0
2.0
Values in Same direction -----> -----> 473
Values in Opposite direction <----- ----->  424
Percentage of correct direction :  0.5273132664437012
Features obtained from p-values less than 0.2 : 
-------------------------------------------------
['Open Price GR', 'Low Price GR', 'Close Price GR', 'Alpha GR', 'Beta GR', 'Revenue GR']
SVR Model fitted using columns obtained from feature importance using PValue : 
1
1.0
1.0
1.0
Values in Same direction -----> -----> 474
Values in Opposite direction <----- ----->  423
Percentage of correct direction :  0.5284280936454849
************************************

1
1.0
1.0
1.0
Values in Same direction -----> -----> 450
Values in Opposite direction <----- ----->  450
Percentage of correct direction :  0.5
Features obtained from f-values greater than 10 : 
--------------------------------------
['Revenue GR', 'Net Profit GR', 'EPS GR']
SVR Model fitted using columns obtained from feature importance using FValue : 
1
1.0
1.0
1.0
Values in Same direction -----> -----> 463
Values in Opposite direction <----- ----->  437
Percentage of correct direction :  0.5144444444444445
Features obtained from f-values greater than 100 : 
--------------------------------------
[]
Features obtained from f-values greater than 1000 : 
--------------------------------------
[]
*****************************************************************************************
Maximum correct direction values are obtained for SVR_FI_Coefficients_0.1 with a percentage of 0.52.
#########################################################################################################

1
1.0
1.0
1.0
Values in Same direction -----> -----> 432
Values in Opposite direction <----- ----->  467
Percentage of correct direction :  0.48053392658509453
Features obtained from p-values less than 0.1 : 
-------------------------------------------------
['No. of Trades GR', 'Alpha GR', 'Beta GR', 'Revenue GR']
SVR Model fitted using columns obtained from feature importance using PValue : 
1
1.0
1.0
2.0
Values in Same direction -----> -----> 442
Values in Opposite direction <----- ----->  457
Percentage of correct direction :  0.4916573971078977
Features obtained from p-values less than 0.2 : 
-------------------------------------------------
['No. of Trades GR', 'Alpha GR', 'Beta GR', 'Revenue GR']
SVR Model fitted using columns obtained from feature importance using PValue : 
1
1.0
1.0
2.0
Values in Same direction -----> -----> 442
Values in Opposite direction <----- ----->  457
Percentage of correct direction :  0.4916573971078977
************************************************

For stock :  500680
#################################################################################################################
Features Importance using Coefficients
*****************************************************************************************
Features obtained from coefficients greater than 0.1 : 
--------------------------------------
['Low Price GR', 'WAP GR']
SVR Model fitted using columns obtained from feature importance using Coefficients : 
5001
5301.0
5291.0
5296.0
Values in Same direction -----> -----> 436
Values in Opposite direction <----- ----->  444
Percentage of correct direction :  0.4954545454545455
*****************************************************************************************
Features Importance using p-value
*****************************************************************************************
Features obtained from p-values less than 0.02 : 
-------------------------------------------------
['Low Price GR', 'Close Price GR', 'WAP GR']

KeyboardInterrupt: 

In [24]:
for name,table in tables.items():
    print(table)

+------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                   SVR                                                                    |
+--------------------------------------------+----------------------+------------+-----------+----------+----------+-----------+-----------+
|                  Company                   |        Method        | Percentage | r^2_score | Mean_SE  | Mean_AE  | Median_AE |    EVS    |
+--------------------------------------------+----------------------+------------+-----------+----------+----------+-----------+-----------+
| 500085-CHAMBAL FERTILISERS & CHEMICALS LTD |      PValue 0.2      |  0.469438  | -0.015414 | 0.000837 | 0.019307 |  0.013089 | -0.007934 |
| 500085-CHAMBAL FERTILISERS & CHEMICALS LTD |  ForwardSelection    |  0.474328  |  0.013863 | 0.000813 | 0.019153 |  0.012848 |  0.020605 |
| 500085-CHAM

In [26]:
final_df.tail(10)

Unnamed: 0,Company,Model,Method,Percentage
126,542602-EMBASSY OFFICE PARKS REIT,Linear Regression,LinearFICoefficients0.1,0.538462
127,542602-EMBASSY OFFICE PARKS REIT,Ridge Regression,RidgeFICoefficients0.1,0.538462
128,500085-CHAMBAL FERTILISERS & CHEMICALS LTD,SVR,SVR_FI_FValue_1,0.49511
129,500112-SBIN,SVR,SVR_FI_ForwardSelection_,0.505568
130,500124-DR.REDDY'S LABORATORIES LTD,SVR,SVR_FI_PValue_0.02,0.501134
131,500180-HDFC Bank Ltd,SVR,SVR_FI_FValue_1000,0.540691
132,500182-HERO MOTOCORP LTD,SVR,SVR_FI_Coefficients_0.1,0.52
133,500209-INFOSYS LTD,SVR,SVR_FI_FValue_1,0.518354
134,500325-RELIANCE INDUSTRIES LTD,SVR,SVR_FI_BackwardElimination_,0.497219
135,500387-SHREE CEMENT LTD,SVR,SVR_FI_Coefficients_0.1,0.5
