In [1]:
import csv
import numpy as np
import pandas as pd
import glob, os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import *
from sklearn.svm import SVR
from sklearn import svm
from sklearn.metrics import mean_absolute_error,mean_squared_error
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import warnings; warnings.simplefilter('ignore')
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score

from prettytable import PrettyTable

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
import math

In [2]:
# Base directory: the notebook's current working directory. Later cells join
# it with "Data/Stock" to locate the per-stock input files.
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

# Pre-processing the Data

In [3]:
def pre_process_data(data,null_threshold):
    """Return a cleaned copy of a stock frame.

    Steps, in order:
      1. drop the 'Unix Date' and 'Date' columns (ignored if absent);
      2. drop every column whose null count is strictly greater than
         ``null_threshold`` percent of the row count;
      3. turn +/-inf into NaN and drop every row that still contains a NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is NOT modified; all work happens on a copy.
    null_threshold : float
        Percentage (0-100) of nulls above which a column is discarded.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (original row labels are kept).
    """
    # `drop` without inplace returns a new frame, so the caller's data is safe.
    cleaned = data.drop(columns=['Unix Date', 'Date'], errors='ignore')

    max_nulls = null_threshold * cleaned.shape[0] / 100
    null_counts = cleaned.isnull().sum()
    too_sparse = null_counts[null_counts > max_nulls].index
    cleaned = cleaned.drop(columns=too_sparse)

    # inf is replaced only after the sparsity check, so it does not count
    # towards a column's null percentage (same order as before).
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    return cleaned

# Removing columns based on the dependent column

In [4]:
def dependent_column(data,column):
    """Keep the "gr" feature columns plus the dependent column.

    A column is kept as a feature when its lower-cased name ends with "gr"
    and does not contain "next". The dependent column is always placed last
    and appears exactly once, even when it would also pass the feature filter
    (previously it was duplicated in that case).

    Parameters
    ----------
    data : pandas.DataFrame
    column : str
        Name of the dependent (target) column.

    Returns
    -------
    tuple
        ``(filtered_frame, column)``.
    """
    feature_cols = [
        col for col in data.columns
        if col != column and "next" not in col.lower() and col.lower().endswith("gr")
    ]
    return (data[feature_cols + [column]], column)

# Column filters for predicting the Upper Band (UB) / Lower Band (LB) percentage columns

In [5]:
def remove_next_columns_UBLB(data):
    """Return `data` without the columns whose name contains "next" (case-insensitive)."""
    kept = []
    for label in data.columns:
        if "next" in label.lower():
            continue
        kept.append(label)
    return data[kept]

In [6]:
def remove_CP_columns_UBLB(data):
    """Return `data` without the columns whose name contains "cp" (case-insensitive)."""
    kept = list(filter(lambda label: "cp" not in label.lower(), data.columns))
    return data[kept]

In [7]:
def remove_LowerBand_columns_UBLB(data, column):
    """Drop the lower-band columns and put the target column last.

    Every column whose lower-cased name contains "lv" or "low" is removed,
    then `column` is appended as the final column. The target is excluded from
    the feature filter so it appears exactly once even when its name does not
    match the removal patterns (previously it was duplicated in that case).

    Parameters
    ----------
    data : pandas.DataFrame
    column : str
        Name of the target column; must exist in `data`.

    Returns
    -------
    tuple
        ``(filtered_frame, column)``.
    """
    feature_cols = [
        col for col in data.columns
        if col != column and "lv" not in col.lower() and "low" not in col.lower()
    ]
    return (data[feature_cols + [column]], column)

In [8]:
def remove_UpperBand_columns_UBLB(data, column):
    """Drop the upper-band columns and put the target column last.

    Every column whose lower-cased name contains "hv" or "upper" is removed,
    then `column` is appended as the final column. The target is excluded from
    the feature filter so it appears exactly once even when its name does not
    match the removal patterns (previously it was duplicated in that case).

    Parameters
    ----------
    data : pandas.DataFrame
    column : str
        Name of the target column; must exist in `data`.

    Returns
    -------
    tuple
        ``(filtered_frame, column)``.
    """
    feature_cols = [
        col for col in data.columns
        if col != column and "hv" not in col.lower() and "upper" not in col.lower()
    ]
    return (data[feature_cols + [column]], column)

# Feature Selection Methods

In [9]:
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with intercept)
    per remaining candidate, added on top of the features already selected.
    The candidate with the smallest p-value is kept if that p-value is below
    `significance_level`; otherwise the search stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate feature columns.
    target : array-like
        Dependent variable, one value per row of `data`.
    significance_level : float, default 0.05
        Entry threshold for a candidate's p-value.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    candidate_features = data.columns.tolist()
    best_features = []
    while len(best_features) < len(candidate_features):
        # Keep column order (instead of a set difference) so ties are broken
        # deterministically from run to run.
        remaining_features = [col for col in candidate_features if col not in best_features]
        # Explicit float dtype: an index-only Series is object dtype on
        # pandas >= 2.0, and idxmin() is not supported for object dtype.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features + [new_column]]).astype(float)
            model = sm.OLS(target, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            # Also reached when every p-value is NaN (the comparison is False).
            break
    return best_features

In [10]:
def backward_elimination(data, target,significance_level = 0.05):
    """Backward feature elimination driven by OLS p-values.

    Starts from all columns of `data`. Each round fits an OLS model on the
    surviving features plus a constant, skips the first p-value, and removes
    the feature with the largest remaining p-value while that p-value is
    greater than or equal to `significance_level`.

    Returns the list of surviving column names.
    """
    survivors = data.columns.tolist()
    while survivors:
        design = sm.add_constant(data[survivors]).astype(float)
        fitted = sm.OLS(target, design).fit()
        # Position 0 is skipped; this assumes add_constant placed the
        # intercept there - TODO confirm for frames that already hold a constant.
        feature_pvalues = fitted.pvalues[1:]
        worst_pvalue = feature_pvalues.max()
        if not (worst_pvalue >= significance_level):
            break
        survivors.remove(feature_pvalues.idxmax())
    return survivors

# Grid search for the SVR regularisation parameter C

In [11]:
def Model(dataset,y):
    """Coarse-to-fine grid search for the SVR parameter C.

    Four rounds of `bestParameters` are run. The first scans C over
    ``arange(1, 10000, 1000)``. Each later round scans a window centred on the
    previous best C with half-width 1000, 100 and 10, using a step ten times
    smaller than the round before (100, 10, 1). The best C of each round is
    printed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus the target column.
    y : str
        Name of the target column inside `dataset`.

    Returns
    -------
    The best C found in the final round.
    """
    X = dataset.loc[:, dataset.columns != y]
    Y = dataset.loc[:, dataset.columns == y]
    start, end, step = 1, 10000, 1000
    c = None
    # One entry per round; None marks the last round, after which no new
    # window is needed.
    for half_width in (1000, 100, 10, None):
        best, step = bestParameters(start, end, step, X, Y)
        c = best['C']
        print(c)
        step = step / 10
        if half_width is not None:
            start = c - half_width
            # SVR requires C > 0. Walk the lower bound up on the same grid
            # until it is positive (e.g. best C == 1 would otherwise start
            # the next scan at -999).
            while start <= 0:
                start += step
            end = c + half_width
    return c

In [12]:
def bestParameters(start,end,step,X,Y):
    """Grid-search C for an RBF SVR on a 70 % training split of (X, Y).

    C is scanned over ``np.arange(start, end, step)`` with gamma fixed at
    0.0001 and epsilon at 0.01, scored by negative mean squared error.

    Returns ``(best_params, step)``, where `step` is handed back unchanged.
    """
    # Only the training part of the split takes part in the search.
    X_train, _, y_train, _ = train_test_split(X, Y, test_size=0.3, random_state=42)
    param_grid = dict(
        kernel=['rbf'],
        C=np.arange(start, end, step),
        gamma=[0.0001],
        epsilon=[0.01],
    )
    search = GridSearchCV(svm.SVR(gamma = 'auto'), param_grid, scoring = 'neg_mean_squared_error')
    search.fit(X_train, y_train)
    return (search.best_params_, step)

# Finding results from each set of important features

In [13]:
# Header of the per-model PrettyTable summaries (assigned to each table's field_names).
columns = ['Company','Model','Method', 'Y-Column', 'RMSE', 'MSE', 'MAE', 'EVS', 'r2_score']

In [14]:
# Security code (as a string) -> security name, read from Data/Equity.csv.
# The file is resolved against `path` (the working directory captured at the
# top of the notebook) instead of a user-specific absolute path.
df_equity = pd.read_csv(os.path.join(path, "Data", "Equity.csv"))
security_numbers = df_equity["Security Code"].tolist()
security_names = df_equity["Security Name"].tolist()
companies = {str(code): security for code, security in zip(security_numbers, security_names)}
# Manual additions / overrides on top of the names read from the CSV.
companies["542602"] = "Embassy Office Parks REIT"
companies["500112"] = "State Bank of India"

In [15]:
# One PrettyTable per model name; every table gets `columns` as its header row.
models = ["SVR"]
tables = {model:PrettyTable() for model in models}
for name,table in tables.items():
    table.field_names = columns

In [16]:
# Column layout of the results DataFrame. Note the order differs from `columns`
# above (Y-Column comes second, MAE comes before MSE).
final_columns = ['Company','Y-Column','Model','Method', 'RMSE','MAE','MSE','EVS', 'r2_score']

In [17]:
# Empty results frame; the driver cell further down adds rows to it.
final_df = pd.DataFrame(columns = final_columns)
final_df

Unnamed: 0,Company,Y-Column,Model,Method,RMSE,MAE,MSE,EVS,r2_score


In [18]:
# Display the column index of the (still empty) results frame.
final_df.columns

Index(['Company', 'Y-Column', 'Model', 'Method', 'RMSE', 'MAE', 'MSE', 'EVS',
       'r2_score'],
      dtype='object')

In [19]:
def create_pretty_table(name, model, result, method, column):
    """Append one result row to the PrettyTable of `model`.

    Parameters
    ----------
    name : str
        Stock file name; characters 10-15 are used as the security code.
    model : str
        Key into the global `tables` dict; also used as the table title.
    result : dict
        Metric name -> value. Must hold every metric named in the table header.
    method : str
        Text for the 'Method' cell.
    column : str
        Text for the 'Y-Column' cell.
    """
    table = tables[model]
    stock_code = name[10 : 16]
    # Read the metrics in header order rather than dict order: the header is
    # RMSE, MSE, MAE, ... while callers build the dict as RMSE, MAE, MSE, ...,
    # which used to put MSE and MAE under each other's column.
    metric_names = table.field_names[4:]
    metric_values = [round(result[metric], 6) for metric in metric_names]
    table.add_row([stock_code + "-" + companies[stock_code], model, method, column] + metric_values)
    table.title = model

In [20]:
def fit_SVR(df, column, method, value, name, results):
    """Tune, fit and evaluate an RBF SVR on `df`, then record its metrics.

    The last column of `df` is treated as the target; all others are features.
    Features and target are standardised with scalers fitted on the training
    split only. Predictions are converted back to the target's original scale
    before scoring.

    Side effects:
      * writes predicted vs actual values to a CSV under Data/UBLB_Results;
      * stores the metrics in `results["SVR_FI_<method>_<value>"]`;
      * adds a row to the "SVR" PrettyTable.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target column.
    column : str
        Name of the target column (expected to be the last column of `df`).
    method : str
        Feature-selection method name, used in messages, keys and file names.
    value : str
        Threshold used by the method (may be an empty string).
    name : str
        Stock file name; characters 10-15 are used as the security code.
    results : dict
        Mutated in place with the metrics of this fit.
    """
    print("SVR Model fitted using columns obtained from feature importance using " + method + " : ")
    c = Model(df, column)
    X = df[df.columns[:-1]]
    Y = df[column]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Separate scalers for features and target. The test features are only
    # transformed with the training statistics, never re-fitted.
    x_scaler = StandardScaler()
    X_train = pd.DataFrame(x_scaler.fit_transform(X_train))
    X_test = pd.DataFrame(x_scaler.transform(X_test))

    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()

    # Use the C found by the grid search (it used to be computed and then
    # ignored in favour of a hard-coded 144).
    svr_rbf = SVR(kernel='rbf', C = c, gamma = 0.0001, epsilon = 0.01)
    y_rbf_scaled = svr_rbf.fit(X_train, y_train_scaled).predict(X_test).astype(float)

    # inverse_transform expects a 2-D array; flatten again afterwards.
    y_rbf = y_scaler.inverse_transform(y_rbf_scaled.reshape(-1, 1)).ravel()

    pred_actual = pd.DataFrame(list(zip(y_rbf, y_test)),
                   columns =['Predicted Values', 'Actual Values'])
    out_name = name[10 : 16] + "_SVR_" + column + "_FI_" + method + "_" + str(value) + ".csv"
    # Resolved against `path` (the notebook's working directory) rather than a
    # user-specific absolute path.
    pred_actual.to_csv(os.path.join(path, "Data", "UBLB_Results", out_name), index=False)

    r2_sc = r2_score(y_test, y_rbf)
    mse = mean_squared_error(y_test, y_rbf)
    # `metrics` was never imported; derive RMSE from the MSE computed above.
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y_test, y_rbf)
    var = explained_variance_score(y_test, y_rbf)

    model_result = {'RMSE' : rmse, 'MAE' : mae, 'MSE' : mse, 'EVS' : var, 'r2_score' : r2_sc}

    print("R2_Score ----> ", model_result["r2_score"])

    # Store a copy so later changes to model_result cannot alter the record.
    results["SVR_FI_" + method + "_" + str(value)] = dict(model_result)

    create_pretty_table(name , "SVR", model_result, method + " " + str(value), column)

In [21]:
def get_results_from_FI_Coeffiecients(df, name, column, results):
    """Select features by linear-regression coefficient size and fit an SVR.

    A LinearRegression is fitted on all feature columns (every column of `df`
    except the last). For each threshold in `coef`, the features whose
    coefficient, rounded to 6 decimals, exceeds the threshold in absolute
    value are kept and passed, together with the target column, to `fit_SVR`.
    Thresholds that select no feature are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target column.
    name : str
        Stock file name, forwarded to `fit_SVR`.
    column : str
        Name of the target column.
    results : dict
        Forwarded to `fit_SVR`, which stores its metrics in it.
    """
    print("Features Importance using Coefficients")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    model_linear = LinearRegression(fit_intercept=True)
    model_linear.fit(X, Y)
    rc_coef = list(zip(X.columns, [round(i,6) for i in list(model_linear.coef_)]))
    coef = [0.1]
    method = "Coefficients"
    for cf in coef:
        # Built fresh for every threshold, like the p-value and f-value
        # variants. A single shared list would carry the previous threshold's
        # features and the appended target into the next round.
        coef_features = [feature for feature, weight in rc_coef if abs(weight) > cf]
        print("Features obtained from coefficients greater than " + str(cf) + " : ")
        print("--------------------------------------")
        print(coef_features)
        if (len(coef_features) == 0):
            continue
        coef_features.append(column)
        df_fic = df[coef_features]
        fit_SVR(df_fic, column, method, str(cf), name, results)
    print("*****************************************************************************************")

In [22]:
def get_results_from_FI_PValue(df, name, column, results):
    """Select features by OLS p-value and fit an SVR per threshold.

    An OLS model (no intercept added) is fitted on the 70 % training split of
    the feature columns. For each threshold 0.02, 0.05, 0.1 and 0.2 the
    features with a p-value below it are handed, with the target column, to
    `fit_SVR`. Thresholds that select nothing are skipped.
    """
    print("Features Importance using p-value")
    print("*****************************************************************************************")
    features = df[df.columns[:-1]]
    target = df[column].values
    split = train_test_split(features, target, test_size=0.3, random_state=0)
    # split is (X_train, X_test, Y_train, Y_test); only the training parts are used.
    train_matrix = np.array(split[0], dtype=float)
    fitted = sm.OLS(split[2], train_matrix).fit()
    # Pair p-values with column labels by position; zip stops at the shorter
    # sequence, so the trailing target label is never paired.
    labelled_pvalues = list(zip(list(df.columns), list(fitted.pvalues)))
    method = "PValue"
    for pv in [0.02, 0.05, 0.1, 0.2]:
        pval_features = [label for label, p_value in labelled_pvalues if p_value < pv]
        print("Features obtained from p-values less than " + str(pv) + " : ")
        print("-------------------------------------------------")
        print(pval_features)
        if not pval_features:
            continue
        fit_SVR(df[pval_features + [column]], column, method, str(pv), name, results)
    print("*****************************************************************************************")

In [23]:
def get_results_from_FI_FValues(df, name, column, results):
    """Select features by univariate F-statistic and fit an SVR per threshold.

    `f_regression` scores every feature column against the target. For each
    threshold 1, 10, 100 and 1000 the features whose absolute F-value exceeds
    it are handed, with the target column, to `fit_SVR`. Thresholds that
    select nothing are skipped.
    """
    print("Features Importance using f-value")
    print("*****************************************************************************************")
    features = df[df.columns[:-1]]
    target = df[column].values
    # f_regression returns (F-values, p-values); only the F-values are used.
    f_scores = f_regression(features, target)[0]
    labelled_scores = list(zip(features.columns, f_scores))
    method = "FValue"
    for fv in [1, 10, 100, 1000]:
        fval_features = [label for label, score in labelled_scores if abs(score) > fv]
        print("Features obtained from f-values greater than " + str(fv) + " : ")
        print("--------------------------------------")
        print(fval_features)
        if not fval_features:
            continue
        fit_SVR(df[fval_features + [column]], column, method, str(fv), name, results)
    print("*****************************************************************************************")

In [24]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    """Run forward selection on the feature columns and fit an SVR on the result.

    The selected features are printed. When at least one was selected, they
    are passed together with the target column to `fit_SVR`, with an empty
    threshold value.
    """
    print("Features Importance using Forward Selection Method")
    print("*****************************************************************************************")
    feature_frame = df1[df1.columns[:-1]]
    target_values = df1[column].values
    forward_features = forward_selection(feature_frame, target_values)
    print("Features obtained from Forward Selection method : ") 
    print("--------------------------------------")
    print(forward_features)
    if forward_features:
        selected_with_target = forward_features + [column]
        fit_SVR(df1[selected_with_target], column, "ForwardSelection", '', name, results)
    print("*****************************************************************************************")

In [25]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    """Run backward elimination on the feature columns and fit an SVR on the result.

    The surviving features are printed. When at least one survived, they are
    passed together with the target column to `fit_SVR`, with an empty
    threshold value.
    """
    print("Features Importance using Backward Elimination Method")
    print("*****************************************************************************************")
    feature_frame = df1[df1.columns[:-1]]
    target_values = df1[column].values
    backward_features = backward_elimination(feature_frame, target_values)
    print("Features obtained from Backward Elimination method : ") 
    print("--------------------------------------")
    print(backward_features)
    if backward_features:
        selected_with_target = backward_features + [column]
        fit_SVR(df1[selected_with_target], column, "BackwardElimination", '', name, results)
    print("*****************************************************************************************")

In [26]:
def get_results_from_each_set(data, name, final_df, column):
    """Run the feature-selection + SVR pipeline for one target column.

    The frame is cleaned with `pre_process_data` (60 % null threshold), the
    "next" and "cp" columns are removed, and the lower- or upper-band columns
    are removed depending on whether `column` contains "Lower". The enabled
    feature-selection methods then fit their SVR models, and the result with
    the highest r2_score is appended to `final_df`.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw stock frame.
    name : str
        Stock file name; characters 10-15 are used as the security code.
    final_df : pandas.DataFrame
        Frame the best result row is appended to.
    column : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        `final_df` plus one row, or `final_df` unchanged when no method
        produced a result.
    """
    df = pre_process_data(data, 60)

    df1 = remove_next_columns_UBLB(df)
    df1 = remove_CP_columns_UBLB(df1)
    if ("Lower" in column):
        (df1, column) = remove_LowerBand_columns_UBLB(df1, column)
    else:
        (df1, column) = remove_UpperBand_columns_UBLB(df1, column)

    results = {}
    get_results_from_FI_Coeffiecients(df1, name, column, results)
    # The other feature-selection methods are currently switched off:
#     get_results_from_FI_PValue(df1, name, column, results)
#     get_results_from_FI_ForwardSelection(df1, name, column, results)
#     get_results_from_FI_BackwardElimination(df1, name, column, results)
#     get_results_from_FI_FValues(df1, name, column, results)

    if not results:
        # Every method skipped its fit (no feature passed its threshold), so
        # there is nothing to rank.
        print("No feature-selection method produced a model for {}.".format(column))
        return final_df

    # Ascending sort by r2_score; the last entry is the best one.
    best_method, best_scores = sorted(results.items(), key=lambda item: item[1]['r2_score'])[-1]
    max_row = {'Company' : name[10 : 16] + "-" + companies[name[10 : 16]], 'Y-Column' : column, 'Model' : 'SVR',
               'Method' : best_method, 'RMSE' : best_scores['RMSE'], 'MAE' : best_scores['MAE'],
               'MSE' : best_scores['MSE'], 'EVS' : best_scores['EVS'],'r2_score' : best_scores['r2_score']}
    # DataFrame.append was removed in pandas 2.0; concat works on old and new versions.
    final_df = pd.concat([final_df, pd.DataFrame([max_row])], ignore_index = True)
    print("Maximum correct direction values are obtained for {} with r2_score of {}.".format(best_method, best_scores['r2_score']))
    return final_df

In [27]:
%%time

for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("UBLBPASTgr500033"):
        name = os.path.join(path, "Data\Stock\\" + filename).split("\\")[-1]
        stock = name[10 : 16]
        fd_df = pd.DataFrame(columns = final_df.columns)
        y_columns = ["LowerBandInLast3Months", "LowerBandInLast6Months", "LowerBandInLast12Months",
                    "LowerBandInLast24Months", "LowerBandInLast36Months", "UpperBandInLast3Months",
                    "UpperBandInLast6Months", "UpperBandInLast12Months", "UpperBandInLast24Months", "UpperBandInLast36Months"]
        for i in range(len(y_columns)):
            col = y_columns[i]
            df_svr = pd.read_csv(os.path.join(path,"Data\Stock\\" + filename))
            print("For stock : ", stock, "and for Y-Column :", col)
            print("#################################################################################################################")
            f_df = get_results_from_each_set(df_svr, name, fd_df, col)
            final_df = final_df.append(f_df, ignore_index = True)
            print("#################################################################################################################")
            break
        break
        
# final_df = final_df.sort_values(by = ['Company', 'Y-Column','r2_score'], ascending = [True, True, False])
# final_df.to_csv('C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\Results_SVR.csv') 

For stock :  500033 and for Y-Column : LowerBandInLast3Months
#################################################################################################################
Features Importance using Coefficients
*****************************************************************************************
Features obtained from coefficients greater than 0.1 : 
--------------------------------------
['Open Price', 'High Price', 'Close Price', '% Deli. Qty to Traded Qty', 'Spread Close-Open', 'Dividend Value', '% Return of Company', '% Return of SP500', 'Beta', 'Rate', '% YTD of Company', '% YTD of SP500', 'Alpha', 'Expenditure', 'Net Profit', 'EPS', 'year', 'Net Profit  last 2 quarters', 'EPS last 2 quarters', 'Net Profit  last 4 quarters', 'EPS last 4 quarters', 'Net Profit  last 8 quarters', 'EPS last 8 quarters', 'p/e', 'Open Price GR', 'High Price GR', 'WAP GR', 'No.of Shares GR', 'No. of Trades GR', 'Total Turnover (Rs.) GR', 'Deliverable Quantity GR', '% Deli. Qty to Traded Qty GR',

KeyboardInterrupt: 

In [28]:
# Print the summary table of every model.
for name,table in tables.items():
    print(table)

+---------+-------+--------+----------+------+-----+-----+-----+----------+
| Company | Model | Method | Y-Column | RMSE | MSE | MAE | EVS | r2_score |
+---------+-------+--------+----------+------+-----+-----+-----+----------+
+---------+-------+--------+----------+------+-----+-----+-----+----------+


In [29]:
# Show the last 10 rows of the collected results.
final_df.tail(10)

Unnamed: 0,Company,Y-Column,Model,Method,RMSE,MAE,MSE,EVS,r2_score
