In [90]:
# %pip install mlxtend

In [2]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from math import sqrt
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from prettytable import PrettyTable
import time 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patches as patches
import warnings; warnings.simplefilter('ignore')
import sys
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from prettytable import PrettyTable

In [3]:
# The notebook's working directory; later joined with "Data/Stock" to locate the input files.
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

# PreProcessing Data

In [4]:
def pre_process_data(data,null_threshold):
    """Return a cleaned copy of ``data`` ready for modelling.

    Steps, in order:
      1. drop the 'Unix Date' and 'Date' columns;
      2. drop every column whose null count exceeds ``null_threshold`` percent
         of the row count;
      3. turn +/-inf into NaN and drop every row that still contains a NaN.

    The input frame is left untouched, so the function can safely be called
    again on the same frame (the previous in-place version raised KeyError on
    a second call because the date columns were already gone).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; must contain the 'Unix Date' and 'Date' columns.
    null_threshold : int or float
        Maximum percentage (0-100) of null values a column may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving columns and rows (original index labels kept).
    """
    cleaned = data.drop(columns=['Unix Date', 'Date'])
    # Null counts are taken before inf -> NaN conversion, as in the original logic.
    null_counts = cleaned.isnull().sum()
    max_nulls = null_threshold * cleaned.shape[0] / 100
    kept_columns = [col for col in cleaned.columns if not (max_nulls < null_counts[col])]
    cleaned = cleaned[kept_columns]
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)
    return cleaned.dropna(axis=0)

# Removing columns based on dependent column

In [5]:
def dependent_column(data,column):
    """Keep the growth-rate predictors plus the chosen target column.

    A column is kept as a predictor when its lower-cased name ends with "gr"
    and does not contain "next". ``column`` is appended last, so the target is
    always the final column of the returned frame.

    Returns a ``(frame, column)`` tuple.
    """
    selected = []
    for label in data.columns:
        lowered = label.lower()
        if lowered.endswith("gr") and "next" not in lowered:
            selected.append(label)
    selected.append(column)
    return (data[selected], column)

# For predicting UB, LB percentage columns

In [16]:
def dependent_column_UBLB(data,column):
    """Keep every column that is not a "next" column, with the target moved last.

    Predictors are all columns whose lower-cased name does not contain "next"
    and that are not ``column`` itself; ``column`` is then appended as the
    final column.

    Returns a ``(frame, column)`` tuple.
    """
    predictors = []
    for label in data.columns:
        if label == column or "next" in label.lower():
            continue
        predictors.append(label)
    ordered = predictors + [column]
    return (data[ordered], column)

# OLS Regression

In [95]:
def OLS_Regression(X_train,Y_train):
    """Fit a statsmodels OLS model and return a few goodness-of-fit statistics.

    ``X_train`` is converted to a float array and used as the design matrix
    exactly as given (no constant column is added here).

    Returns a dict with the keys "rsquared_adj", "aic", "bic" and "fvalue".
    """
    design = np.array(X_train, dtype=float)
    fitted = sm.OLS(Y_train, design).fit()
    return {
        "rsquared_adj": fitted.rsquared_adj,
        "aic": fitted.aic,
        "bic": fitted.bic,
        "fvalue": fitted.fvalue,
    }

# Linear Regression

In [96]:
def linear_regression(data, y):
    """Fit an ordinary linear regression on a 70/30 split and report test metrics.

    All columns except the last one are used as predictors; ``data[y]`` is the
    target. The split is fixed with ``random_state=0``.

    Returns a dict whose keys are, in this order: "root_mean_squared_error",
    "mean_absolute_error", "mean_squared_error", "OLS" (dict from
    ``OLS_Regression`` on the training split), "Confidence" (R^2 on the test
    split), "Predicted" and "Actual". Callers iterate over the dict, so the
    key order is part of the contract.
    """
    predictors = data[data.columns[:-1]]
    response = data[y].values
    X_train, X_test, Y_train, Y_test = train_test_split(
        predictors, response, test_size=0.3, random_state=0
    )

    regressor = LinearRegression(fit_intercept=True)
    regressor.fit(X_train, Y_train)
    predictions = regressor.predict(X_test)

    mse = metrics.mean_squared_error(Y_test, predictions)
    report = {
        "root_mean_squared_error": sqrt(mse),
        "mean_absolute_error": metrics.mean_absolute_error(Y_test, predictions),
        "mean_squared_error": mse,
        "OLS": OLS_Regression(X_train, Y_train),
        "Confidence": regressor.score(X_test, Y_test),
        "Predicted": predictions,
        "Actual": Y_test,
    }
    return report

# linear regression with forward selection

In [97]:
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with a constant
    added) per remaining candidate, on top of the features already chosen, and
    records the candidate's p-value. The candidate with the smallest p-value is
    added if that p-value is below ``significance_level``; otherwise selection stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns.
    target : array-like
        Response values, one per row of ``data``.
    significance_level : float, default 0.05
        A candidate must have a p-value strictly below this to be added.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    candidates = data.columns.tolist()
    best_features = []
    # Stop once every candidate has been selected; the previous condition
    # (len(initial_features) > 0) never changed and relied on a NaN comparison to exit.
    while len(best_features) < len(candidates):
        # Keep the original column order so ties are broken deterministically
        # (a set difference gave a hash-dependent order).
        remaining_features = [col for col in candidates if col not in best_features]
        # Explicit float dtype: without it the empty Series is object-typed on
        # current pandas, which breaks/penalises min() and idxmin().
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features + [new_column]]).astype(float)
            model = sm.OLS(target, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        # Written as "not (<)" so that a NaN minimum also stops the search.
        if not (min_p_value < significance_level):
            break
        best_features.append(new_pval.idxmin())
    return best_features

In [98]:
def linear_regression_forward_selection(data,y):
    """Run ``forward_selection`` and fit ``linear_regression`` on the chosen features.

    Predictors are all columns except the last; ``data[y]`` is the target.
    The selected feature names are printed before fitting.
    """
    predictors = data[data.columns[:-1]]
    response = data[y].values
    chosen = forward_selection(predictors, response)
    print("Features obtained from Forward Selection : ")
    print(chosen)
    reduced = data[chosen + [y]]
    return linear_regression(reduced, y)

# linear regression with backward elimination

In [99]:
def backward_elimination(data, target,significance_level = 0.05):
    """Backward feature elimination driven by OLS p-values.

    Starting from all columns of ``data``, repeatedly fits an OLS model (with a
    constant added), looks at the p-values after the first entry, and removes
    the feature with the largest p-value while that p-value is at least
    ``significance_level``.

    Returns the list of surviving column names.
    """
    features = data.columns.tolist()
    while features:
        design = sm.add_constant(data[features]).astype(float)
        fitted = sm.OLS(target, design).fit()
        # Skip the first p-value (the added constant's position).
        p_values = fitted.pvalues[1:]
        worst = p_values.max()
        # "not (>=)" keeps the original behaviour of stopping on a NaN maximum.
        if not (worst >= significance_level):
            break
        features.remove(p_values.idxmax())
    return features

In [100]:
def linear_regression_backward_selection(data,y):
    """Run ``backward_elimination`` and fit ``linear_regression`` on the survivors.

    Predictors are all columns except the last; ``data[y]`` is the target.
    The surviving feature names are printed before fitting.
    """
    predictors = data[data.columns[:-1]]
    response = data[y].values
    survivors = backward_elimination(predictors, response)
    print("Features obtained from Backward Elimination : ")
    print(survivors)
    reduced = data[survivors + [y]]
    return linear_regression(reduced, y)

# Using Inbuilt Forward Selection Method

In [101]:
def forward_selection_inbuilt(X,Y,k,score):
    """Select ``k`` features with mlxtend's SequentialFeatureSelector (forward mode).

    A ``LinearRegression`` estimator is scored with ``score`` and ``cv=0``.
    Returns the selected feature names as a list.
    """
    selector = SFS(
        LinearRegression(),
        k_features=k,
        forward=True,
        floating=False,
        scoring=score,
        cv=0,
    )
    selector.fit(X, Y)
    return list(selector.k_feature_names_)

In [102]:
def linear_regression_forward_selection_inbuit(data,y):
    """Tabulate forward-selected feature subsets for every subset size and scoring metric.

    Predictors are all columns except the last; ``data[y]`` is the target.
    For each subset size ``k`` (1 .. number of predictors) and each scoring
    name, ``forward_selection_inbuilt`` is run and the resulting list of
    feature names is stored in the cell ``(k, score)``.

    The table is written to "forwardFeatures.csv" (without the index) and returned.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    # Bound k by the number of predictors in X. data.shape[1] also counts the
    # target column, so the old range asked for one feature more than exists.
    subset_sizes = range(1, X.shape[1] + 1)
    df = pd.DataFrame(columns=scores, index=subset_sizes)
    for k in subset_sizes:
        for score in scores:
            # .at stores the whole list as a single cell value; .loc can try to
            # broadcast a list over the selection instead.
            df.at[k, score] = forward_selection_inbuilt(X, Y, k, score)
    df.to_csv("forwardFeatures.csv",index=None)
    return df

# Using Inbuilt Backward Elimination Method

In [103]:
def backward_selection_inbuilt(X,Y,k,score):
    """Select ``k`` features with mlxtend's SequentialFeatureSelector (backward mode).

    A ``LinearRegression`` estimator is scored with ``score`` and ``cv=0``.
    Returns the selected feature names as a list.
    """
    selector = SFS(
        LinearRegression(),
        k_features=k,
        forward=False,
        floating=False,
        scoring=score,
        cv=0,
    )
    selector.fit(X, Y)
    return list(selector.k_feature_names_)

In [104]:
def linear_regression_backward_selection_inbuit(data,y):
    """Tabulate backward-selected feature subsets for every subset size and scoring metric.

    Predictors are all columns except the last; ``data[y]`` is the target.
    For each subset size ``k`` (1 .. number of predictors) and each scoring
    name, ``backward_selection_inbuilt`` is run and the resulting list of
    feature names is stored in the cell ``(k, score)``.

    The table is written to "backwardFeatures.csv" (without the index) and returned.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    # Bound k by the number of predictors in X. data.shape[1] also counts the
    # target column, so the old range asked for one feature more than exists.
    subset_sizes = range(1, X.shape[1] + 1)
    df = pd.DataFrame(columns=scores, index=subset_sizes)
    for k in subset_sizes:
        for score in scores:
            # .at stores the whole list as a single cell value; .loc can try to
            # broadcast a list over the selection instead.
            df.at[k, score] = backward_selection_inbuilt(X, Y, k, score)
    df.to_csv("backwardFeatures.csv",index=None)
    return df

# Ridge Regression

In [105]:
def bestparams_ridge(alpha,X_train,Y_train):
    """Grid-search the Ridge ``alpha`` and return the best value.

    Parameters
    ----------
    alpha : array-like
        Candidate alpha values.
    X_train, Y_train
        Training predictors and target used for the cross-validated search.

    Returns
    -------
    The ``alpha`` of the best estimator found, scored by R^2.
    """
    # The template estimator is passed unfitted: GridSearchCV fits its own
    # copies per candidate, so fitting Ridge(alpha=1) beforehand was wasted work.
    grid = GridSearchCV(estimator=Ridge(alpha=1), param_grid=dict(alpha=alpha), scoring='r2')
    grid.fit(X_train,Y_train)
    return grid.best_estimator_.alpha

In [106]:
def ridge_regression(data,y):
    """Tune and fit a Ridge regression on a 70/30 split and report test metrics.

    All columns except the last one are used as predictors; ``data[y]`` is the
    target. Alpha is tuned in two passes with ``bestparams_ridge``: a coarse
    grid, then a unit-step grid centred on the coarse winner.

    Returns a dict whose keys are, in this order: "root_mean_squared_error",
    "mean_absolute_error", "mean_squared_error", "OLS" (dict from
    ``OLS_Regression`` on the training split), "Confidence" (R^2 on the test
    split), "Predicted" and "Actual". Callers iterate over the dict, so the
    key order is part of the contract.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over a fixed set of alpha values.
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_ridge(alpha,X_train,Y_train)

    # Refinement pass around the coarse winner. The raw window best-10..best+10
    # dips below zero, and a negative penalty strength is not a valid alpha, so
    # those candidates are dropped before searching.
    alpha1 = np.arange(best-10,best+10)
    alpha1 = alpha1[alpha1 >= 0]
    if alpha1.size == 0:
        alpha1 = np.array([best])
    best_alpha = bestparams_ridge(alpha1,X_train,Y_train)

    # Final model with the refined alpha, fitted on the training split.
    clf = Ridge(alpha=best_alpha)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    confidence = clf.score(X_test, Y_test)
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values, "Confidence" : confidence, "Predicted" : pred, "Actual" : Y_test}

# Lasso Regression

In [107]:
def bestparams_lasso(alpha,X_train,Y_train):
    """Grid-search the Lasso ``alpha`` and return the best value.

    Parameters
    ----------
    alpha : array-like
        Candidate alpha values.
    X_train, Y_train
        Training predictors and target used for the cross-validated search.

    Returns
    -------
    The ``alpha`` of the best estimator found, scored by R^2.
    """
    # The template estimator is passed unfitted: GridSearchCV fits its own
    # copies per candidate, so fitting Lasso(alpha=1) beforehand was wasted work.
    grid = GridSearchCV(estimator=Lasso(alpha=1), param_grid=dict(alpha=alpha), scoring='r2')
    grid.fit(X_train,Y_train)
    return grid.best_estimator_.alpha

In [108]:
def lasso_regression(data,y):
    """Tune and fit a Lasso regression on a 70/30 split and report test metrics.

    All columns except the last one are used as predictors; ``data[y]`` is the
    target. Alpha is tuned in two passes with ``bestparams_lasso``: a coarse
    grid, then a unit-step grid centred on the coarse winner.

    Returns a dict whose keys are, in this order: "root_mean_squared_error",
    "mean_absolute_error", "mean_squared_error", "OLS" (dict from
    ``OLS_Regression`` on the training split), "Confidence" (R^2 on the test
    split), "Predicted" and "Actual". Callers iterate over the dict, so the
    key order is part of the contract.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over a fixed set of alpha values.
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_lasso(alpha,X_train,Y_train)

    # Refinement pass around the coarse winner. The raw window best-10..best+10
    # dips below zero, and a negative penalty strength is not a valid alpha, so
    # those candidates are dropped before searching.
    alpha1 = np.arange(best-10,best+10)
    alpha1 = alpha1[alpha1 >= 0]
    if alpha1.size == 0:
        alpha1 = np.array([best])
    best_alpha = bestparams_lasso(alpha1,X_train,Y_train)

    # Final model with the refined alpha, fitted on the training split.
    clf = Lasso(alpha=best_alpha)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    confidence = clf.score(X_test, Y_test)
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)
    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values, "Confidence" : confidence, "Predicted" : pred, "Actual" : Y_test}

# Elastic Net Regression

In [109]:
def bestparams_elastic(alphas,l1,X_train,Y_train):
    """Grid-search ElasticNet ``alpha`` and ``l1_ratio`` and return the best pair.

    Parameters
    ----------
    alphas : array-like
        Candidate alpha values.
    l1 : array-like
        Candidate l1_ratio values.
    X_train, Y_train
        Training predictors and target used for the cross-validated search.

    Returns
    -------
    tuple
        ``(alpha, l1_ratio)`` of the best estimator found, scored by R^2.
    """
    # The template estimator is passed unfitted: GridSearchCV fits its own
    # copies per candidate, so fitting it beforehand was wasted work.
    template = ElasticNet(alpha=1, l1_ratio=0.2)
    param_grid = dict(alpha=alphas, l1_ratio=l1)

    grid = GridSearchCV(estimator=template, param_grid=param_grid, scoring='r2')
    grid_result = grid.fit(X_train, Y_train)

    best = grid_result.best_estimator_
    return (best.alpha, best.l1_ratio)

In [110]:
def elastic_net_regression(data,y):
    """Tune and fit an ElasticNet regression on a 70/30 split and report test metrics.

    All columns except the last one are used as predictors; ``data[y]`` is the
    target. ``alpha`` and ``l1_ratio`` are tuned with ``bestparams_elastic``:
    a coarse grid first and, unless the coarse alpha is 0, a second search over
    alphas between a tenth and ten times the coarse winner.

    Returns a dict whose keys are, in this order: "root_mean_squared_error",
    "mean_absolute_error", "mean_squared_error", "OLS" (dict from
    ``OLS_Regression`` on the training split), "Confidence" (R^2 on the test
    split), "Predicted" and "Actual". Callers iterate over the dict, so the
    key order is part of the contract.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over fixed alpha / l1_ratio grids.
    alpha = np.array([0,0.1,0.001,0.0001,1])
    l1_ratio = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

    coarse_alpha, coarse_l1 = bestparams_elastic(alpha,l1_ratio,X_train,Y_train)

    if (coarse_alpha == 0):
        # Nothing to refine around zero; keep the coarse winner.
        final_alpha, final_l1 = coarse_alpha, coarse_l1
    else:
        # np.arange steps by 1 from coarse_alpha/10, so for small alphas the
        # window holds a single value that is not the coarse winner (0.1 -> [0.01]).
        # Adding the coarse winner to the candidates guarantees the second pass
        # can never select something that scored worse than the first pass.
        alpha1 = np.union1d(np.arange(coarse_alpha/10,coarse_alpha*10), [coarse_alpha])
        final_alpha, final_l1 = bestparams_elastic(alpha1,l1_ratio,X_train,Y_train)

    clf = ElasticNet(alpha=final_alpha,l1_ratio = final_l1)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    confidence = clf.score(X_test, Y_test)

    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)
    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values, "Confidence" : confidence, "Predicted" : pred, "Actual" : Y_test}
    

In [111]:
def coeff_vs_Regularization(X_train,Y_train):
    """Plot ElasticNet coefficients against the regularisation strength.

    Fits one ElasticNet per alpha for 200 log-spaced alphas between 1e-10 and
    1e-2, then draws every coefficient as a line over a log-scaled alpha axis.
    """
    alphas = np.logspace(-10, -2, 200)
    coefs = [ElasticNet(alpha=strength).fit(X_train, Y_train).coef_ for strength in alphas]

    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    left, right = ax.get_xlim()
    ax.set_xlim((right, left))  # reverse axis
    plt.xlabel('alpha(log scale)')
    plt.ylabel('Coefficients')
    plt.title('ElasticNet - Coefficients Vs Regularization')
    plt.axis('tight')
    plt.show()

# Finding results from each set of important features

In [112]:
# Header names for the per-model PrettyTable reports (assigned to table.field_names below).
columns = ['Company','Method','Percentage', 'RMSE', 'MAE', 'MSE','Confidence', 'rsquared_adj']

In [113]:
# companies = {"500112" : "SBIN" ,
# "500325" : "RELIANCE INDUSTRIES LTD",
# "532540" : "TATA CONSULTANCY SERVICES LTD" ,
# "500209" : "INFOSYS LTD", 
# "532174" : "ICICI BANK LTD", 
# "507685" : "WIPRO LTD", 
# "530965" : "INDIAN OIL CORPORATION LTD", 
# "500182" : "HERO MOTOCORP LTD", 
# "532210" : "CITY UNION BANK LTD", 
# "500180" : "HDFC Bank Ltd",
# "500680" : "PFIZER LTD", 
# "506395" : "COROMANDEL iNTERNATIONAL LTD",
# "500770" : "TATA CHEMICALS LTD", 
# "500085" : "CHAMBAL FERTILISERS & CHEMICALS LTD", 
# "501425" : "BOMBAY BURMAH TRADING CORP.LTD", 
# "532899" : "KAVERI SEED COMPANY LTD", 
# "537291" : "NATH BIO-GENES (INDIA) LTD", 
# "500790" : "NESTLE INDIA LTD", 
# "500825" : "BRITANNIA INDUSTRIES LTD", 
# "533155" : "JUBILANT FOODWORKS LTD", 
# "533287" : "ZEE LEARN LTD", 
# "533260" : "CAREER POINT LTD", 
# "539921" : "SHANTI EDUCATIONAL INITIATIVES LTD", 
# "542602" : "EMBASSY OFFICE PARKS REIT", 
# "543217" : "MINDSPACE BUSINESS PARKS REIT", 
# "543261" : "BROOKFIELD INDIA REAL ESTATE TRUST REIT", 
# "532538" : "ULTRATECH CEMENT LTD", 
# "500387" : "SHREE CEMENT LTD", 
# "500425" : "AMBUJA CEMENTS LTD", 
# "532689" : "PVR LTD", 
# "532706" : "INOX LEISURE LTD", 
# "532163" : "SAREGAMA INDIA LTD", 
# "524715" : "SUN PHARMACEUTICAL INDUSTRIES LTD", 
# "532488" : "DIVI'S LABORATORIES LTD",
# "500124" : "DR.REDDY'S LABORATORIES LTD"}

In [114]:
# Build the lookup {security code (as str) -> security name} from Equity.csv.
# NOTE(review): the absolute Windows path ties this cell to one machine.
df_equity = pd.read_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Equity.csv")
security_numbers = df_equity["Security Code"].tolist()
security_names = df_equity["Security Name"].tolist()
companies = {str(k) : v for (k, v) in list(zip(security_numbers, security_names))}
# Manually set (or override) the name for security code 542602.
companies["542602"] = "Embassy Office Parks REIT"
# companies["542602"]

In [115]:
# One PrettyTable per model; the keys must match the "<model> Regression"
# titles that fit_model passes to create_pretty_table.
models = ["Linear Regression","Lasso Regression","Ridge Regression","Elastic Regression"]
tables = {model:PrettyTable() for model in models}
for name,table in tables.items():
    # Every table shares the same header row.
    table.field_names = columns

In [116]:
# Columns of the per-company summary frame built in the driver loop.
final_columns = ['Company', 'Model', 'Method', 'Percentage']

In [117]:
# Load the results accumulated by earlier runs; new rows are appended to this frame later.
# NOTE(review): the file is read back with its saved index columns (shown as "Unnamed: 0*" in the output).
final_df = pd.read_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\All_Companies_Final_Results.csv")
final_df

Unnamed: 0.1,Unnamed: 0,Company,Model,Method,Percentage
0,0,500002-ABB India Limited,Linear Regression,LinearFIFValue1,0.541713
1,1,500002-ABB India Limited,Ridge Regression,RidgeFIFValue1,0.540601
2,2,500002-ABB India Limited,Lasso Regression,LassoFIFValue1,0.537264
3,3,500002-ABB India Limited,Elastic Net Regression,ElasticFIFValue1,0.536151
4,6,500003-AEGIS LOGISTICS LTD.,Lasso Regression,LassoFIFValue1000,0.539299
...,...,...,...,...,...
607,606,537291-Nath Bio-Genes (India) Ltd,Lasso Regression,LassoFIForwardSelection,0.547170
608,608,539921-Shanti Educational Initiatives Ltd,Linear Regression,LinearFIBackwardElimination,0.666667
609,609,539921-Shanti Educational Initiatives Ltd,Ridge Regression,RidgeFIBackwardElimination,0.666667
610,611,539921-Shanti Educational Initiatives Ltd,Elastic Net Regression,ElasticFIForwardSelection,0.538462


In [118]:
def create_pretty_table(name,model,result, method, percentage):
    """Append one result row to the PrettyTable registered for ``model``.

    The row is: "<code>-<company name>" (code = characters 2..7 of ``name``),
    ``method``, ``percentage`` rounded to 6 places, every non-dict value of
    ``result`` rounded to 6 places (in dict order), then every value of
    ``result["OLS"]`` rounded to 6 places. The table title is set to ``model``.
    """
    code = name[2 : 8]
    row = [code + "-" + companies[code], method, round(percentage, 6)]
    for value in result.values():
        if not isinstance(value, dict):
            row.append(round(value, 6))
    for value in result["OLS"].values():
        row.append(round(value, 6))
    table = tables[model]
    table.add_row(row)
    table.title = model

In [119]:
def fit_model(models, df, column, method, value, name, results):
    """Fit each requested model on ``df`` and record its directional accuracy.

    For every entry of ``models`` ("Linear", "Ridge", "Lasso"; anything else is
    treated as Elastic Net) this function:
      * fits the model through the matching ``*_regression`` helper;
      * writes predicted vs. actual test values to a CSV in Models_Results;
      * counts the test rows where prediction and actual value share a sign
        (both > 0 or both < 0); rows with a zero on either side count as "opposite";
      * stores the fraction of same-sign rows in
        ``results[model + "FI" + method + str(value)]``;
      * adds a row to the model's PrettyTable via ``create_pretty_table``.

    Parameters
    ----------
    models : iterable of str
        Short model names to fit.
    df : pandas.DataFrame
        Predictors with the target as the last column.
    column : str
        Name of the target column.
    method : str
        Name of the feature-selection method (used in labels and file names).
    value
        Threshold used by the method (may be an empty string); converted with
        ``str`` wherever it is concatenated.
    name : str
        Source file name; characters 2..7 are the security code.
    results : dict
        Mutated in place with one accuracy entry per model.
    """
    regressors = {
        "Linear": linear_regression,
        "Ridge": ridge_regression,
        "Lasso": lasso_regression,
    }
    label_suffix = "FI" + method + str(value)
    for model in models:
        # Any unrecognised model name falls back to Elastic Net, as before.
        fit = regressors.get(model, elastic_net_regression)
        model_result = fit(df, column)

        print(model + " Model fitted using columns obtained from feature importance using " + method + " : ")
        # Pull the arrays out of the result so only scalar metrics remain for the table.
        pred = model_result.pop('Predicted')
        actual = model_result.pop('Actual')
        pred_actual = pd.DataFrame(list(zip(pred, actual)),
                   columns =['Predicted Values', 'Actual Values'])
        pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\" + name[2:8] + "_sd_all_" + model + label_suffix + ".csv" , index=False)

        same_dir = sum(1 for a, b in zip(pred, actual) if (a > 0 and b > 0) or (a < 0 and b < 0))
        diff_dir = len(pred_actual) - same_dir
        percentage = (same_dir / (same_dir + diff_dir))

        print("Values in Same direction -----> ----->", same_dir)
        print("Values in Opposite direction <----- -----> ", diff_dir)
        print("Percentage of correct direction : ", percentage)
        results[model + label_suffix] = percentage

        # Only rsquared_adj is reported from the OLS statistics.
        for statistic in ('aic', 'bic', 'fvalue'):
            del model_result['OLS'][statistic]
        # str(value) keeps this consistent with the labels above and avoids a
        # TypeError when a numeric threshold is passed.
        create_pretty_table(name ,model + " Regression" ,model_result, method + " " + str(value), percentage)

In [120]:
def get_results_from_FI_Coeffiecients(df, name, column, results):
    """Select features by linear-regression coefficient size and fit all models on them.

    A ``LinearRegression`` is fitted on all rows (predictors = every column
    except the last). For each threshold, the features whose coefficient,
    rounded to 6 places, has an absolute value above the threshold are kept
    and passed, together with the target, to ``fit_model``. Thresholds that
    select nothing are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictors with the target as the last column.
    name : str
        Source file name, forwarded to ``fit_model``.
    column : str
        Name of the target column.
    results : dict
        Mutated in place by ``fit_model``.
    """
    print("Features Importance using Coefficients")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    model_linear = LinearRegression(fit_intercept=True)
    model_linear.fit(X, Y)
    rounded_coefs = [round(i,6) for i in list(model_linear.coef_)]
    rc_coef = list(zip(X.columns, rounded_coefs))
    coef = [0.1]
    method = "Coefficients"
    models = ["Linear", "Ridge", "Lasso", "Elastic"]
    for cf in coef:
        # Start from an empty list for every threshold. Previously the list was
        # created once outside the loop, so a second threshold would have
        # inherited the earlier features and the appended target column.
        coef_features = [feature for feature, weight in rc_coef if abs(weight) > cf]
        print("Features obtained from coefficients greater than " + str(cf) + " : ")
        print("--------------------------------------")
        print(coef_features)
        if (len(coef_features) == 0):
            continue
        coef_features.append(column)
        df_fic = df[coef_features]
        fit_model(models, df_fic, column, method, str(cf), name, results)
    print("*****************************************************************************************")

In [121]:
def get_results_from_FI_PValue(df, name, column, results):
    """Select features by OLS p-value and fit all models on them.

    An OLS model (no constant added) is fitted on the training part of a 70/30
    split. For each cut-off in 0.02, 0.05, 0.1, 0.2 the features whose p-value
    is below the cut-off are kept and passed, together with the target, to
    ``fit_model``. Cut-offs that select nothing are skipped.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using p-value")
    print(banner)
    predictors = df[df.columns[:-1]]
    response = df[column].values
    X_train, X_test, Y_train, Y_test = train_test_split(predictors, response, test_size=0.3, random_state=0)
    ols_model = sm.OLS(Y_train, np.array(X_train, dtype=float)).fit()
    # Pair column names with p-values; zip stops at the shorter sequence, so
    # the trailing target column gets no p-value.
    pvals_cols = list(zip(list(df.columns), list(ols_model.pvalues)))
    method = "PValue"
    models = ["Linear", "Ridge", "Lasso", "Elastic"]
    for pv in [0.02, 0.05, 0.1, 0.2]:
        pval_features = [feature for feature, p_value in pvals_cols if p_value < pv]
        print("Features obtained from p-values less than " + str(pv) + " : ")
        print("-------------------------------------------------")
        print(pval_features)
        if not pval_features:
            continue
        pval_features.append(column)
        fit_model(models, df[pval_features], column, method, str(pv), name, results)
    print(banner)
    

In [122]:
def get_results_from_FI_FValues(df, name, column, results):
    """Select features by univariate F-statistic and fit all models on them.

    ``f_regression`` is computed on all rows (predictors = every column except
    the last). For each cut-off in 1, 10, 100, 1000 the features whose absolute
    F-value exceeds the cut-off are kept and passed, together with the target,
    to ``fit_model``. Cut-offs that select nothing are skipped.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using f-value")
    print(banner)
    predictors = df[df.columns[:-1]]
    response = df[column].values
    f_statistics = f_regression(predictors, response)[0]
    fc = list(zip(predictors.columns, f_statistics))
    method = "FValue"
    models = ["Linear", "Ridge", "Lasso", "Elastic"]
    for fv in [1, 10, 100, 1000]:
        fval_features = [feature for feature, f_value in fc if abs(f_value) > fv]
        print("Features obtained from f-values greater than " + str(fv) + " : ")
        print("--------------------------------------")
        print(fval_features)
        if not fval_features:
            continue
        fval_features.append(column)
        fit_model(models, df[fval_features], column, method, str(fv), name, results)
    print(banner)


In [123]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    """Select features with ``forward_selection`` and fit all models on them.

    Predictors are every column except the last. When at least one feature is
    selected, the selected features plus the target are passed to ``fit_model``
    with an empty threshold label.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using Forward Selection Method")
    print(banner)
    method = "ForwardSelection"
    models = ["Linear", "Ridge", "Lasso", "Elastic"]
    predictors = df1[df1.columns[:-1]]
    response = df1[column].values
    forward_features = forward_selection(predictors, response)
    print("Features obtained from Forward Selection method : ")
    print("--------------------------------------")
    print(forward_features)
    if forward_features:
        selected = df1[forward_features + [column]]
        fit_model(models, selected, column, method, '', name, results)
    print(banner)

In [124]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    """Select features with ``backward_elimination`` and fit all models on them.

    Predictors are every column except the last. When at least one feature
    survives, the surviving features plus the target are passed to
    ``fit_model`` with an empty threshold label.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using Backward Elimination Method")
    print(banner)
    method = "BackwardElimination"
    models = ["Linear", "Ridge", "Lasso", "Elastic"]
    predictors = df1[df1.columns[:-1]]
    response = df1[column].values
    backward_features = backward_elimination(predictors, response)
    print("Features obtained from Backward Elimination method : ")
    print("--------------------------------------")
    print(backward_features)
    if backward_features:
        selected = df1[backward_features + [column]]
        fit_model(models, selected, column, method, '', name, results)
    print(banner)

In [125]:
def get_results_from_each_set(data, name, final_df):
    """Run every feature-selection strategy for one stock and summarise the best results.

    The data is cleaned with ``pre_process_data`` (60% null threshold) and
    reduced with ``dependent_column`` for the target "Next Day Close Price GR".
    The coefficient, p-value, forward-selection, backward-elimination and
    F-value strategies are then run in that order; each one records
    directional-accuracy scores in a shared results dict.

    For each model family (Linear, Ridge, Lasso, Elastic) the best-scoring
    entry becomes one row (Company, Model, Method, Percentage) appended to
    ``final_df``. On ties the entry recorded last wins. A family without any
    result is skipped instead of raising IndexError.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw stock data for one company.
    name : str
        Source file name; characters 2..7 are the security code.
    final_df : pandas.DataFrame
        Frame to extend; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``final_df`` with up to four rows added.
    """
    df = pre_process_data(data, 60)
    column = "Next Day Close Price GR"
    (df1, column) = dependent_column(df, column)
    results = {}
    strategies = (
        get_results_from_FI_Coeffiecients,
        get_results_from_FI_PValue,
        get_results_from_FI_ForwardSelection,
        get_results_from_FI_BackwardElimination,
        get_results_from_FI_FValues,
    )
    for run_strategy in strategies:
        run_strategy(df1, name, column, results)

    def by_score(item):
        return item[1]

    company = name[2 : 8] + "-" + companies[name[2 : 8]]
    families = [
        ("Linear", "Linear Regression"),
        ("Ridge", "Ridge Regression"),
        ("Lasso", "Lasso Regression"),
        ("Elastic", "Elastic Net Regression"),
    ]
    best_per_family = []
    rows = []
    for key, model_label in families:
        # A stable ascending sort followed by [-1] keeps the original tie-breaking.
        ranked = sorted(((k, v) for (k, v) in results.items() if key in k), key=by_score)
        if not ranked:
            continue
        best_method, best_score = ranked[-1]
        best_per_family.append((key, best_method, best_score))
        rows.append({'Company' : company, 'Model' : model_label, 'Method' : best_method, 'Percentage' : best_score})

    if rows:
        # DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
        final_df = pd.concat([final_df, pd.DataFrame(rows)], ignore_index = True)

    if results:
        overall_method, overall_score = sorted(results.items(), key=by_score)[-1]
        print("Maximum correct direction values are obtained for {} with a percentage of {}.".format(overall_method, overall_score))
    for key, best_method, best_score in best_per_family:
        print("Maximum correct direction values for {} Model are obtained for {} with a percentage of {}.".format(key, best_method, best_score))
    return final_df

In [126]:
%%time
# Process every matching stock file, add its best-per-model rows to final_df,
# then sort and persist the combined results.
stock_dir = os.path.join(path, "Data", "Stock")
new_result_frames = []
for filename in os.listdir(stock_dir):
    if not filename.startswith("gr542602"):
        continue
    # Join path components instead of embedding backslashes ("Data\Stock\\"
    # contained the invalid escape sequence "\S").
    df_linear = pd.read_csv(os.path.join(stock_dir, filename))
    # The old code rebuilt the full path and split it again; the result is just the file name.
    name = filename
    stock = name[2 : 8]
    fd_df = pd.DataFrame(columns = final_columns)
    print("For stock : ", stock)
    print("#################################################################################################################")
    f_df = get_results_from_each_set(df_linear, name, fd_df)
    new_result_frames.append(f_df)
    print("#################################################################################################################")
# DataFrame.append no longer exists in pandas 2.x; concatenate once after the loop.
final_df = pd.concat([final_df] + new_result_frames, ignore_index = True)
final_df = final_df.sort_values(by = ['Company', 'Percentage'], ascending = [True, False])
# index=False: writing the index added another "Unnamed: 0*" column every time
# the file was saved and read back.
final_df.to_csv('C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models_Results\\All_Companies_Final_Results.csv', index=False)

For stock :  542602
#################################################################################################################
Features Importance using Coefficients
*****************************************************************************************
Features obtained from coefficients greater than 0.1 : 
--------------------------------------
['Open Price GR', 'WAP GR']
Linear Model fitted using columns obtained from feature importance using Coefficients : 
Values in Same direction -----> -----> 70
Values in Opposite direction <----- ----->  60
Percentage of correct direction :  0.5384615384615384
Ridge Model fitted using columns obtained from feature importance using Coefficients : 
Values in Same direction -----> -----> 70
Values in Opposite direction <----- ----->  60
Percentage of correct direction :  0.5384615384615384
Lasso Model fitted using columns obtained from feature importance using Coefficients : 
Values in Same direction -----> -----> 70
Values in Opposite di

Lasso Model fitted using columns obtained from feature importance using FValue : 
Values in Same direction -----> -----> 64
Values in Opposite direction <----- ----->  66
Percentage of correct direction :  0.49230769230769234
Elastic Model fitted using columns obtained from feature importance using FValue : 
Values in Same direction -----> -----> 64
Values in Opposite direction <----- ----->  66
Percentage of correct direction :  0.49230769230769234
Features obtained from f-values greater than 100 : 
--------------------------------------
[]
Features obtained from f-values greater than 1000 : 
--------------------------------------
[]
*****************************************************************************************
Maximum correct direction values are obtained for ElasticFIPValue0.1 with a percentage of 0.5692307692307692.
Maximum correct direction values for Linear Model are obtained for LinearFICoefficients0.1 with a percentage of 0.5384615384615384.
Maximum correct direction

In [127]:
# Print the accumulated result table of every model.
# Note: this loop rebinds the notebook-level variable `name` to the last model title.
for name,table in tables.items():
    print(table)

+-----------------------------------------------------------------------------------------------------------------------------------+
|                                                         Linear Regression                                                         |
+----------------------------------+----------------------+------------+----------+----------+----------+------------+--------------+
|             Company              |        Method        | Percentage |   RMSE   |   MAE    |   MSE    | Confidence | rsquared_adj |
+----------------------------------+----------------------+------------+----------+----------+----------+------------+--------------+
| 542602-Embassy Office Parks REIT |   Coefficients 0.1   |  0.538462  | 0.019277 | 0.01343  | 0.000372 |  0.049322  |   0.016632   |
| 542602-Embassy Office Parks REIT |      PValue 0.1      |  0.469231  | 0.020741 | 0.014373 | 0.00043  | -0.100499  |   0.001318   |
| 542602-Embassy Office Parks REIT |      PValue 0.2      |  0

In [128]:
# Display the combined, sorted results (previously saved rows plus the rows added in this run).
final_df

Unnamed: 0.1,Unnamed: 0,Company,Model,Method,Percentage
0,0.0,500002-ABB India Limited,Linear Regression,LinearFIFValue1,0.541713
1,1.0,500002-ABB India Limited,Ridge Regression,RidgeFIFValue1,0.540601
2,2.0,500002-ABB India Limited,Lasso Regression,LassoFIFValue1,0.537264
3,3.0,500002-ABB India Limited,Elastic Net Regression,ElasticFIFValue1,0.536151
4,6.0,500003-AEGIS LOGISTICS LTD.,Lasso Regression,LassoFIFValue1000,0.539299
...,...,...,...,...,...
611,610.0,539921-Shanti Educational Initiatives Ltd,Lasso Regression,LassoFIFValue10,0.495726
614,,542602-Embassy Office Parks REIT,Lasso Regression,LassoFIPValue0.1,0.569231
615,,542602-Embassy Office Parks REIT,Elastic Net Regression,ElasticFIPValue0.1,0.569231
612,,542602-Embassy Office Parks REIT,Linear Regression,LinearFICoefficients0.1,0.538462


In [39]:
# orig_stdout = sys.stdout
# sys.stdout = open("500112res.txt", "w")

In [40]:
# path = "C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Stock\\gr500112.csv"
# df_lin = pd.read_csv(path)
# name = path.split("\\")[-1]
# stock = name[2 : 8]
# print("For stock : ", stock)
# print("#################################################################################################################")
# get_results_from_each_set(df_lin, name)
# print("#################################################################################################################")

In [41]:
# sys.stdout.close()
# sys.stdout=orig_stdout