In [1]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from math import sqrt
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from prettytable import PrettyTable
import time 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patches as patches

In [2]:
# Project root: the data and result paths used by later cells are joined onto
# this value, which is whatever directory the kernel is currently running in.
path = os.getcwd()
path

'f:\\PracticumProject\\StockAnalysisTool'

# Preprocessing Data

In [3]:
def pre_process_data(data,null_threshold):
    """Clean a raw stock frame in place and hand the same object back.

    Steps, in order: drop the 'Unix Date' and 'Date' columns, drop every
    column whose null count exceeds ``null_threshold`` percent of the row
    count, convert +/-inf to NaN, then drop every row that still has a NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.
    null_threshold : number
        Largest tolerated share of nulls per column, as a percentage.

    Returns
    -------
    pandas.DataFrame
        The same, mutated ``data`` object.
    """
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    row_count = data.shape[0]
    # Null counts are taken before infinities are converted below, so inf
    # values do not count towards a column's threshold.
    sparse_columns = [
        name for name in data.columns
        if null_threshold * row_count / 100 < data[name].isnull().sum()
    ]
    for name in sparse_columns:
        data.drop(columns=[name],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0,inplace=True)
    return data

# Removing columns based on dependent column

In [4]:
def dependent_column(data,column):
    """Keep the non-"next" columns plus the chosen target column.

    Every column whose lower-cased name contains "next" is left out, then
    ``column`` is appended so that it ends up as the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    column : str
        Name of the target column to keep.

    Returns
    -------
    tuple
        ``(reduced_frame, column)``.
    """
    kept = []
    for name in data.columns:
        if "next" in name.lower():
            continue
        kept.append(name)
    kept.append(column)
    return (data[kept],column)

# OLS Regression

In [5]:
def OLS_Regression(X_train,Y_train):
    """Fit a statsmodels OLS model and return four summary statistics.

    The design matrix is ``X_train`` converted to a float array exactly as
    given; no constant column is added in this function.

    Returns
    -------
    dict
        Keys ``rsquared_adj``, ``aic``, ``bic`` and ``fvalue`` taken from the
        fitted results object.
    """
    design = np.array(X_train, dtype=float)
    fitted = sm.OLS(Y_train, design).fit()
    return {
        "rsquared_adj": fitted.rsquared_adj,
        "aic": fitted.aic,
        "bic": fitted.bic,
        "fvalue": fitted.fvalue,
    }

# Linear Regression

In [6]:
def linear_regression(data,y):
    """Fit and evaluate an ordinary linear regression on a 70/30 split.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last one is used as a predictor, so the
        target ``y`` is expected to be the last column.
    y : str
        Name of the target column.

    Returns
    -------
    dict
        ``root_mean_squared_error``, ``mean_absolute_error`` and
        ``mean_squared_error`` measured on the held-out 30%, plus ``OLS``:
        the statistics returned by ``OLS_Regression`` for the training split.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    model = LinearRegression(fit_intercept = True)
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)

    # The MSE is computed once and reused for the RMSE; the previously
    # computed (and never used) R^2 score pass has been dropped.
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)
    ols_values = OLS_Regression(X_train,Y_train)
    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}

# Linear Regression with Forward Selection

In [7]:
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Each round fits one OLS model (with an added constant) per remaining
    candidate on top of the features selected so far, and selects the
    candidate with the smallest p-value. Selection stops once that p-value
    is not below ``significance_level`` or no candidate is left.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns.
    target : array-like
        Response values passed to ``sm.OLS``.
    significance_level : float, default 0.05
        p-value a candidate must stay below to be selected.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    initial_features = data.columns.tolist()
    best_features = []
    while len(best_features) < len(initial_features):
        # Keep the original column order (rather than a set difference) so
        # ties on the p-value are broken the same way on every run.
        remaining_features = [name for name in initial_features if name not in best_features]
        # An explicit float dtype avoids the object-dtype Series that recent
        # pandas creates for an empty Series, on which idxmin() is expected to fail.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features+[new_column]]).astype(float)
            model = sm.OLS(target, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            # Also reached when every p-value is NaN (the comparison is False).
            break
    return best_features

In [8]:
def linear_regression_forward_selection(data,y):
    """Run ``linear_regression`` on the features kept by ``forward_selection``.

    All columns but the last are offered as candidates; the regression then
    uses the selected columns followed by the target ``y``.
    """
    candidates = data[data.columns[:-1]]
    response = data[y].values
    selected = forward_selection(candidates,response)
    reduced = data[selected+[y]]
    return linear_regression(reduced,y)

# Linear Regression with Backward Elimination

In [9]:
def backward_elimination(data, target,significance_level = 0.05):
    """Backward feature elimination driven by OLS p-values.

    Starting from all columns of ``data``, repeatedly fits OLS with an added
    constant and removes the feature with the largest p-value, until that
    largest p-value falls below ``significance_level`` or no feature remains.

    Returns
    -------
    list
        Names of the surviving columns.
    """
    kept = data.columns.tolist()
    while kept:
        design = sm.add_constant(data[kept]).astype(float)
        # The first p-value is skipped; it is taken to belong to the added constant.
        feature_pvalues = sm.OLS(target, design).fit().pvalues[1:]
        if not (feature_pvalues.max() >= significance_level):
            break
        kept.remove(feature_pvalues.idxmax())
    return kept

In [10]:
def linear_regression_backward_selection(data,y):
    """Run ``linear_regression`` on the features kept by ``backward_elimination``.

    All columns but the last are offered as candidates; the regression then
    uses the surviving columns followed by the target ``y``.
    """
    candidates = data[data.columns[:-1]]
    response = data[y].values
    surviving = backward_elimination(candidates,response)
    reduced = data[surviving+[y]]
    return linear_regression(reduced,y)

# Using Inbuilt Forward Selection Method

In [11]:
def forward_selection_inbuilt(X,Y,k,score):
    """Select ``k`` features with mlxtend's forward sequential selector.

    A ``LinearRegression`` estimator is scored with ``score``; the selector
    is configured with ``cv=0`` and ``floating=False``.

    Returns
    -------
    list
        Names of the ``k`` selected features.
    """
    selector = SFS(
        LinearRegression(),
        k_features=k,
        forward=True,
        floating=False,
        scoring=score,
        cv=0,
    )
    selector.fit(X, Y)
    return list(selector.k_feature_names_)

In [12]:
def linear_regression_forward_selection_inbuit(data,y):
    """Tabulate forward-selected feature sets for every subset size and metric.

    For each ``k`` from 1 to the number of predictor columns, and for each
    scoring metric, ``forward_selection_inbuilt`` is run and the resulting
    feature list is stored at row ``k``, column ``score``. The table is also
    written to ``forwardFeatures.csv`` (without the index).

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last one is treated as a predictor.
    y : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        One row per subset size, one column per scoring metric.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    # k cannot exceed the number of predictors: X has one column fewer than
    # data, so the range is bounded by X rather than by data.
    subset_sizes = range(1,X.shape[1]+1)
    df = pd.DataFrame(columns=scores,index=subset_sizes)
    for k in subset_sizes:
        for score in scores:
            # .at writes the whole list into a single cell instead of letting
            # pandas try to align the list with the selection.
            df.at[k,score] = forward_selection_inbuilt(X,Y,k,score)
    df.to_csv("forwardFeatures.csv",index=None)
    return df

# Using Inbuilt Backward Elimination Method

In [13]:
def backward_selection_inbuilt(X,Y,k,score):
    """Select ``k`` features with mlxtend's backward sequential selector.

    A ``LinearRegression`` estimator is scored with ``score``; the selector
    is configured with ``cv=0`` and ``floating=False``.

    Returns
    -------
    list
        Names of the ``k`` retained features.
    """
    selector = SFS(
        LinearRegression(),
        k_features=k,
        forward=False,
        floating=False,
        scoring=score,
        cv=0,
    )
    selector.fit(X, Y)
    return list(selector.k_feature_names_)

In [14]:
def linear_regression_backward_selection_inbuit(data,y):
    """Tabulate backward-selected feature sets for every subset size and metric.

    For each ``k`` from 1 to the number of predictor columns, and for each
    scoring metric, ``backward_selection_inbuilt`` is run and the resulting
    feature list is stored at row ``k``, column ``score``. The table is also
    written to ``backwardFeatures.csv`` (without the index).

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last one is treated as a predictor.
    y : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        One row per subset size, one column per scoring metric.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    # k cannot exceed the number of predictors: X has one column fewer than
    # data, so the range is bounded by X rather than by data.
    subset_sizes = range(1,X.shape[1]+1)
    df = pd.DataFrame(columns=scores,index=subset_sizes)
    for k in subset_sizes:
        for score in scores:
            # .at writes the whole list into a single cell instead of letting
            # pandas try to align the list with the selection.
            df.at[k,score] = backward_selection_inbuilt(X,Y,k,score)
    df.to_csv("backwardFeatures.csv",index=None)
    return df

# Ridge Regression

In [15]:
def bestparams_ridge(alpha,X_train,Y_train):
    """Grid-search the Ridge ``alpha`` and return the best value.

    Parameters
    ----------
    alpha : array-like
        Candidate regularisation strengths.
    X_train, Y_train
        Training predictors and response used for the search.

    Returns
    -------
    The ``alpha`` of the best estimator found, scored by R^2.
    """
    # The template is left unfitted: GridSearchCV clones it for every
    # candidate, so fitting it beforehand would only be wasted work.
    ridge = Ridge(alpha=1)

    param_grid = dict(alpha=alpha)

    grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2')

    grid.fit(X_train,Y_train)

    return grid.best_estimator_.alpha

In [16]:
def ridge_regression(data,y):
    """Tune, fit and evaluate a Ridge regression on a 70/30 split.

    ``alpha`` is chosen in two passes with ``bestparams_ridge``: a coarse
    search over fixed candidates, then a search in unit steps around the
    coarse winner.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last one is used as a predictor, so the
        target ``y`` is expected to be the last column.
    y : str
        Name of the target column.

    Returns
    -------
    dict
        ``root_mean_squared_error``, ``mean_absolute_error`` and
        ``mean_squared_error`` measured on the held-out 30%, plus ``OLS``:
        the statistics returned by ``OLS_Regression`` for the training split.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over fixed candidate strengths.
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_ridge(alpha,X_train,Y_train)

    # Refinement pass in unit steps around the coarse winner. The window
    # best-10 .. best+10 reaches below zero, and a negative regularisation
    # strength is not a valid candidate, so those values are filtered out.
    alpha1 = np.arange(best-10,best+10)
    alpha1 = alpha1[alpha1 >= 0]
    best_alpha = bestparams_ridge(alpha1,X_train,Y_train)

    # Final model with the tuned alpha, fitted on the training split.
    clf = Ridge(alpha=best_alpha)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    # The MSE is computed once and reused for the RMSE; the unused R^2
    # score pass has been dropped.
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    # These statistics come from a plain OLS fit on the training split and
    # therefore do not depend on the tuned Ridge model.
    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}

# Lasso Regression

In [17]:
def bestparams_lasso(alpha,X_train,Y_train):
    """Grid-search the Lasso ``alpha`` and return the best value.

    Parameters
    ----------
    alpha : array-like
        Candidate regularisation strengths.
    X_train, Y_train
        Training predictors and response used for the search.

    Returns
    -------
    The ``alpha`` of the best estimator found, scored by R^2.
    """
    # The template is left unfitted: GridSearchCV clones it for every
    # candidate, so fitting it beforehand would only be wasted work.
    lasso = Lasso(alpha=1)

    param_grid = dict(alpha=alpha)

    grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2')

    grid.fit(X_train,Y_train)

    return grid.best_estimator_.alpha

In [18]:
def lasso_regression(data,y):
    """Tune, fit and evaluate a Lasso regression on a 70/30 split.

    ``alpha`` is chosen in two passes with ``bestparams_lasso``: a coarse
    search over fixed candidates, then a search in unit steps around the
    coarse winner.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last one is used as a predictor, so the
        target ``y`` is expected to be the last column.
    y : str
        Name of the target column.

    Returns
    -------
    dict
        ``root_mean_squared_error``, ``mean_absolute_error`` and
        ``mean_squared_error`` measured on the held-out 30%, plus ``OLS``:
        the statistics returned by ``OLS_Regression`` for the training split.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over fixed candidate strengths.
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_lasso(alpha,X_train,Y_train)

    # Refinement pass in unit steps around the coarse winner. The window
    # best-10 .. best+10 reaches below zero, and a negative regularisation
    # strength is not a valid candidate, so those values are filtered out.
    alpha1 = np.arange(best-10,best+10)
    alpha1 = alpha1[alpha1 >= 0]
    best_alpha = bestparams_lasso(alpha1,X_train,Y_train)

    # Final model with the tuned alpha, fitted on the training split.
    clf = Lasso(alpha=best_alpha)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    # The MSE is computed once and reused for the RMSE; the unused R^2
    # score pass has been dropped.
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    # These statistics come from a plain OLS fit on the training split and
    # therefore do not depend on the tuned Lasso model.
    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}

# Elastic Net Regression

In [19]:
def bestparams_elastic(alphas,l1,X_train,Y_train):
    """Grid-search ElasticNet ``alpha`` and ``l1_ratio`` and return the best pair.

    Parameters
    ----------
    alphas : array-like
        Candidate regularisation strengths.
    l1 : array-like
        Candidate ``l1_ratio`` values.
    X_train, Y_train
        Training predictors and response used for the search.

    Returns
    -------
    tuple
        ``(alpha, l1_ratio)`` of the best estimator found, scored by R^2.
    """
    # The template is left unfitted: GridSearchCV clones it for every
    # candidate, so fitting it beforehand would only be wasted work.
    elastic_net = ElasticNet(alpha=1, l1_ratio=0.2)
    param_grid = dict(alpha=alphas, l1_ratio=l1)

    grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring='r2')

    grid_result = grid.fit(X_train, Y_train)

    best_model = grid_result.best_estimator_
    return (best_model.alpha,best_model.l1_ratio)

In [20]:
def elastic_net_regression(data,y):
    """Tune, fit and evaluate an ElasticNet regression on a 70/30 split.

    ``alpha`` and ``l1_ratio`` are chosen in two passes with
    ``bestparams_elastic``: a coarse search over fixed candidates, then a
    search over alphas between a tenth of and ten times the coarse winner.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last one is used as a predictor, so the
        target ``y`` is expected to be the last column.
    y : str
        Name of the target column.

    Returns
    -------
    dict
        ``root_mean_squared_error``, ``mean_absolute_error`` and
        ``mean_squared_error`` measured on the held-out 30%, plus ``OLS``:
        the statistics returned by ``OLS_Regression`` for the training split.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over fixed candidates for both hyper-parameters.
    alpha = np.array([0,0.1,0.001,0.0001,1])
    l1_ratio = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

    best = bestparams_elastic(alpha,l1_ratio,X_train,Y_train)

    # Refinement pass in unit steps from best/10 up to best*10.
    alpha1 = np.arange(best[0]/10,best[0]*10)
    if alpha1.size == 0:
        # A coarse winner of 0 makes the range empty, and an empty grid
        # cannot be searched; fall back to the coarse winner itself.
        alpha1 = np.array([best[0]])
    best_alpha = bestparams_elastic(alpha1,l1_ratio,X_train,Y_train)

    # Final model with the tuned pair, fitted on the training split.
    clf = ElasticNet(alpha=best_alpha[0],l1_ratio = best_alpha[1])
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    # The MSE is computed once and reused for the RMSE; the unused R^2
    # score pass has been dropped.
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    # These statistics come from a plain OLS fit on the training split and
    # therefore do not depend on the tuned ElasticNet model.
    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}
    

In [21]:
def coeff_vs_Regularization(X_train,Y_train):
    """Plot ElasticNet coefficients against the regularisation strength.

    One ElasticNet model is fitted per alpha on a 200-point logarithmic grid
    from 1e-10 to 1e-2, and the coefficients are drawn on the current axes
    with a log-scaled x axis.
    """
    n_alphas = 200
    alphas = np.logspace(-10, -2, n_alphas)
    coefs = [
        ElasticNet(alpha=strength).fit(X_train, Y_train).coef_
        for strength in alphas
    ]

    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    # Swap the current x limits so the axis runs in the opposite direction.
    low, high = ax.get_xlim()
    ax.set_xlim((high, low))
    plt.xlabel('alpha(log scale)')
    plt.ylabel('Coefficients')
    plt.title('ElasticNet - Coefficients Vs Regularization')
    plt.axis('tight')
    plt.show()

# Results 

In [22]:
# Column layout of every per-model results table: the company file name,
# three test-set error metrics, then four OLS summary statistics.
columns = [
    "company",
    'root_mean_squared_error',
    'mean_absolute_error',
    'mean_squared_error',
    'rsquared_adj',
    'aic',
    'bic',
    'fvalue',
]
# Names of the model functions whose results are collected.
models = [
    "linear_regression",
    "linear_regression_forward_selection",
    "linear_regression_backward_selection",
    "lasso_regression",
    "ridge_regression",
    "elastic_net_regression",
]

In [23]:
# One empty results table per model name, all sharing the same column layout.
dfs = dict((model_name, pd.DataFrame(columns=columns)) for model_name in models)

def create_df(filename,model,result):
    """Append one result row to the results table of ``model``.

    The row is the file name, followed by every non-dict value of ``result``
    in dict order, followed by the values of ``result["OLS"]``.
    """
    scalar_metrics = [value for value in result.values() if not isinstance(value,dict)]
    ols_metrics = list(result["OLS"].values())
    table = dfs[model]
    table.loc[table.shape[0]] = [filename] + scalar_metrics + ols_metrics

In [24]:
%%time
# Models evaluated for every stock file, in the order they are run.
model_functions = (
    linear_regression,
    linear_regression_forward_selection,
    linear_regression_backward_selection,
    ridge_regression,
    lasso_regression,
    elastic_net_regression,
)
# Path components are joined individually so no separator is hard-coded
# (the previous "Data\Stock\\" literal relied on an invalid "\S" escape).
stock_dir = os.path.join(path,"Data","Stock")
for filename in os.listdir(stock_dir):
    if filename.startswith("gr"):
        start = time.time()
        df = pd.read_csv(os.path.join(stock_dir,filename))
        df = pre_process_data(df,60)
        column = "Next Day Close Price GR"
        (data,column) = dependent_column(df,column)
        for model_function in model_functions:
            result = model_function(data,column)
            create_df(filename,model_function.__name__,result)
        end = time.time()
        # Per-file wall time in seconds.
        print(os.path.splitext(filename)[0],end-start)

gr500112 132.39638710021973
gr500180 116.28825426101685
gr500182 111.4531729221344
gr500209 125.7742691040039
gr500325 210.97310996055603
gr500680 88.80196833610535
gr507685 78.82978391647339
gr530965 83.41498255729675
gr532174 120.63194990158081
gr532210 65.7193398475647
gr532540 78.46716809272766
Wall time: 20min 12s


In [25]:
# Write one CSV per model into <path>/Results. The directory is created when
# missing, and the path is joined per component so no separator is hard-coded.
results_dir = os.path.join(path,"Results")
os.makedirs(results_dir, exist_ok=True)
for name, df in dfs.items():
    df.to_csv(os.path.join(results_dir,name+".csv"),index=None)

In [33]:
# Render every model's results table, each preceded by the model name.
for name in dfs:
    df = dfs[name]
    display(name,df)


'linear_regression'

Unnamed: 0,company,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,aic,bic,fvalue
0,gr500112.csv,0.294745,0.026969,0.086875,0.98842,-10162.952522,-9592.670796,1786.611107
1,gr500180.csv,0.054732,0.015361,0.002996,0.925198,-9393.639962,-8823.503108,259.379178
2,gr500182.csv,0.016879,0.012354,0.000285,0.30215,-11309.288799,-10750.110759,10.260305
3,gr500209.csv,0.019788,0.013603,0.000392,0.481386,-8747.606875,-8177.228683,20.436857
4,gr500325.csv,0.021471,0.01499,0.000461,0.373816,-8424.807571,-7854.381181,13.506598
5,gr500680.csv,0.014956,0.010623,0.000224,0.315304,-11349.874905,-10792.844454,10.544937
6,gr507685.csv,0.171958,0.019054,0.02957,0.348073,-9884.733145,-9314.692988,12.142804
7,gr530965.csv,0.038992,0.017378,0.00152,0.372609,-8927.509662,-8357.517888,13.272032
8,gr532174.csv,0.13758,0.023669,0.018928,0.926628,-9487.679759,-8917.349789,265.326785
9,gr532210.csv,0.082219,0.018281,0.00676,0.247735,-9950.797829,-9383.852006,7.602691


'linear_regression_forward_selection'

Unnamed: 0,company,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,aic,bic,fvalue
0,gr500112.csv,0.293622,0.026588,0.086214,0.987711,-10094.319438,-9851.526228,3913.207377
1,gr500180.csv,0.04136,0.013843,0.001711,0.912649,-9149.024182,-9041.770715,1150.283665
2,gr500182.csv,0.01628,0.011794,0.000265,0.006721,-10658.032496,-10612.846392,2.773713
3,gr500209.csv,0.019066,0.012865,0.000364,0.408498,-8555.254516,-8464.897575,91.426991
4,gr500325.csv,0.019056,0.013416,0.000363,0.255333,-8143.463445,-8047.451083,43.275441
5,gr500680.csv,0.014547,0.010349,0.000212,0.008076,-10678.801794,-10639.415803,3.38664
6,gr507685.csv,0.156637,0.018276,0.024535,0.141348,-9395.122559,-9316.107092,25.551253
7,gr530965.csv,0.038454,0.016596,0.001479,0.228406,-8579.261035,-8494.608792,42.186024
8,gr532174.csv,0.124765,0.021589,0.015566,0.901678,-8959.434773,-8880.379134,1372.662591
9,gr532210.csv,0.072279,0.016618,0.005224,0.057739,-9577.238388,-9487.425188,8.75544


'linear_regression_backward_selection'

Unnamed: 0,company,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,aic,bic,fvalue
0,gr500112.csv,0.294303,0.026754,0.086614,0.987641,-10071.672863,-9766.769762,3098.383175
1,gr500180.csv,0.042053,0.01391,0.001768,0.918135,-9278.667651,-9137.544667,938.598393
2,gr500182.csv,0.016319,0.011831,0.000266,0.010959,-10665.006124,-10608.523493,3.32347
3,gr500209.csv,0.019072,0.012805,0.000364,0.412507,-8566.528446,-8459.229578,78.421206
4,gr500325.csv,0.019687,0.013915,0.000388,0.363253,-8456.798506,-8276.069352,38.366556
5,gr500680.csv,0.0145,0.010305,0.00021,0.009396,-10677.552555,-10615.660283,2.769428
6,gr507685.csv,0.158746,0.018466,0.0252,0.164559,-9435.527841,-9260.56502,14.267043
7,gr530965.csv,0.037971,0.016508,0.001442,0.241026,-8609.710925,-8502.48475,35.882384
8,gr532174.csv,0.123427,0.021365,0.015234,0.925215,-9499.92783,-9234.526755,552.200923
9,gr532210.csv,0.078248,0.017437,0.006123,0.201535,-9900.691786,-9743.518687,19.254164


'lasso_regression'

Unnamed: 0,company,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,aic,bic,fvalue
0,gr500112.csv,0.288589,0.027576,0.083284,0.98842,-10162.952522,-9592.670796,1786.611107
1,gr500180.csv,0.047286,0.014849,0.002236,0.925198,-9393.639962,-8823.503108,259.379178
2,gr500182.csv,0.016836,0.012327,0.000283,0.30215,-11309.288799,-10750.110759,10.260305
3,gr500209.csv,0.019026,0.012917,0.000362,0.481386,-8747.606875,-8177.228683,20.436857
4,gr500325.csv,0.020917,0.01482,0.000438,0.373816,-8424.807571,-7854.381181,13.506598
5,gr500680.csv,0.014902,0.010594,0.000222,0.315304,-11349.874905,-10792.844454,10.544937
6,gr507685.csv,0.035956,0.013299,0.001293,0.348073,-9884.733145,-9314.692988,12.142804
7,gr530965.csv,0.045089,0.017239,0.002033,0.372609,-8927.509662,-8357.517888,13.272032
8,gr532174.csv,0.136801,0.023488,0.018715,0.926628,-9487.679759,-8917.349789,265.326785
9,gr532210.csv,0.052104,0.016364,0.002715,0.247735,-9950.797829,-9383.852006,7.602691


'ridge_regression'

Unnamed: 0,company,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,aic,bic,fvalue
0,gr500112.csv,0.289119,0.027009,0.08359,0.98842,-10162.952522,-9592.670796,1786.611107
1,gr500180.csv,0.05365,0.015211,0.002878,0.925198,-9393.639962,-8823.503108,259.379178
2,gr500182.csv,0.016829,0.01227,0.000283,0.30215,-11309.288799,-10750.110759,10.260305
3,gr500209.csv,0.020455,0.014247,0.000418,0.481386,-8747.606875,-8177.228683,20.436857
4,gr500325.csv,0.020845,0.014759,0.000435,0.373816,-8424.807571,-7854.381181,13.506598
5,gr500680.csv,0.01491,0.010574,0.000222,0.315304,-11349.874905,-10792.844454,10.544937
6,gr507685.csv,0.03621,0.01383,0.001311,0.348073,-9884.733145,-9314.692988,12.142804
7,gr530965.csv,0.042681,0.017396,0.001822,0.372609,-8927.509662,-8357.517888,13.272032
8,gr532174.csv,0.13758,0.023668,0.018928,0.926628,-9487.679759,-8917.349789,265.326785
9,gr532210.csv,0.066783,0.017764,0.00446,0.247735,-9950.797829,-9383.852006,7.602691


'elastic_net_regression'

Unnamed: 0,company,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,aic,bic,fvalue
0,gr500112.csv,0.289796,0.026027,0.083982,0.98842,-10162.952522,-9592.670796,1786.611107
1,gr500180.csv,0.046168,0.014647,0.002132,0.925198,-9393.639962,-8823.503108,259.379178
2,gr500182.csv,0.016681,0.012151,0.000278,0.30215,-11309.288799,-10750.110759,10.260305
3,gr500209.csv,0.020525,0.014183,0.000421,0.481386,-8747.606875,-8177.228683,20.436857
4,gr500325.csv,0.020916,0.014819,0.000437,0.373816,-8424.807571,-7854.381181,13.506598
5,gr500680.csv,0.01481,0.010516,0.000219,0.315304,-11349.874905,-10792.844454,10.544937
6,gr507685.csv,0.141398,0.017915,0.019993,0.348073,-9884.733145,-9314.692988,12.142804
7,gr530965.csv,0.039406,0.017238,0.001553,0.372609,-8927.509662,-8357.517888,13.272032
8,gr532174.csv,0.136791,0.023507,0.018712,0.926628,-9487.679759,-8917.349789,265.326785
9,gr532210.csv,0.07459,0.016958,0.005564,0.247735,-9950.797829,-9383.852006,7.602691


In [26]:
# Selected feature lists, keyed by stock file name without its extension.
features = dict()

In [27]:
%%time
# Path components are joined individually so no separator is hard-coded
# (the previous "Data\Stock\\" literal relied on an invalid "\S" escape).
stock_dir = os.path.join(path,"Data","Stock")
for filename in os.listdir(stock_dir):
    if filename.startswith("gr"):
        start = time.time()
        df = pd.read_csv(os.path.join(stock_dir,filename))
        df = pre_process_data(df,60)
        column = "Next Day Close Price GR"
        (data,column) = dependent_column(df,column)
        # dependent_column puts the target last, so all other columns are predictors.
        X = data[data.columns[:-1]]
        Y = data[column].values
        forward_features = forward_selection(X,Y)
        backward_features = backward_elimination(X,Y)
        feat = {"forward_features":forward_features,"backward_features":backward_features}
        name = os.path.splitext(filename)[0]
        # Merge into any entry already stored for this stock.
        features.setdefault(name,{}).update(feat)
        end = time.time()
        # Per-file wall time in seconds.
        print(name,end-start)

gr500112 56.04677224159241
gr500180 16.109879970550537
gr500182 7.648442268371582
gr500209 13.616867065429688
gr500325 15.065235614776611
gr500680 7.678414821624756
gr507685 12.010040998458862
gr530965 15.891726732254028
gr532174 17.464090585708618
gr532210 19.60629439353943
gr532540 19.156341791152954
Wall time: 3min 20s


In [28]:
# Print the forward- and backward-selected feature lists of every stock.
for name in features:
    feature = features[name]
    print(name)
    print("forward_features",feature["forward_features"])
    print("backward_features",feature["backward_features"])
    print("---------------------------")

gr500112
forward_features ['Beta GR', 'Open Price GR', 'CP % LV 7 days', 'Spread High-Low GR', 'Low Price GR', 'WAP GR', 'CP % HV 7 days', 'Min Inc % in 90 days', 'CP % LV 90 days', 'Alpha GR', 'No.of Shares GR', 'Total Turnover (Rs.) GR', 'No. of Trades GR', 'No. of Trades', '% YTD of SP500', 'Spread Close-Open', 'Min Inc % in 180 days', 'Min Inc % in 365 days', '% Return of Company', 'Close Price GR', 'No.of Shares', 'Total Turnover (Rs.)', 'Beta', 'Open Price', 'Close Price', 'Low Price', 'Expenditure  last 4 quarters', 'EPS last 2 quarters', 'CP % LV 180 days', '% YTD of Company', 'Alpha', 'Close Price of SP500', 'CP % LV 365 days', 'Lower Band', 'High Price', 'CP % HV 365 days', 'Net Profit  last 8 quarters', 'Band Area', 'Upper Band', 'Avg Inc % in 90 days', 'Avg Inc % in 180 days', 'Spread High-Low', 'Rate', 'Max Dec % in 90 days', 'Avg Dec % in 90 days']
backward_features ['Open Price', 'High Price', 'Close Price', 'No. of Trades', 'Spread High-Low', 'Spread Close-Open', '% Ret

# Feature Importance

In [29]:
# Layout of the coefficient table: the company name followed by one column
# per predictor.
mycols = [
    "company",
    'Open Price',
    'High Price',
    'Low Price',
    'WAP',
    'No.of Shares',
    'No. of Trades',
    'Total Turnover (Rs.)',
    'Deliverable Quantity',
    '% Deli. Qty to Traded Qty',
    'Spread High-Low',
    'Spread Close-Open',
]
# Starts empty; one row per company is appended by a later cell.
feature_importance = pd.DataFrame(columns=mycols)

In [30]:
%%time
# Predictors and target are the same for every file, so they are defined once.
cols = ['Open Price', 'High Price', 'Low Price','WAP','No.of Shares', 'No. of Trades', 'Total Turnover (Rs.)',
       'Deliverable Quantity', '% Deli. Qty to Traded Qty', 'Spread High-Low','Spread Close-Open']
depd = "Close Price"
# Path components are joined individually so no separator is hard-coded
# (the previous "Data\Stock\\" literal relied on an invalid "\S" escape).
stock_dir = os.path.join(path,"Data","Stock")
for filename in os.listdir(stock_dir):
    if filename.startswith("gr"):
        df = pd.read_csv(os.path.join(stock_dir,filename))
        df = pre_process_data(df,60)
        name = os.path.splitext(filename)[0]
        X = df[cols]
        Y = df[depd]
        model = LinearRegression(fit_intercept=True)
        model.fit(X, Y)
        # One row per company: the fitted coefficients rounded to 6 decimals.
        # NOTE(review): these are raw coefficients on each predictor's own
        # scale - confirm whether standardised inputs were intended before
        # comparing magnitudes across columns.
        data = [name] + [round(i,6) for i in list(model.coef_)]
        # Rows are appended, so re-running this cell without re-creating
        # feature_importance adds the same companies again.
        feature_importance.loc[feature_importance.shape[0]] = data
feature_importance.to_csv("feature_importance.csv",index=None)

Wall time: 2.17 s


In [31]:
# Display the table of per-company linear-regression coefficients.
feature_importance

Unnamed: 0,company,Open Price,High Price,Low Price,WAP,No.of Shares,No. of Trades,Total Turnover (Rs.),Deliverable Quantity,% Deli. Qty to Traded Qty,Spread High-Low,Spread Close-Open
0,gr500112,0.875946,-0.04116,0.100697,0.064066,-8e-06,0.000416,-0.0,5e-06,0.140324,0.724055,9.185882
1,gr500180,0.140703,0.125933,0.116718,0.616187,-1e-05,-0.00031,-0.0,2.3e-05,-0.028889,0.320387,3.203855
2,gr500182,1.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1.0
3,gr500209,-0.172014,0.295981,0.28142,0.594101,1e-06,-5.4e-05,0.0,-2e-06,-0.002392,0.00245,0.037702
4,gr500325,-0.239092,0.457957,0.387422,0.392495,1e-06,-1.5e-05,-0.0,-0.0,0.007578,0.011886,0.118557
5,gr500680,1.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,1.0
6,gr507685,0.187928,0.142687,0.097953,0.570566,0.0,3e-06,-0.0,0.0,0.002868,0.004435,0.153178
7,gr530965,-0.046876,0.124065,0.115081,0.807596,0.0,-3e-05,-0.0,-0.0,0.002315,0.000953,0.059824
8,gr532174,0.668908,0.072728,0.026695,0.230866,-2e-06,0.000111,0.0,1e-06,0.074222,-0.088357,3.38837
9,gr532210,0.927876,-0.059508,0.107953,0.023757,0.0,-2.1e-05,-0.0,-0.0,0.000138,0.072944,0.836977
