In [3]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from math import sqrt
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from prettytable import PrettyTable
import time 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patches as patches

In [4]:
path = os.getcwd()
path

'f:\\PracticumProject\\StockAnalysisTool'

# PreProcessing Data

In [5]:
def pre_process_data(data,null_threshold):
    # data.replace(0, np.nan, inplace=True)
    # print(data.shape)
    # print(data.isnull().sum().sum())
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    # data.Date = pd.to_datetime(data.Date)
    total = data.shape[0]
    for col in data.columns:
        if null_threshold * total / 100 < data[col].isnull().sum():
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0,inplace=True)
    return data

# Removing columns based on dependent column

In [6]:
def dependent_column(data,column):
    cols = [col for col in data.columns if "next" not in col.lower()]
    cols.append(column)
    data = data[cols]
    return (data,column)

# OLS Regression

In [7]:
def OLS_Regression(X_train,Y_train):
    X_train = np.array(X_train, dtype=float)
    ols_model = sm.OLS(Y_train, X_train).fit()
    
    rsquared_adj = ols_model.rsquared_adj
    aic = ols_model.aic
    bic = ols_model.bic
    fvalue = ols_model.fvalue
    return {"rsquared_adj":rsquared_adj,"aic":aic,"bic":bic,"fvalue":fvalue}

# Linear Regression

In [8]:
def linear_regression(data,y):
    # print("------ Linear Regression ------")
    X = data[data.columns[:-1]]
    Y = data[y].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    model = LinearRegression(fit_intercept = True)  
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)

    confidence = model.score(X_test, Y_test)
    rmse = sqrt(metrics.mean_squared_error(Y_test, pred))
    mae = metrics.mean_absolute_error(Y_test, pred)
    mse = metrics.mean_squared_error(Y_test, pred)
    ols_values = OLS_Regression(X_train,Y_train)
    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}

# linear regression with forward selection

In [9]:
def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]]).astype(float)).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [10]:
def linear_regression_forward_selection(data,y):
    # print("------ Linear Regression Forward Selection ------")
    X = data[data.columns[:-1]]
    Y = data[y].values
    forward_features = forward_selection(X,Y)
    # print(forward_features)
    return linear_regression(data[forward_features+[y]],y)

# linear regression with backward elimination

In [11]:
def backward_elimination(data, target,significance_level = 0.05):
    features = data.columns.tolist()
    while(len(features)>0):
        features_with_constant = sm.add_constant(data[features]).astype(float)
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        max_p_value = p_values.max()
        if(max_p_value >= significance_level):
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            break 
    return features

In [12]:
def linear_regression_backward_selection(data,y):
    # print("------ Linear Regression Backward Selection ------")
    X = data[data.columns[:-1]]
    Y = data[y].values
    backward_features = backward_elimination(X,Y)
    # print(backward_features)
    return linear_regression(data[backward_features+[y]],y)

# Using Inbuilt Forward Selection Method

In [13]:
def forward_selection_inbuilt(X,Y,k,score):
    sfs = SFS(LinearRegression(),k_features=k,forward=True,floating=False,scoring = score,cv = 0)
    sfs.fit(X, Y)
    lst = list(sfs.k_feature_names_)
    return lst

In [14]:
def linear_regression_forward_selection_inbuit(data,y):
    # print("------ Linear Regression Forward Selection Inbuilt ------")

    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    df = pd.DataFrame(columns=scores,index=range(1,data.shape[1]+1))
    for k in range(1,data.shape[1]+1):
        for score in scores:
            sfs = forward_selection_inbuilt(X,Y,k,score)
            df.loc[k,score] = sfs
    df.to_csv("forwardFeatures.csv",index=None)
    return df

# Using Inbuilt Backward Elimination Method

In [15]:
def backward_selection_inbuilt(X,Y,k,score):
    sfs = SFS(LinearRegression(),k_features=k,forward=False,floating=False,scoring = score,cv = 0)
    sfs.fit(X, Y)
    lst = list(sfs.k_feature_names_)
    return lst

In [16]:
def linear_regression_backward_selection_inbuit(data,y):
    # print("------ Linear Regression Backward Selection Inbuilt ------")
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    df = pd.DataFrame(columns=scores,index=range(1,data.shape[1]+1))
    for k in range(1,data.shape[1]+1):
        for score in scores:
            sfs = backward_selection_inbuilt(X,Y,k,score)
            df.loc[k,score] = sfs
    df.to_csv("backwardFeatures.csv",index=None)
    return df

# Ridge Regression

In [17]:
def bestparams_ridge(alpha,X_train,Y_train):
    
    ridge = Ridge(alpha=1).fit(X_train,Y_train)
    
    param_grid = dict(alpha=alpha)
    
    grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2')
    
    grid.fit(X_train,Y_train)
    
    alpha_val = grid.best_estimator_.alpha
    
    return alpha_val

In [18]:
def ridge_regression(data,y):
    
    # print("------ Ridge Regression ------")

    X = data[data.columns[:-1]]
    Y = data[y].values
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    
    # selection of alpha value from the respective array values
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_ridge(alpha,X_train,Y_train)
    # print("Best Alpha:", best) # best alpha value
    
    # Re-selecting the alpha value based on the above selected alpha value
    alpha1 = np.arange(best-10,best+10)
    best_alpha = bestparams_ridge(alpha1,X_train,Y_train)
    # print("Best Alpha after tuning : ", best_alpha)
    # Ridge regression with the above best alpha value and the train datasets.
    clf = Ridge(alpha=best_alpha)
    clf.fit(X_train, Y_train)
    
    pred = clf.predict(X_test)
    
    confidence = clf.score(X_test, Y_test)
    rmse = sqrt(metrics.mean_squared_error(Y_test, pred))
    mae = metrics.mean_absolute_error(Y_test, pred)
    mse = metrics.mean_squared_error(Y_test, pred)

    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}

# Lasso Regression

In [19]:
def bestparams_lasso(alpha,X_train,Y_train):
    
    lasso = Lasso(alpha=1).fit(X_train,Y_train)
    
    param_grid = dict(alpha=alpha)
    
    grid = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2')
    
    grid.fit(X_train,Y_train)
    
    alpha_val = grid.best_estimator_.alpha
    
    return alpha_val

In [20]:
def lasso_regression(data,y):
    
    # print("------ Lasso Regression ------")

    X = data[data.columns[:-1]]
    Y = data[y].values
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    
    # selection of alpha value from the respective array values
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_lasso(alpha,X_train,Y_train)
    # print("Best Alpha:", best) # best alpha value
    
    # Re-selecting the alpha value based on the above selected alpha value
    alpha1 = np.arange(best-10,best+10)
    best_alpha = bestparams_lasso(alpha1,X_train,Y_train)
    # print("Best Alpha after tuning : ", best_alpha)
    # Lasso regression with the above best alpha value and the train datasets.
    clf = Lasso(alpha=best_alpha)
    clf.fit(X_train, Y_train)
    
    pred = clf.predict(X_test)
    
    confidence = clf.score(X_test, Y_test)
    rmse = sqrt(metrics.mean_squared_error(Y_test, pred))
    mae = metrics.mean_absolute_error(Y_test, pred)
    mse = metrics.mean_squared_error(Y_test, pred)
    ols_values = OLS_Regression(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}

# Elastic Regression

In [21]:
def bestparams_elastic(alphas,l1,X_train,Y_train):
    
    elastic_net = ElasticNet(alpha=1, l1_ratio=0.2).fit(X_train, Y_train)
    param_grid = dict(alpha=alphas, l1_ratio=l1)
    
    grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring='r2')
    
    grid_result = grid.fit(X_train, Y_train)
    
    alpha_val = grid_result.best_estimator_.alpha
    l1_val = grid_result.best_estimator_.l1_ratio
    
    return (alpha_val,l1_val)

In [22]:
def elastic_net_regression(data,y):

    # print("------ Elastic Net Regression ------")
    
    X = data[data.columns[:-1]]
    Y = data[y].values
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    
    # selection of alpha value from the respective array values
    
    alpha = np.array([0,0.1,0.001,0.0001,1])
    l1_ratio = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
    
    best = bestparams_elastic(alpha,l1_ratio,X_train,Y_train)
    # print("Best Alpha:", best[0]) # best alpha value
    # print("Best l1 - value:", best[1])
    
    # Re-selecting the alpha value based on the above selected alpha value
    
    alpha1 = np.arange(best[0]/10,best[0]*10)
    best_alpha = bestparams_elastic(alpha1,l1_ratio,X_train,Y_train)
    # print("Best Alpha after tuning : ", best_alpha[0])
    # print("Best l1 after tuning : ", best_alpha[1])
    clf = ElasticNet(alpha=best_alpha[0],l1_ratio = best_alpha[1])
    clf.fit(X_train, Y_train)
    
    pred = clf.predict(X_test)
    
    confidence = clf.score(X_test, Y_test)
    
    rmse = sqrt(metrics.mean_squared_error(Y_test, pred))
    mae = metrics.mean_absolute_error(Y_test, pred)
    mse = metrics.mean_squared_error(Y_test, pred)
    ols_values = OLS_Regression(X_train,Y_train)
    
    # coeff_vs_Regularization(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values}
    

In [23]:
def coeff_vs_Regularization(X_train,Y_train):
    coefs = []
    n_alphas = 200
    alphas = np.logspace(-10, -2, n_alphas)

    for a in alphas:
        elastic = ElasticNet(alpha=a)
        elastic.fit(X_train, Y_train)
        coefs.append(elastic.coef_)
    
    ax = plt.gca()

    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('alpha(log scale)')
    plt.ylabel('Coefficients')
    plt.title('ElasticNet - Coefficients Vs Regularization')
    plt.axis('tight')
    plt.show()

# Pretty Table

In [24]:
columns = ["company",'root_mean_squared_error', 'mean_absolute_error', 'mean_squared_error','rsquared_adj', 'aic', 'bic', 'fvalue']

In [None]:
models = ["linear_regression","linear_regression_forward_selection","linear_regression_backward_selection","lasso_regression","ridge_regression","elastic_net_regression"]
tables = {model:PrettyTable() for model in models}
for name,table in tables.items():
    table.field_names = columns

In [None]:
def create_pretty_table(filename,model,result):
    values = [filename] + [v for k,v in result.items() if not isinstance(v,dict)] + list(result["OLS"].values())
    tables[model].add_row(values)
    tables[model].title = model

In [None]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("gr"):
        start = time.time()
        df = pd.read_csv(os.path.join(path,"Data\Stock\\"+filename))
        df = pre_process_data(df,60)
        column = "Next Day Close Price GR"
        (data,column) = dependent_column(df,column)
        result = linear_regression(data,column)
        create_pretty_table(filename,linear_regression.__name__,result)
        result = linear_regression_forward_selection(data,column)
        create_pretty_table(filename,linear_regression_forward_selection.__name__,result)
        result = linear_regression_backward_selection(data,column)
        create_pretty_table(filename,linear_regression_backward_selection.__name__,result)
        result = ridge_regression(data,column)
        create_pretty_table(filename,ridge_regression.__name__,result)
        result = lasso_regression(data,column)
        create_pretty_table(filename,lasso_regression.__name__,result)
        result = elastic_net_regression(data,column)
        create_pretty_table(filename,elastic_net_regression.__name__,result)
        end = time.time()
        print(os.path.splitext(filename)[0],end-start)

In [131]:
for name,table in tables.items():
    print(table)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                               linear_regression                                                                               |
+--------------+-------------------------+----------------------+------------------------+---------------------+---------------------+---------------------+--------------------+
|   company    | root_mean_squared_error | mean_absolute_error  |   mean_squared_error   |     rsquared_adj    |         aic         |         bic         |       fvalue       |
+--------------+-------------------------+----------------------+------------------------+---------------------+---------------------+---------------------+--------------------+
| gr500112.csv |   0.29474546460059015   | 0.026969088628010125 |  0.08687488890261773   |  0.988419795310432 

In [None]:
features = {}

In [None]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("gr"):
        start = time.time()
        df = pd.read_csv(os.path.join(path,"Data\Stock\\"+filename))
        df = pre_process_data(df,60)
        column = "Next Day Close Price GR"
        (data,column) = dependent_column(df,column)
        X = data[data.columns[:-1]]
        Y = data[column].values
        forward_features = forward_selection(X,Y)
        backward_features = backward_elimination(X,Y)
        feat = {"forward_features":forward_features,"backward_features":backward_features}
        end = time.time()
        name = os.path.splitext(filename)[0]
        features[name] = features.get(name,{})
        features[name].update(feat)
        end = time.time()
        print(name,end-start)

In [132]:
for name,feature in features.items():
    print(name)
    print("forward_features",feature["forward_features"])
    print("backward_features",feature["backward_features"])
    print("---------------------------")

gr500112
forward_features ['Beta GR', 'High Price GR', 'CP % LV 7 days', 'Spread High-Low GR', 'Low Price GR', 'CP % HV 7 days', 'WAP GR', 'Min Inc % in 90 days', 'CP % LV 90 days', 'Alpha GR', 'No.of Shares GR', 'Total Turnover (Rs.) GR', 'No. of Trades GR', 'No. of Trades', '% YTD of SP500', 'Spread Close-Open', 'Min Inc % in 180 days', 'Min Inc % in 365 days', 'Close Price GR', '% Return of Company', 'No.of Shares', 'Total Turnover (Rs.)', 'Beta', 'Open Price', 'Close Price', 'Low Price', 'Expenditure  last 4 quarters', 'EPS last 2 quarters', 'CP % LV 180 days', '% YTD of Company', 'Alpha', 'Close Price of SP500', 'CP % LV 365 days', 'Lower Band', 'High Price', 'CP % HV 365 days', 'Net Profit  last 8 quarters', 'Upper Band', 'Band Area', 'Avg Inc % in 90 days', 'Avg Inc % in 180 days', 'Spread High-Low', 'Rate', 'Max Dec % in 90 days', 'Avg Dec % in 90 days']
backward_features ['Open Price', 'High Price', 'Close Price', 'No. of Trades', 'Spread High-Low', 'Spread Close-Open', '% Ret

# Feature Importance

In [261]:
mycols = ["company",'Open Price', 'High Price', 'Low Price','WAP','No.of Shares', 'No. of Trades', 'Total Turnover (Rs.)',
       'Deliverable Quantity', '% Deli. Qty to Traded Qty', 'Spread High-Low','Spread Close-Open']
feature_importance = pd.DataFrame(columns=mycols)

In [262]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if filename.startswith("gr"):
        start = time.time()
        df = pd.read_csv(os.path.join(path,"Data\Stock\\"+filename))
        # df = pre_process_data(df,60)
        name = os.path.splitext(filename)[0]
        cols = ['Open Price', 'High Price', 'Low Price','WAP','No.of Shares', 'No. of Trades', 'Total Turnover (Rs.)',
       'Deliverable Quantity', '% Deli. Qty to Traded Qty', 'Spread High-Low','Spread Close-Open']
        depd = "Close Price"
        X = df[cols]
        Y = df[depd]
        model = LinearRegression(fit_intercept=True)
        model.fit(X, Y)
        data = [name] + [round(i,6) for i in list(model.coef_)]
        feature_importance.loc[feature_importance.shape[0]] = data

Wall time: 1 s


In [263]:
feature_importance

Unnamed: 0,company,Open Price,High Price,Low Price,WAP,No.of Shares,No. of Trades,Total Turnover (Rs.),Deliverable Quantity,% Deli. Qty to Traded Qty,Spread High-Low,Spread Close-Open
0,gr500112,0.859388,-0.033889,0.103027,0.070946,-9e-06,0.000331,-0.0,6e-06,0.11607,0.71834,9.084646
1,gr500180,0.111109,0.128058,0.111015,0.649267,-0.0,-0.000618,-0.0,2.2e-05,-0.065342,0.288567,2.984352
2,gr500182,1.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,1.0
3,gr500209,-0.229153,0.341475,0.375629,0.512259,1e-06,-4.1e-05,0.0,-2e-06,-0.000677,0.006352,0.037799
4,gr500325,-0.278226,0.45242,0.430554,0.394406,1e-06,-1.9e-05,-0.0,-0.0,0.006209,0.024996,0.112441
5,gr500680,1.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,1.0
6,gr507685,0.136819,0.143186,0.132503,0.586311,0.0,1.5e-05,-0.0,0.0,0.003078,0.004307,0.150056
7,gr530965,-0.036804,0.100067,0.105504,0.831186,0.0,-2.7e-05,-0.0,-0.0,0.002448,0.002472,0.058488
8,gr532174,0.621169,0.11906,0.004834,0.253964,-1e-06,7.7e-05,-0.0,1e-06,0.063648,-0.221987,3.232057
9,gr532210,0.929765,-0.045902,0.103453,0.012684,0.0,-1.3e-05,-0.0,-0.0,0.000208,0.067524,0.845155
