In [1]:
%pip install text-to-image

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from math import sqrt
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from prettytable import PrettyTable
import time 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patches as patches
import warnings; warnings.simplefilter('ignore')
import sys
import text_to_image
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression

In [3]:
# Working directory of the notebook; the stock-processing loop further down
# joins its input folder onto this value.
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

# Preprocessing Data

In [5]:
def pre_process_data(data,null_threshold):
    """Clean a raw stock frame in place and return that same frame.

    Steps, in order:
      1. drop the 'Unix Date' and 'Date' columns;
      2. drop every column whose null count exceeds ``null_threshold``
         percent of the row count;
      3. turn +/-inf into NaN;
      4. drop every row that still holds a NaN.

    The caller's ``data`` object is modified; the return value is that object.
    """
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    # Null counts are taken before the inf -> NaN conversion below, so
    # infinite values do not count towards the threshold.
    max_nulls = null_threshold * data.shape[0] / 100
    null_counts = data.isnull().sum()
    sparse_columns = [name for name in data.columns if max_nulls < null_counts[name]]
    data.drop(columns=sparse_columns,axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0,inplace=True)
    return data

# Keeping only the growth-rate ("GR") feature columns plus the dependent column

In [6]:
def dependent_column(data,column):
    """Restrict ``data`` to the feature columns plus the dependent column.

    A column is kept as a feature when its lower-cased name ends with "gr"
    and does not contain "next". ``column`` is appended last, so it is always
    the final column of the returned frame.

    Returns a ``(frame, column)`` tuple.
    """
    kept = []
    for name in data.columns:
        lowered = name.lower()
        if "next" not in lowered and lowered.endswith("gr"):
            kept.append(name)
    kept.append(column)
    return (data[kept], column)

# OLS Regression

In [7]:
def OLS_Regression(X_train,Y_train,cols=None):
    """Fit a statsmodels OLS model and return its goodness-of-fit statistics.

    Parameters
    ----------
    X_train : array-like
        Design matrix; converted to a float ndarray before fitting.
    Y_train : array-like
        Response values.
    cols : sequence, optional
        Feature names. Currently unused by the fit; kept (and made optional)
        so both the three-argument and the two-argument call sites in this
        notebook work.

    Returns
    -------
    dict
        Keys "rsquared_adj", "aic", "bic" and "fvalue".
    """
    # NOTE(review): no constant column is added here, so the model presumably
    # has no intercept unless X_train already contains one - confirm intent.
    X_train = np.array(X_train, dtype=float)
    ols_model = sm.OLS(Y_train, X_train).fit()
    return {
        "rsquared_adj": ols_model.rsquared_adj,
        "aic": ols_model.aic,
        "bic": ols_model.bic,
        "fvalue": ols_model.fvalue,
    }

# Linear Regression

In [8]:
def linear_regression(data, y):
    """Fit an ordinary linear regression on a 70/30 split and report metrics.

    Every column except the last one of ``data`` is used as a predictor;
    ``y`` names the response column. The split uses ``random_state=0``.

    Returns a dict with the test-set RMSE, MAE and MSE, the OLS statistics of
    the training split ("OLS"), the test-set score of the fitted model
    ("Confidence"), and the predicted / actual test values.
    """
    feature_names = data.columns[:-1]
    X = data[feature_names]
    Y = data[y].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    model = LinearRegression(fit_intercept = True)
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)

    mse = metrics.mean_squared_error(Y_test, pred)
    return {
        "root_mean_squared_error": sqrt(mse),
        "mean_absolute_error": metrics.mean_absolute_error(Y_test, pred),
        "mean_squared_error": mse,
        "OLS": OLS_Regression(X_train,Y_train,feature_names),
        "Confidence": model.score(X_test, Y_test),
        "Predicted": pred,
        "Actual": Y_test,
    }

# Linear Regression with Forward Selection

In [9]:
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an added
    constant) per remaining candidate and records that candidate's p-value.
    The candidate with the smallest p-value is added if it is below
    ``significance_level``; otherwise selection stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns.
    target : array-like
        Response values.
    significance_level : float
        p-value threshold a candidate must beat to be selected.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    initial_features = data.columns.tolist()
    best_features = []
    # Stop as soon as every feature has been selected instead of relying on
    # an empty-series NaN comparison to break out of the loop.
    while len(best_features) < len(initial_features):
        # Keep the original column order so ties are resolved reproducibly
        # (set difference has no stable order).
        remaining_features = [f for f in initial_features if f not in best_features]
        # An explicit float dtype is required: without it the series is
        # object-typed on recent pandas and idxmin() cannot be computed.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]]).astype(float)).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [10]:
def linear_regression_forward_selection(data,y):
    """Run forward selection, print the chosen features, then fit on them.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. Returns whatever ``linear_regression`` returns for the
    reduced frame.
    """
    predictors = data[data.columns[:-1]]
    response = data[y].values
    forward_features = forward_selection(predictors, response)
    print("Features obtained from Forward Selection : ")
    print(forward_features)
    reduced = data[forward_features+[y]]
    return linear_regression(reduced, y)

# Linear Regression with Backward Elimination

In [11]:
def backward_elimination(data, target,significance_level = 0.05):
    """Backward feature elimination driven by OLS p-values.

    Starting from all columns of ``data``, repeatedly fit an OLS model (with
    an added constant) and remove the feature with the largest p-value, until
    the largest p-value is no longer >= ``significance_level`` or no features
    remain. Returns the surviving column names.
    """
    selected = data.columns.tolist()
    while selected:
        design = sm.add_constant(data[selected]).astype(float)
        fitted = sm.OLS(target, design).fit()
        # Skip the first p-value, which belongs to the added constant.
        feature_pvalues = fitted.pvalues[1:]
        worst_p = feature_pvalues.max()
        if not (worst_p >= significance_level):
            break
        selected.remove(feature_pvalues.idxmax())
    return selected

In [12]:
def linear_regression_backward_selection(data,y):
    """Run backward elimination, print the kept features, then fit on them.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. Returns whatever ``linear_regression`` returns for the
    reduced frame.
    """
    predictors = data[data.columns[:-1]]
    response = data[y].values
    backward_features = backward_elimination(predictors, response)
    print("Features obtained from Backward Elimination : ")
    print(backward_features)
    reduced = data[backward_features+[y]]
    return linear_regression(reduced, y)

# Using Inbuilt Forward Selection Method

In [13]:
def forward_selection_inbuilt(X,Y,k,score):
    """Select ``k`` features with mlxtend's forward sequential selector.

    A LinearRegression estimator is scored with ``score`` and ``cv=0``.
    Returns the selected feature names as a list.
    """
    selector = SFS(
        LinearRegression(),
        k_features=k,
        forward=True,
        floating=False,
        scoring=score,
        cv=0,
    )
    selector.fit(X, Y)
    return list(selector.k_feature_names_)

In [14]:
def linear_regression_forward_selection_inbuit(data,y):
    """Tabulate forward-selected feature sets for every size and scoring rule.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. For each feature count ``k`` from 1 to the number of
    predictors and each scoring name, the selected feature list is stored in
    a frame (rows = k, columns = scoring name). The frame is written to
    "forwardFeatures.csv" without its index and also returned.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    # k must not exceed the number of predictor columns; data.shape[1] would
    # also count the response column and request one feature too many.
    feature_counts = range(1, X.shape[1]+1)
    # One record per k; each cell holds the list of selected feature names.
    records = [
        {score: forward_selection_inbuilt(X,Y,k,score) for score in scores}
        for k in feature_counts
    ]
    df = pd.DataFrame(records, columns=scores, index=feature_counts)
    df.to_csv("forwardFeatures.csv",index=None)
    return df

# Using Inbuilt Backward Elimination Method

In [15]:
def backward_selection_inbuilt(X,Y,k,score):
    """Select ``k`` features with mlxtend's backward sequential selector.

    A LinearRegression estimator is scored with ``score`` and ``cv=0``.
    Returns the selected feature names as a list.
    """
    selector = SFS(
        LinearRegression(),
        k_features=k,
        forward=False,
        floating=False,
        scoring=score,
        cv=0,
    )
    selector.fit(X, Y)
    return list(selector.k_feature_names_)

In [16]:
def linear_regression_backward_selection_inbuit(data,y):
    """Tabulate backward-selected feature sets for every size and scoring rule.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. For each feature count ``k`` from 1 to the number of
    predictors and each scoring name, the selected feature list is stored in
    a frame (rows = k, columns = scoring name). The frame is written to
    "backwardFeatures.csv" without its index and also returned.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values
    scores = ['explained_variance','max_error','neg_mean_absolute_error','neg_mean_squared_error',
                  'neg_root_mean_squared_error','neg_median_absolute_error','r2']
    # k must not exceed the number of predictor columns; data.shape[1] would
    # also count the response column and request one feature too many.
    feature_counts = range(1, X.shape[1]+1)
    # One record per k; each cell holds the list of selected feature names.
    records = [
        {score: backward_selection_inbuilt(X,Y,k,score) for score in scores}
        for k in feature_counts
    ]
    df = pd.DataFrame(records, columns=scores, index=feature_counts)
    df.to_csv("backwardFeatures.csv",index=None)
    return df

# Ridge Regression

In [17]:
def bestparams_ridge(alpha,X_train,Y_train):
    """Grid-search the Ridge ``alpha`` that maximises cross-validated R^2.

    Parameters
    ----------
    alpha : array-like
        Candidate alpha values.
    X_train, Y_train : array-like
        Training predictors and response.

    Returns
    -------
    The alpha value of the best grid-search candidate.
    """
    # The estimator is passed unfitted: the grid search fits its own copies,
    # so fitting it beforehand only costs time.
    grid = GridSearchCV(estimator=Ridge(alpha=1), param_grid=dict(alpha=alpha), scoring='r2')
    grid.fit(X_train,Y_train)
    return grid.best_params_["alpha"]

In [18]:
def ridge_regression(data,y):
    """Tune and fit a Ridge regression on a 70/30 split and report metrics.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. Alpha is chosen in two grid-search passes: a coarse grid,
    then unit steps around the coarse winner.

    Returns a dict with the test-set RMSE, MAE and MSE, the OLS statistics of
    the training split ("OLS"), the test-set score ("Confidence"), and the
    predicted / actual test values - the same keys ``linear_regression``
    returns.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over a fixed set of alpha values.
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_ridge(alpha,X_train,Y_train)

    # Refinement pass around the coarse winner. best-10 is negative for every
    # coarse candidate, so discard the negative alphas: they are not
    # meaningful regularisation strengths.
    alpha1 = np.arange(best-10,best+10)
    alpha1 = alpha1[alpha1 >= 0]
    best_alpha = bestparams_ridge(alpha1,X_train,Y_train)

    # Final model with the tuned alpha.
    clf = Ridge(alpha=best_alpha)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    confidence = clf.score(X_test, Y_test)
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    # Pass the feature names explicitly so the call matches the
    # three-argument form used by linear_regression.
    ols_values = OLS_Regression(X_train,Y_train,X.columns)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values, "Confidence" : confidence, "Predicted" : pred, "Actual" : Y_test}

# Lasso Regression

In [19]:
def bestparams_lasso(alpha,X_train,Y_train):
    """Grid-search the Lasso ``alpha`` that maximises cross-validated R^2.

    Parameters
    ----------
    alpha : array-like
        Candidate alpha values.
    X_train, Y_train : array-like
        Training predictors and response.

    Returns
    -------
    The alpha value of the best grid-search candidate.
    """
    # The estimator is passed unfitted: the grid search fits its own copies,
    # so fitting it beforehand only costs time.
    grid = GridSearchCV(estimator=Lasso(alpha=1), param_grid=dict(alpha=alpha), scoring='r2')
    grid.fit(X_train,Y_train)
    return grid.best_params_["alpha"]

In [20]:
def lasso_regression(data,y):
    """Tune and fit a Lasso regression on a 70/30 split and report metrics.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. Alpha is chosen in two grid-search passes: a coarse grid,
    then unit steps around the coarse winner.

    Returns a dict with the test-set RMSE, MAE and MSE, the OLS statistics of
    the training split ("OLS"), the test-set score ("Confidence"), and the
    predicted / actual test values - the same keys ``linear_regression``
    returns.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over a fixed set of alpha values.
    alpha = np.array([1,0.1,0.01,0.001,0.0001,0])
    best = bestparams_lasso(alpha,X_train,Y_train)

    # Refinement pass around the coarse winner. best-10 is negative for every
    # coarse candidate, so discard the negative alphas: they are not
    # meaningful regularisation strengths.
    alpha1 = np.arange(best-10,best+10)
    alpha1 = alpha1[alpha1 >= 0]
    best_alpha = bestparams_lasso(alpha1,X_train,Y_train)

    # Final model with the tuned alpha.
    clf = Lasso(alpha=best_alpha)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    confidence = clf.score(X_test, Y_test)
    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    # Pass the feature names explicitly so the call matches the
    # three-argument form used by linear_regression.
    ols_values = OLS_Regression(X_train,Y_train,X.columns)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values, "Confidence" : confidence, "Predicted" : pred, "Actual" : Y_test}

# Elastic Net Regression

In [21]:
def bestparams_elastic(alphas,l1,X_train,Y_train):
    """Grid-search ElasticNet ``alpha`` and ``l1_ratio`` by cross-validated R^2.

    Parameters
    ----------
    alphas : array-like
        Candidate alpha values.
    l1 : array-like
        Candidate l1_ratio values.
    X_train, Y_train : array-like
        Training predictors and response.

    Returns
    -------
    tuple
        ``(alpha, l1_ratio)`` of the best grid-search candidate.
    """
    # The estimator is passed unfitted: the grid search fits its own copies,
    # so fitting it beforehand only costs time.
    elastic_net = ElasticNet(alpha=1, l1_ratio=0.2)
    param_grid = dict(alpha=alphas, l1_ratio=l1)

    grid = GridSearchCV(estimator=elastic_net, param_grid=param_grid, scoring='r2')
    grid_result = grid.fit(X_train, Y_train)

    best_params = grid_result.best_params_
    return (best_params["alpha"], best_params["l1_ratio"])

In [22]:
def elastic_net_regression(data,y):
    """Tune and fit an ElasticNet regression on a 70/30 split and report metrics.

    Predictors are every column of ``data`` except the last; ``y`` names the
    response column. ``alpha`` and ``l1_ratio`` are chosen in two grid-search
    passes: a coarse alpha grid, then unit steps between a tenth of and ten
    times the coarse winner.

    Returns a dict with the test-set RMSE, MAE and MSE, the OLS statistics of
    the training split ("OLS"), the test-set score ("Confidence"), and the
    predicted / actual test values - the same keys ``linear_regression``
    returns.
    """
    X = data[data.columns[:-1]]
    Y = data[y].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    # Coarse pass over fixed alpha and l1_ratio candidates.
    alpha = np.array([0,0.1,0.001,0.0001,1])
    l1_ratio = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

    best = bestparams_elastic(alpha,l1_ratio,X_train,Y_train)

    # Refinement pass around the coarse alpha. When the coarse winner is 0 the
    # range collapses to an empty array, which a grid search cannot use, so
    # fall back to the coarse winner itself.
    alpha1 = np.arange(best[0]/10,best[0]*10)
    if alpha1.size == 0:
        alpha1 = np.array([best[0]])
    best_alpha = bestparams_elastic(alpha1,l1_ratio,X_train,Y_train)

    # Final model with the tuned (alpha, l1_ratio) pair.
    clf = ElasticNet(alpha=best_alpha[0],l1_ratio = best_alpha[1])
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)

    confidence = clf.score(X_test, Y_test)

    mse = metrics.mean_squared_error(Y_test, pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(Y_test, pred)

    # Pass the feature names explicitly so the call matches the
    # three-argument form used by linear_regression.
    ols_values = OLS_Regression(X_train,Y_train,X.columns)

    # coeff_vs_Regularization(X_train,Y_train)

    return {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"OLS":ols_values, "Confidence" : confidence, "Predicted" : pred, "Actual" : Y_test}
    

In [23]:
def coeff_vs_Regularization(X_train,Y_train):
    """Plot ElasticNet coefficients against the regularisation strength.

    Fits one ElasticNet per alpha for 200 log-spaced alphas between 1e-10 and
    1e-2 and draws every coefficient path on a log-scaled x axis.
    """
    n_alphas = 200
    alphas = np.logspace(-10, -2, n_alphas)

    # One coefficient vector per alpha value.
    coefs = [ElasticNet(alpha=a).fit(X_train, Y_train).coef_ for a in alphas]

    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    lower, upper = ax.get_xlim()
    ax.set_xlim((upper, lower))  # reverse axis
    ax.set_xlabel('alpha(log scale)')
    ax.set_ylabel('Coefficients')
    ax.set_title('ElasticNet - Coefficients Vs Regularization')
    ax.axis('tight')
    plt.show()

# Finding results from each set of important features

In [24]:
def get_results_from_FI_Coeffiecients(df, name, column, results):
    """Evaluate feature sets chosen by linear-regression coefficient size.

    A linear regression is fitted on every column of ``df`` except the last
    (response ``column``). For each threshold, features whose rounded
    absolute coefficient exceeds it are kept, a linear model is fitted on
    them, predicted/actual test values are written to a CSV, and the share of
    predictions with the same sign as the actual value is stored in
    ``results`` under "LinearFIC<threshold>".

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the response column.
    name : str
        Suffix for the output CSV file name.
    column : str
        Name of the response column.
    results : dict
        Mutated in place with the direction accuracy per threshold.
    """
    print("Features Importance using Coefficients")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    model_linear = LinearRegression(fit_intercept=True)
    model_linear.fit(X, Y)
    # Pair every feature column with its coefficient, rounded to 6 decimals.
    rc_coef = list(zip(X.columns, [round(i,6) for i in model_linear.coef_]))
    coef = [0.1]
    for cf in coef:
        # Start from a fresh list for every threshold so selections (and the
        # appended response column) never leak into the next threshold.
        coef_features = [feature for feature, weight in rc_coef if abs(weight) > cf]
        print("Features obtained from coefficients greater than " + str(cf) + " : ")
        print("--------------------------------------")
        print(coef_features)
        if (len(coef_features) == 0):
            continue
        coef_features.append(column)
        df_fic = df[coef_features]
        linear_model_result = linear_regression(df_fic, column)
        print("Linear Model fitted using columns obtained from feature importance using coefficients : ")
        pred = linear_model_result['Predicted']
        actual = linear_model_result['Actual']
        pred_actual = pd.DataFrame(list(zip(pred, actual)),
                   columns =['Predicted Values', 'Actual Values'])
        # Every backslash is escaped; the resulting path is unchanged.
        pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models Results\\LinearFIC" + str(cf) + "_" + name + ".csv" , index=False)
        # A prediction counts as "same direction" only when the product with
        # the actual value is strictly positive; zeros count as opposite.
        same_dir = 0
        diff_dir = 0
        for a, b in zip(pred, actual) :
            if (a * b > 0):
                same_dir += 1
            else:
                diff_dir += 1
        print("Values in Same direction -----> ----->", same_dir)
        print("Values in Opposite direction <----- -----> ", diff_dir)
        print("Percentage of correct direction : ", (same_dir / (same_dir + diff_dir)))
        results["LinearFIC" + str(cf)] = (same_dir / (same_dir + diff_dir))
    print("*****************************************************************************************")

In [25]:
def get_results_from_FI_PValue(df, name, column, results):
    """Evaluate feature sets chosen by OLS p-value thresholds.

    An OLS model is fitted on the training part of a 70/30 split of every
    column of ``df`` except the last (response ``column``). For each
    threshold in 0.02, 0.05, 0.1 and 0.2, features with a smaller p-value are
    kept, a linear model is fitted on them, predicted/actual test values are
    written to a CSV, and the share of predictions with the same sign as the
    actual value is stored in ``results`` under "LinearFIP<threshold>".

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the response column.
    name : str
        Suffix for the output CSV file name.
    column : str
        Name of the response column.
    results : dict
        Mutated in place with the direction accuracy per threshold.
    """
    print("Features Importance using p-value")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = np.array(X_train, dtype=float)
    # NOTE(review): no constant column is added before fitting, so these
    # p-values presumably come from a model without intercept - confirm.
    ols_model = sm.OLS(Y_train, X_train).fit()
    # Pair every feature column with its p-value.
    pvals_cols = list(zip(X.columns, ols_model.pvalues))
    p = [0.02, 0.05, 0.1, 0.2]
    for pv in p:
        pval_features = [feature for feature, p_value in pvals_cols if p_value < pv]
        print("Features obtained from p-values less than " + str(pv) + " : ")
        print("-------------------------------------------------")
        print(pval_features)
        if (len(pval_features) == 0):
            continue
        pval_features.append(column)
        df_fip = df[pval_features]
        linear_model_result = linear_regression(df_fip, column)
        print("Linear Model fitted using columns obtained from feature importance using p-values : ")
        pred = linear_model_result['Predicted']
        actual = linear_model_result['Actual']
        pred_actual = pd.DataFrame(list(zip(pred, actual)),
                   columns =['Predicted Values', 'Actual Values'])
        # Every backslash is escaped; the resulting path is unchanged.
        pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models Results\\LinearFIP" + str(pv) + "_" + name + ".csv" , index=False)
        # A prediction counts as "same direction" only when the product with
        # the actual value is strictly positive; zeros count as opposite.
        same_dir = 0
        diff_dir = 0
        for a, b in zip(pred, actual) :
            if (a * b > 0):
                same_dir += 1
            else:
                diff_dir += 1
        print("Values in Same direction -----> ----->", same_dir)
        print("Values in Opposite direction <----- -----> ", diff_dir)
        print("Percentage of correct direction : ", (same_dir / (same_dir + diff_dir)))
        results["LinearFIP" + str(pv)] = (same_dir / (same_dir + diff_dir))
    print("*****************************************************************************************")
    

In [26]:
def get_results_from_FI_FValues(df, name, column, results):
    """Evaluate feature sets chosen by univariate F-value thresholds.

    ``f_regression`` is run on every column of ``df`` except the last against
    the response ``column``. For each threshold in 1, 10, 100 and 1000,
    features with a larger absolute F-value are kept, a linear model is
    fitted on them, predicted/actual test values are written to a CSV, and
    the share of predictions with the same sign as the actual value is stored
    in ``results`` under "LinearFIF<threshold>".

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the response column.
    name : str
        Suffix for the output CSV file name.
    column : str
        Name of the response column.
    results : dict
        Mutated in place with the direction accuracy per threshold.
    """
    print("Features Importance using f-value")
    print("*****************************************************************************************")
    X = df[df.columns[:-1]]
    Y = df[column].values
    # First element of the f_regression result holds the per-feature F-values.
    fvals = f_regression(X, Y)[0]
    fc = list(zip(X.columns, fvals))
    f = [1, 10, 100, 1000]
    for fv in f :
        fval_features = [feature for feature, f_value in fc if abs(f_value) > fv]
        print("Features obtained from f-values greater than " + str(fv) + " : ")
        print("--------------------------------------")
        print(fval_features)
        if (len(fval_features) == 0):
            continue
        fval_features.append(column)
        df_fif = df[fval_features]
        linear_model_result = linear_regression(df_fif, column)
        print("Linear Model fitted using columns obtained from feature importance using f-values : ")
        pred = linear_model_result['Predicted']
        actual = linear_model_result['Actual']
        pred_actual = pd.DataFrame(list(zip(pred, actual)),
                   columns =['Predicted Values', 'Actual Values'])
        # Every backslash is escaped; the resulting path is unchanged.
        pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models Results\\LinearFIF" + str(fv) + "_" + name + ".csv" , index=False)
        # A prediction counts as "same direction" only when the product with
        # the actual value is strictly positive; zeros count as opposite.
        same_dir = 0
        diff_dir = 0
        for a, b in zip(pred, actual) :
            if (a * b > 0):
                same_dir += 1
            else:
                diff_dir += 1
        print("Values in Same direction -----> ----->", same_dir)
        print("Values in Opposite direction <----- -----> ", diff_dir)
        print("Percentage of correct direction : ", (same_dir / (same_dir + diff_dir)))
        results["LinearFIF" + str(fv)] = (same_dir / (same_dir + diff_dir))
    print("*****************************************************************************************")


In [27]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    """Evaluate the feature set chosen by p-value forward selection.

    Fits a linear model on the forward-selected features, writes the
    predicted/actual test values to a CSV, prints the direction counts and
    stores the share of predictions with the same sign as the actual value
    in ``results["FI_FS"]``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Feature columns followed by the response column.
    name : str
        Suffix for the output CSV file name.
    column : str
        Name of the response column.
    results : dict
        Mutated in place with the direction accuracy.
    """
    print("Features Importance using Forward Selection Method")
    print("*****************************************************************************************")
    lfs_res = linear_regression_forward_selection(df1, column)
    pred = lfs_res['Predicted']
    actual = lfs_res['Actual']
    pred_actual = pd.DataFrame(list(zip(pred, actual)),
               columns =['Predicted Values', 'Actual Values'])
    # Every backslash is escaped; the resulting path is unchanged.
    pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models Results\\LinearFI_FS_" + name + ".csv" , index=False)
    # A prediction counts as "same direction" only when the product with the
    # actual value is strictly positive; zeros count as opposite.
    same_dir = 0
    diff_dir = 0
    for a, b in zip(pred, actual) :
        if (a * b > 0):
            same_dir += 1
        else:
            diff_dir += 1
    print("Values in Same direction -----> ----->", same_dir)
    print("Values in Opposite direction <----- -----> ", diff_dir)
    print("Percentage of correct direction : ", (same_dir / (same_dir + diff_dir)))
    results["FI_FS"] = (same_dir / (same_dir + diff_dir))
    print("*****************************************************************************************")

In [28]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    """Evaluate the feature set chosen by p-value backward elimination.

    Fits a linear model on the features that survive backward elimination,
    writes the predicted/actual test values to a CSV, prints the direction
    counts and stores the share of predictions with the same sign as the
    actual value in ``results["FI_BE"]``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Feature columns followed by the response column.
    name : str
        Suffix for the output CSV file name.
    column : str
        Name of the response column.
    results : dict
        Mutated in place with the direction accuracy.
    """
    print("Features Importance using Backward Elimination Method")
    print("*****************************************************************************************")
    lfs_res = linear_regression_backward_selection(df1, column)
    pred = lfs_res['Predicted']
    actual = lfs_res['Actual']
    pred_actual = pd.DataFrame(list(zip(pred, actual)),
               columns =['Predicted Values', 'Actual Values'])
    # Every backslash is escaped; the resulting path is unchanged.
    pred_actual.to_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Models Results\\LinearFI_BE_" + name + ".csv" , index=False)
    # A prediction counts as "same direction" only when the product with the
    # actual value is strictly positive; zeros count as opposite.
    same_dir = 0
    diff_dir = 0
    for a, b in zip(pred, actual) :
        if (a * b > 0):
            same_dir += 1
        else:
            diff_dir += 1
    print("Values in Same direction -----> ----->", same_dir)
    print("Values in Opposite direction <----- -----> ", diff_dir)
    print("Percentage of correct direction : ", (same_dir / (same_dir + diff_dir)))
    results["FI_BE"] = (same_dir / (same_dir + diff_dir))
    print("*****************************************************************************************")

In [29]:
def get_results_from_each_set(data, name):
    """Run every feature-importance evaluation on one stock frame.

    The frame is cleaned with a 60 percent null threshold, reduced to the
    feature columns plus "Next Day Close Price GR", and passed to the five
    evaluation routines, which fill a shared results dict. Finally the entry
    with the highest direction accuracy is printed.
    """
    cleaned = pre_process_data(data, 60)
    df1, column = dependent_column(cleaned, "Next Day Close Price GR")
    results = {}
    evaluations = (
        get_results_from_FI_Coeffiecients,
        get_results_from_FI_PValue,
        get_results_from_FI_ForwardSelection,
        get_results_from_FI_BackwardElimination,
        get_results_from_FI_FValues,
    )
    for evaluate in evaluations:
        evaluate(df1, name, column, results)
    # Ascending sort by accuracy; the last entry is the best method.
    best_method, best_score = sorted(results.items(), key=lambda item: item[1])[-1]
    print("Maximum correct direction values are obtained for {} with a percentage of {}.".format(best_method, best_score))

In [30]:
%%time
for filename in os.listdir(os.path.join(path,"Data/Stock")):
    if (filename.startswith("gr")):
        df_linear = pd.read_csv(os.path.join(path,"Data\Stock\\" + filename))
        name = os.path.join(path, "Data\Stock\\" + filename).split("\\")[-1]
        stock = name[2 : 8]
        orig_stdout = sys.stdout
        sys.stdout = open("gr" + stock + "res.txt", "w")
        print("For stock : ", stock)
        print("#################################################################################################################")
        get_results_from_each_set(df_linear, name)
        print("#################################################################################################################")
        sys.stdout.close()
        sys.stdout = orig_stdout

Wall time: 15.9 s


In [31]:
# orig_stdout = sys.stdout
# sys.stdout = open("500112res.txt", "w")

In [32]:
# path = "C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\Stock\\gr500112.csv"
# df_lin = pd.read_csv(path)
# name = path.split("\\")[-1]
# stock = name[2 : 8]
# print("For stock : ", stock)
# print("#################################################################################################################")
# get_results_from_each_set(df_lin, name)
# print("#################################################################################################################")

In [33]:
# sys.stdout.close()
# sys.stdout=orig_stdout