In [None]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from math import sqrt
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import GridSearchCV
import time 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patches as patches
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_classif
from sklearn.metrics import confusion_matrix
import math
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import dataframe_image as dfi
import traceback
import prettytable
import io

In [None]:
import warnings; warnings.simplefilter('ignore')

In [None]:
path = os.getcwd()
path

# PreProcessing Data    

In [None]:
def pre_process_data(data,null_threshold):
    """
    Drops Date and Unix Date columns from the data.
    Drops the columns which has null values more than specified null_threshold.
    Replaces infinite values with NAN.
    Drops the rows which has null values.

    Parameters
    ----------
    data : dataframe

    null_threshold : numeric
        numeric value describing the amount of null values that can be present.

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    """
    
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    total = data.shape[0]
    for col in data.columns:
        if null_threshold * total / 100 < data[col].isnull().sum():
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0,inplace=True)
    return data

# Removing columns based on dependent column


In [None]:
def dependent_column(data,column):
    """
    Removes all the Next Day columns.
    Removes all the non Growth Rate Columns (GR)
    add the predictor column to list of columns.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the predictor column 

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    column : string
        name of the predictor column
    """
    cols = [col for col in data.columns if "next" not in col.lower() and col.lower().endswith("gr")]
    cols.append(column)
    data = data[cols]
    return (data,column)

# Parameters Tuning

In [None]:
def parameters_tuning(penalty,C,X_train, y_train):
    logistic = LogisticRegression()
    hyperparameters = dict(C=C, penalty=penalty)
    try:
        clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
        best_model = clf.fit(X_train, y_train)
    except:
        hyperparameters["penalty"] = ['none','l2']
        clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
        best_model = clf.fit(X_train, y_train)
    penalty = best_model.best_estimator_.get_params()["penalty"]
    C = best_model.best_estimator_.get_params()["C"]
    return (penalty,C)


# OLS Regression

In [None]:
def OLS_Regression(X_train,Y_train):
    X_train = np.array(X_train, dtype=float)
    ols_model = sm.OLS(Y_train, X_train).fit()
    rsquared_adj = ols_model.rsquared_adj
    aic = ols_model.aic
    bic = ols_model.bic
    fvalue = ols_model.fvalue
    return {"rsquared_adj":rsquared_adj,"aic":aic,"bic":bic,"fvalue":fvalue}

# Confusion Matrix

### true negatives  {0,0},
### false negatives {1,0},
### true positives  {1,1},
### false positives {0,1}.

In [None]:
def create_confusion_matrix(y_pred,y_true):
    
    cm = confusion_matrix(y_true,y_pred)
    accuracy = metrics.accuracy_score(y_true,y_pred)
    precision = metrics.precision_score(y_true,y_pred)
    recall = metrics.recall_score(y_true,y_pred)
    f1_score = metrics.f1_score(y_true,y_pred)
    return {"accuracy":accuracy,"precision":precision,"recall":recall,"f1_score":f1_score,"confusion matrix":cm}

    # accuracy = float(cm.diagonal().sum())/len(y_test)
    # TN = cm[0][0]
    # FP = cm[0][1]
    # FN = cm[1][0]
    # TP = cm[1][1]
    
    # precision1 = TP / (TP + FP)
    # precision1 = 0 if math.isnan(precision1) else precision1
    
    # recall1 = TP / (TP + FN)
    # recall1 = 0 if math.isnan(recall1) else recall1

    # f_measure = (2 * precision1 * recall1)/(precision1 + recall1)
    
    # precision0 = TN / (TN + FN)
    # precision0 = 0 if math.isnan(precision0) else precision0
    
    # recall0 = TN / (FP  + TN)
    # recall0 = 0 if math.isnan(recall0) else recall0


# Logistic Regression

In [None]:
def logistic_regression(df,threshold):
    df["Target"] = df["Next Day Close Price GR"].apply(lambda x : 1 if x >= threshold else 0)
    X = df[df.columns[:-2]]
    Y = df["Target"]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    penalty = ['l1', 'l2','none']
    C = [0.001,0.01,0.1,1,10,100]
    penalty,C = parameters_tuning(penalty,C,X_train, y_train)
    logmodel = LogisticRegression(penalty = penalty, C = C,random_state = 0)
    logmodel.fit(X_train, y_train)
    y_pred = logmodel.predict(X_test)

    confidence = logmodel.score(X_test, y_test)
    rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    confusion_mat =  create_confusion_matrix(y_pred,y_test)
    ols_values = OLS_Regression(X_train,y_train)
    myres = {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"confidence":confidence}
    myres.update(confusion_mat)
    myres.update({"f-value":ols_values["fvalue"]})
    return myres

In [None]:
top = pd.read_csv('top.csv')
top

In [None]:
%%time
df_cols = ['features', 'threshold', 'pvalue', 'fvalue','root_mean_squared_error','mean_absolute_error', 'mean_squared_error', 'confidence', 'accuracy',
       'precision', 'recall', 'f1_score', 'f-value',"confusion matrix"]
for _,row in top.iterrows():
    stock_result = pd.DataFrame(columns=df_cols)
    name = row['security id']
    company = {}
    df = pd.read_csv(os.path.join(path,"Data\Stock\\"+"gr"+str(name)+".csv"))
    df = pre_process_data(df,60)
    column = "Next Day Close Price GR"
    (df,column) = dependent_column(df,column)

    threshold_values = [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]
    for threshold in threshold_values:
        try:
            target = df["Next Day Close Price GR"].apply(lambda x : 1 if x >= threshold else 0)
            X = df[df.columns[:-2]]
            Y = target
            fvalues, pvalues = f_classif(X,Y)
            fvalues = dict(zip(X.columns,fvalues))
            pvalues = dict(zip(X.columns,pvalues))
            dataset = []
            p_range = [0.05,0.1,0.15,0.2,0.25]
            for p in p_range:
                mycols = list()
                for col,val in pvalues.items():
                    if val <= p:
                        mycols.append(col)
                if mycols == []:
                    res = {"features":mycols,"threshold":threshold,"pvalue":p}
                    dataset.append(res)
                    continue
                res = logistic_regression(df[mycols+[column]],threshold)
                fsol = [b for a,b in fvalues.items() if a in mycols]
                rf = np.max(fsol)
                res.update({"features":mycols,"threshold":threshold,"pvalue":p,"fvalue":rf})
                dataset.append(res)

            f_range = [0.1,0.5,1,2,5,10,100]
            for f in f_range:
                mycols = list()
                for col,val in fvalues.items():
                    if val >= f:
                        mycols.append(col)

                if mycols == []:
                    res = {"features":mycols,"threshold":threshold,"fvalue":f}
                    dataset.append(res)
                    continue
                res = logistic_regression(df[mycols+[column]],threshold)
                psol = [b for a,b in pvalues.items() if a in mycols]
                rp = np.max(psol)
                res.update({"features":mycols,"threshold":threshold,"fvalue":f,"pvalue":rp})
                dataset.append(res)
        except Exception as e:
            traceback.print_exc()
        res_df = pd.DataFrame(dataset)
        stock_result = stock_result.append(res_df,ignore_index=True)
    stock_result.to_csv(os.path.join(path,"Results\\"+str(name)+"_logistic"+".csv"),index=None,float_format="%.6f")

# Pretty Table

In [343]:
top = pd.read_csv("top.csv")

In [345]:
tcols = ['threshold', 'pvalue', 'fvalue', 'root_mean_squared_error','mean_absolute_error', 'mean_squared_error', 'accuracy','precision', 'recall', 'f1_score']
best_df_cols = ["security id","name","threshold","pvalue","fvalue"]
best_df = pd.DataFrame()
for filename in os.listdir(os.path.join(path,"Results")):
    if filename.endswith("_logistic.csv"):
        log_df = pd.read_csv(os.path.join(path,"Results\\"+filename))
        stock_name = top[top["security id"] == int(filename[:6])]["name"].values[0]
        log_df.fvalue = log_df.fvalue.apply(np.ceil)
        log_df = log_df[tcols]
        log_df.dropna(how='any',inplace=True)
        table = prettytable.PrettyTable()
        table.field_names = tcols
        table.title = stock_name + " " + filename[:-4]
        for _, row in log_df.iterrows():
            row = [round(r,6) for r in row]
            table.add_row(row)
        print(table)
        log_df["security id"] = filename[:6]
        log_df["name"] = stock_name
        for name, log in log_df.groupby(by=["threshold"]):
            log.sort_values(by=["f1_score"],ascending=[False])
            best_df = best_df.append(log[best_df_cols].head(1),ignore_index=True)
best_df.to_csv(os.path.join(path,"Results\\"+"best_logistic_model"+".csv"),index=None)

+-------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                               SBIN 500112_logistic                                                              |
+-----------+----------+--------+-------------------------+---------------------+--------------------+----------+-----------+----------+----------+
| threshold |  pvalue  | fvalue | root_mean_squared_error | mean_absolute_error | mean_squared_error | accuracy | precision |  recall  | f1_score |
+-----------+----------+--------+-------------------------+---------------------+--------------------+----------+-----------+----------+----------+
|    0.01   |   0.05   |  6.0   |         0.539116        |       0.290646      |      0.290646      | 0.709354 |    0.5    | 0.007663 | 0.015094 |
|    0.01   |   0.1    |  6.0   |         0.541177        |       0.292873      |      0.292873      | 0.707127 

In [348]:
best_model = prettytable.PrettyTable()
best_model.field_names = best_df_cols
best_model.title = "logistic regression best model for each stock"
for _, row in best_df.iterrows():
    row = [round(r,6) if isinstance(r,float) else r for r in row]
    best_model.add_row(row)
print(best_model)

+---------------------------------------------------------------------------+
|               logistic regression best model for each stock               |
+-------------+-------------------------------+-----------+--------+--------+
| security id |              name             | threshold | pvalue | fvalue |
+-------------+-------------------------------+-----------+--------+--------+
|    500112   |              SBIN             |    0.01   |  0.05  |  6.0   |
|    500112   |              SBIN             |    0.02   |  0.05  |  8.0   |
|    500112   |              SBIN             |    0.03   |  0.05  |  15.0  |
|    500112   |              SBIN             |    0.04   |  0.05  |  23.0  |
|    500112   |              SBIN             |    0.05   |  0.05  |  45.0  |
|    500112   |              SBIN             |    0.06   |  0.05  |  80.0  |
|    500112   |              SBIN             |    0.07   |  0.05  | 124.0  |
|    500112   |              SBIN             |    0.08   |  0.0