In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras.layers import Dense, Dropout
from tensorflow.keras import layers
from keras.layers import Activation
from keras.models import Sequential

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from math import sqrt

import prettytable
from multiprocessing.pool import ThreadPool


In [33]:
import warnings; warnings.simplefilter('ignore')

In [34]:
def pre_process_data(data,null_threshold):
    """
    Drops Date and Unix Date columns from the data.
    Drops the columns which has null values more than specified null_threshold.
    Replaces infinite values with NAN.
    Drops the rows which has null values.

    Parameters
    ----------
    data : dataframe

    null_threshold : numeric
        numeric value describing the amount of null values that can be present.

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    """
    
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    total = data.shape[0]
    for col in data.columns:
        if null_threshold * total / 100 < data[col].isnull().sum():
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    # data = data.apply(pd.to_numeric,errors='coerce')
    data.dropna(axis=0,inplace=True)
    return data

In [35]:
path = "F:\PracticumProject\StockAnalysisTool\Data\Stock\gr500180.csv"

In [36]:
df = pd.read_csv(path)
df = pre_process_data(df,60)

In [37]:
df.shape

(2987, 111)

In [38]:
def dependent_column(data,column):
    """
    Removes all the Next Day columns.
    Removes all the non Growth Rate Columns (GR)
    add the predictor column to list of columns.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the predictor column 

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    column : string
        name of the predictor column
    """
    cols = [col for col in data.columns if "next" not in col.lower() and col.lower().endswith("gr")]
    cols.append(column)
    data = data[cols]
    return (data,column)

In [39]:
combinations = list()
def get_combinations(arr):
    n = len(arr)
    indices = [0 for i in range(n)]
    while (1):
        res = list()
        for i in range(n):
            res.append(arr[i][indices[i]])
        combinations.append(res)
        next = n - 1
        while (next >= 0 and
              (indices[next] + 1 >= len(arr[next]))):
            next-=1
        if (next < 0):
            return
        indices[next] += 1
        for i in range(next + 1, n):
            indices[i] = 0
u = [25,75,50,100]
b = [25,32,48,64]
e = [25,50,75,100]
t = [0.01,0.02,0.03,0.04]
arr = [u,b,e,t]
get_combinations(arr)

In [40]:
def create_model(X_train,layers=3,units = 32):
    model = Sequential()
    model.add(Dense(units=units,input_dim=(X_train.shape[1]),activation = 'relu'))
    model.add(Dense(units=units,activation = 'relu'))
#     model.add(Dropout(0.2))
#     for i in range(layers):
#         model.add(Dense(units=units,activation = 'relu'))
#         model.add(Dropout(0.2))
    model.add(Dense(units = 1,activation = 'sigmoid'))
    return model

In [41]:
def create_confusion_matrix(y_pred,y_true):
    
    cm = confusion_matrix(y_true,y_pred)
    accuracy = metrics.accuracy_score(y_true,y_pred)
    precision = metrics.precision_score(y_true,y_pred)
    recall = metrics.recall_score(y_true,y_pred)
    f1_score = metrics.f1_score(y_true,y_pred)
    return {"accuracy":accuracy,"precision":precision,"recall":recall,"f1_score":f1_score,"confusion matrix":cm}

In [43]:
def back_propagation_neural_network_classification(df,column = "Next Day Close Price GR",layers=3,epochs = 5,batch_size = 32,units=50,threshold = 0.01):
    df["Target"] = df[column].apply(lambda x : 1 if x >= threshold else 0)
    print(df.shape)
    X = df.drop(columns=["Target",column])
    Y = df["Target"]
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,random_state = 0)
    # return (x_train, x_test, y_train, y_test)
    x_train = np.array(x_train)
    x_train = np.reshape(x_train,(x_train.shape[0],x_train.shape[1]))
    x_test = np.array(x_test)
    x_test = np.reshape(x_test,(x_test.shape[0],x_test.shape[1]))
    y_test = np.array(y_test)
    y_test = np.reshape(y_test,(y_test.shape[0],1))
    
    model = create_model(x_train,layers,units)
    model.compile(loss="binary_crossentropy", optimizer="Adam", metrics=['accuracy'])
    history = model.fit(x_train,y_train,epochs = epochs,batch_size = batch_size, validation_data = (x_test,y_test),shuffle = False,verbose=0)

    y_pred = model.predict(x_test)
    y_pred = np.array(y_pred)
    y_pred = np.reshape(y_pred,(y_pred.shape[0],1)).round()
    
    myres = create_confusion_matrix(y_pred,y_test)
    rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)
    myres.update({"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"rsquared_adj":r2})
    myres.update({"threshold":threshold})
    myres.update({"batch_size":batch_size,"epochs":epochs,"units":units})
    print("done")
    return myres

In [48]:
# res  = back_propagation_neural_network_classification(df)
column = "Next Day Close Price GR"
df,column = dependent_column(df,column)

In [49]:
res  = back_propagation_neural_network_classification(df)


(2987, 22)
done


In [51]:
res

{'accuracy': 0.7971014492753623,
 'precision': 0.3333333333333333,
 'recall': 0.04,
 'f1_score': 0.07142857142857142,
 'confusion matrix': array([[708,  14],
        [168,   7]], dtype=int64),
 'root_mean_squared_error': 0.45044261646145084,
 'mean_absolute_error': 0.2028985507246377,
 'mean_squared_error': 0.2028985507246377,
 'rsquared_adj': -0.29207756232686966,
 'threshold': 0.01,
 'batch_size': 32,
 'epochs': 5,
 'units': 50}

In [24]:
(x_train, x_test, y_train, y_test) = back_propagation_neural_network_classification(df)

(2987, 112)


In [28]:
y_test.to_csv('y_test.csv')
y_train.to_csv('y_train.csv')

In [None]:
layers = 3
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(filename)
        filepath = os.path.join(dirname, filename)
        df = pd.read_csv(filepath)
        column = "Next Day Close Price GR"
        df = pre_process_data(df,60)
        (df,column) = dependent_column(df,column)
        arguments = list()
        for units,batch_size,epochs,threshold in combinations:
            data = [df,column,layers,epochs,batch_size,units,threshold]
            arguments.append(data)
        try:
            threads = ThreadPool(4)
            result = threads.starmap(back_propagation_neural_network_classification,arguments)
            resultdf = pd.DataFrame(result)
            resultdf.to_csv(os.path.join(os.getcwd(),str(filename[2:8])+"_bpnn_class"+".csv"),index=None)
        except:
            pass


In [2]:
path = os.path.join(os.getcwd(),"Results")
path

'f:\\PracticumProject\\StockAnalysisTool\\Results'

In [3]:
best_bpnn = pd.DataFrame()
for filename in os.listdir(path):
    if filename.endswith("bpnn_class.csv"):
        df = pd.read_csv(os.path.join(path,filename))
        columns = ['threshold', 'batch_size', 'epochs', 'units' ,'accuracy', 'precision', 'recall', 'f1_score','confusion matrix']
        df = df[columns]
        table = prettytable.PrettyTable()
        table.field_names = columns
        table.title = str(filename[:6]) + " bpnn classification"
        for _,row in df.iterrows():
            row = [round(r,6) if isinstance(r,(float,int)) else r for r in row]
            table.add_row(row)
        print(table)
        df["security id"] = filename[:6]
        for name, log in df.groupby(by=["threshold"]):
            log = log.sort_values(by=["f1_score"],ascending=[False])
            # cols = log.columns.tolist()
            # log = log[[cols[-1]]+cols[:-1]]
            best_bpnn = best_bpnn.append(log.head(1),ignore_index=True)
best_bpnn.to_csv(os.path.join(path,"_best_bpnn"+".csv"),index=None)

+---------------------------------------------------------------------------------------------------------+
|                                        500112 bpnn classification                                       |
+-----------+------------+--------+-------+----------+-----------+----------+----------+------------------+
| threshold | batch_size | epochs | units | accuracy | precision |  recall  | f1_score | confusion matrix |
+-----------+------------+--------+-------+----------+-----------+----------+----------+------------------+
|    0.01   |     25     |   25   |   25  | 0.701559 |  0.294118 | 0.019157 | 0.035971 |    [[625  12]    |
|           |            |        |       |          |           |          |          |    [256   5]]    |
|    0.02   |     25     |   25   |   25  | 0.849666 |    1.0    | 0.007353 | 0.014599 |    [[762   0]    |
|           |            |        |       |          |           |          |          |    [135   1]]    |
|    0.03   |     25     |  

KeyError: "['confusion matrix'] not in index"

In [4]:
columns = ['security id','threshold', 'batch_size', 'epochs', 'units' ,'accuracy', 'precision', 'recall', 'f1_score','confusion matrix']
best_bpnn = best_bpnn[columns]
besttable = prettytable.PrettyTable()
besttable.field_names = columns
besttable.title = "best bpnn classification"

for _,row in best_bpnn.iterrows():
    row = [round(r,6) if isinstance(r,(float,int)) else r for r in row]
    besttable.add_row(row)
print(besttable)

+-----------------------------------------------------------------------------------------------------------------------+
|                                                best bpnn classification                                               |
+-------------+-----------+------------+--------+-------+----------+-----------+----------+----------+------------------+
| security id | threshold | batch_size | epochs | units | accuracy | precision |  recall  | f1_score | confusion matrix |
+-------------+-----------+------------+--------+-------+----------+-----------+----------+----------+------------------+
|    500112   |    0.01   |     25     |   75   |  100  | 0.678174 |    0.4    | 0.214559 | 0.279302 |    [[553  84]    |
|             |           |            |        |       |          |           |          |          |    [205  56]]    |
|    500112   |    0.02   |     25     |  100   |  100  | 0.824053 |  0.288462 | 0.110294 | 0.159574 |    [[725  37]    |
|             |         