In [1]:
import os
import gc  #This is garbage collector 
import sys #System 
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle
        
# from sklearn.preprocessing import StandardScaler as scale
# from sklearn.decomposition import PCA
# from sklearn.cluster import k_means


from random import seed
from random import randint
seed(1)



In [2]:
def read_data():
    df = pd.read_csv("Inputs/train.csv")
    return df 

def convert_to_pickle(df, loc_with_name):
    with open(loc_with_name, 'wb') as fp:
        pickle.dump(df, fp)

def read_pickle(loc_with_name):
    with open (loc_with_name, 'rb') as fp:
        df = pickle.load(fp)
    return df 

def preprocess(df):
    #Dataset length 
    org_len = len(df)

    #Saving memory by changing the dtype
    for i in df:
        if df[i].dtype == np.float64:
            if (((df[i] < .0001) & (df[i] > -.0001)).mean()) < .001:
                df[i] = df[i].astype(np.float32)
                gc.collect()

    #Changing the data types 
    df.date = df.date.astype(np.int16)
    df.ts_id = df.ts_id.astype(np.int32)
    df.feature_0 = df.feature_0.astype(np.int32)

    #Sorting with respect to date
    df.sort_values(by = ['date','ts_id'],inplace = True)

    #Create a action column - 1 if the resp is >0 and 0 if resp < 0 
    df['action'] = np.where(df['resp'] > 0 , 1 , 0 )
    df.action = df.action.astype("category")
    
    return df


def return_per_day(resp, weight, action):
    returns = np.multiply(np.multiply(resp, weight), action)
    return sum(returns)

def sharpe_score(Pi_list):
    i = len(Pi_list)
    num = sum(Pi_list)
    den = np.sqrt(sum([i ** 2 for i in Pi_list]))
    
    return (num/den) * np.sqrt(252/i)

def generate_random_block(data, size):
    date_start = int(min(data.date))
    date_end = int(max(data.date)- size)
    
    block_start = randint(date_start, date_end)
    block_end = block_start + size 
    
    block_data = data.loc[data.date.isin([i for i in range(block_start, block_end +1 )])]
    
    return block_data
    
def generate_sequential_blocks(data, size):
    n_blocks = len(data)//size 
    date_start = int(min(data.date))
    date_end = int(max(data.date)- size)
    
    blocks_list = []
    
    for i in range(date_start, date_end):
        block_start = i 
        block_end = i+ size
        block_data = data.loc[data.date.isin([i for i in range(block_start, block_end +1 )])]
        
        blocks_list.append(block_data)
    return blocks_list


def split_train_test(sub_data, split):
    val = int(sub_data.date.nunique() * split)
    date_start = int(min(sub_data.date))
    date_end = int(max(data.date))
    train = sub_data.loc[sub_data.date.isin([i for i in range(date_start, date_start + val +1 )])]
    test = sub_data.loc[sub_data.date.isin([i for i in range( date_start + val +1,date_end)])]
    
    return train , test 


def standardize(X_train, y_train, X_test, y_test, do_y):
    mean = np.mean(X_train)
    std = np.std(X_train)
    X_train = (X_train - mean)/std
    X_test = (X_test - mean)/std
    
    if do_y == True : 
        mean_y = np.mean(y_train)
        std_y = np.std(y_train)
        y_train = (y_train - mean_y)/std_y
        y_test = (y_test - mean_y)/std_y
    
    return X_train, y_train, X_test, y_test

In [3]:
#This reads full data 
#data = read_pickle( 'Inputs/sub_train')
data = pd.read_csv("Inputs/train.csv")
full_data = data


data = full_data 
data = preprocess(data)
data = data[['date','resp', 'action', 'weight']]
data


#Running for full data at once

#For each of the block in this list, there are 30 days , for each day calcualte the daily returns 
sharpe_list = []

for date in full_data.date.unique():
    today = block[block.date == date]
    Pi = (return_per_day(np.array(today['resp']), np.array(today['weight']), np.array(today['action'])))
    Pi_list.append(Pi)


sharpe = sharpe_score(Pi_list)
sharpe

KeyboardInterrupt: 

## Taking just 100 days of data

In [3]:
#This is just 100 days data
sub_data = read_pickle( 'Inputs/sub_train')
#data = pd.read_csv("Inputs/train.csv")
full_sub_data = sub_data
sub_data = preprocess(sub_data)
sub_data = sub_data[['date','resp', 'action', 'weight']]
sub_data

Unnamed: 0,date,resp,action,weight
579319,100,-0.002914,0,8.901968
579320,100,-0.017439,0,0.827245
579321,100,0.021681,1,0.239052
579322,100,-0.000452,0,0.000000
579323,100,0.009154,1,0.493212
...,...,...,...,...
964131,200,0.007735,1,0.000000
964132,200,-0.000410,0,2.257632
964133,200,-0.004426,0,59.208312
964134,200,-0.001416,0,38.369943


In [15]:
#Consider the full sub data and generate continuous blocks of size 30 days - batch wise implementation 
size = 99 
block_list = generate_sequential_blocks(sub_data, size)
print(len(block_list))

#For each of the block in this list, there are 30 days , for each day calcualte the daily returns 
sharpe_list = []

for block in block_list:
    Pi_list = []
    for date in block.date.unique():
        today = block[block.date == date]
        Pi = (return_per_day(np.array(today['resp']), np.array(today['weight']), np.array(today['action'])))
        Pi_list.append(Pi)
    
        
    sharpe = sharpe_score(Pi_list)
    sharpe_list.append(sharpe)
    
sharpe_list

1


[13.625347917277761]

# Splitting data into test and train 


In [67]:
train, test = split_train_test(sub_data, split = 0.7)

In [68]:
X_train = train[['resp','weight']]
y_train = train['action']
train_date = train['date']

X_test = test[['resp','weight']]
y_test = test['action']
test_date = test['date']

In [69]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((267402, 2), (267402,), (117415, 2), (117415,))

In [73]:
#Standardize the data  
X_train, y_train, X_test, y_test = standardize(X_train, y_train, X_test, y_test, do_y = False)


# Sending the input features through a NN model to get resp, resp1,2,3,4

In [7]:
full_sub_data.isna().sum()

date              0
weight            0
resp_1            0
resp_2            0
resp_3            0
               ... 
feature_127    1614
feature_128     412
feature_129     412
ts_id             0
action            0
Length: 139, dtype: int64

In [8]:
full_sub_data = preprocess(full_sub_data)
full_sub_data = full_sub_data.fillna(0)

In [9]:
full_sub_data.isna().sum()

date           0
weight         0
resp_1         0
resp_2         0
resp_3         0
              ..
feature_127    0
feature_128    0
feature_129    0
ts_id          0
action         0
Length: 139, dtype: int64

In [10]:
#Split into train and test 

train, test = split_train_test(full_sub_data, split = 0.8)
train.shape, test.shape


NameError: name 'data' is not defined

In [108]:
X_train = train[train.columns.difference(['date', 'weight','resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id', 'action'])]
y_train = train[['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']]
train_date = train['date']

X_test = test[test.columns.difference(['date', 'weight','resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id', 'action'])]
y_test = test[['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']]
test_date = test['date']

In [110]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((303064, 130), (303064, 5), (81753, 130), (81753, 5))

In [None]:
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)

#alpha=0.15 for full data 7 day lag

#alpha=1, 7 day lag, datasplit=0.96, use sum of footfall and technical indicators: (0.670577801048615, 0.7402082686703921)



regr = MLPRegressor(random_state=1, max_iter=500, alpha=1).fit(X_train, y_train)
y_pred_train = regr.predict(X_train)
y_pred = regr.predict(X_test)

print(regr.score(X_train, y_train), regr.score(X_test, y_test))
MSE_error = mean_squared_error(y_train, y_pred_train)
print('Train data Mean Squared Error is {}'.format(MSE_error))

MSE_error = mean_squared_error(y_test, y_pred)
print('Test data Mean Squared Error is {}'.format(MSE_error))


# If predicted resp * weight is more than 0.5 then predict action as 1. 

# Use the predicted action to find the sharpe of the data

In [None]:
#Running for full data at once

#For each of the block in this list, there are 30 days , for each day calcualte the daily returns 
for date in full_data.date.unique():
    today = block[block.date == date]
    Pi = (return_per_day(np.array(today['resp']), np.array(today['weight']), np.array(today['action'])))
    Pi_list.append(Pi)


sharpe = sharpe_score(Pi_list)
sharpe