In [1]:
import os
import gc  #This is garbage collector 
import sys #System 
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler as scale
from sklearn.decomposition import PCA
from sklearn.cluster import k_means


from random import seed
from random import randint
seed(1)



In [68]:
def read_data():
    df = pd.read_csv("Inputs/train.csv")
    return df 

def convert_to_pickle(df, loc_with_name):
    with open(loc_with_name, 'wb') as fp:
        pickle.dump(df, fp)

def read_pickle(loc_with_name):
    with open (loc_with_name, 'rb') as fp:
        df = pickle.load(fp)
    return df 

def preprocess(df):
    #Dataset length 
    org_len = len(df)

    #Saving memory by changing the dtype
    for i in df:
        if df[i].dtype == np.float64:
            if (((df[i] < .0001) & (df[i] > -.0001)).mean()) < .001:
                df[i] = df[i].astype(np.float32)
                gc.collect()

    #Changing the data types 
    df.date = df.date.astype(np.int16)
    df.ts_id = df.ts_id.astype(np.int32)
    df.feature_0 = df.feature_0.astype(np.int32)

    #Sorting with respect to date
    df.sort_values(by = ['date','ts_id'],inplace = True)

    #Create a action column - 1 if the resp is >0 and 0 if resp < 0 
    df['action'] = np.where(df['resp']>0 , 1 , 0 )
    df.action = df.action.astype("category")
    
    return df


def return_per_day(resp, weight, action):
    returns = np.multiply(np.multiply(resp, weight), action)
    return sum(returns)

def sharpe_score(Pi_list):
    i = len(Pi_list)
    num = sum(Pi_list)
    den = np.sqrt(sum([i ** 2 for i in Pi_list]))
    
    return num/den * np.sqrt(252/i)

def generate_random_block(data, size):
    date_start = int(min(data.date))
    date_end = int(max(data.date)- size)
    
    block_start = randint(date_start, date_end)
    block_end = block_start + size 
    
    block_data = data.loc[data.date.isin([i for i in range(block_start, block_end +1 )])]
    
    return block_data
    
def generate_sequential_blocks(data, size):
    n_blocks = len(data)//size 
    date_start = int(min(data.date))
    date_end = int(max(data.date)- size)
    
    blocks_list = []
    
    for i in range(date_start, date_end):
        block_start = i 
        block_end = i+ size
        block_data = data.loc[data.date.isin([i for i in range(block_start, block_end +1 )])]
        
        blocks_list.append(block_data)
    return blocks_list


In [30]:
data = read_pickle( 'Inputs/sub_train')
data = preprocess(data)
data = data[['date','resp', 'action', 'weight']]
data

In [186]:
#Consider the full data and generate continuous blocks of size 30 days 
block_list = generate_sequential_blocks(data, size)

#For each of the block in this list, there are 30 days , for each day calcualte the daily returns 
Pi_list = []
for block in block_list:
    for date in block.date.unique():
        today = block[block.date == date]
        Pi = (return_per_day(np.array(today['resp']), np.array(today['weight']), np.array(today['action'])))
        Pi_list.append(Pi)
        
sharpe = sharpe_score(Pi_list)
sharpe 

13.557598692757946

13.557598692757946

In [175]:
block = blocks_list[0]

In [176]:
block

Unnamed: 0,date,resp,action,weight
579319,100,-0.002914,0,8.901968
579320,100,-0.017439,0,0.827245
579321,100,0.021681,1,0.239052
579322,100,-0.000452,0,0.000000
579323,100,0.009154,1,0.493212
...,...,...,...,...
688399,130,-0.009760,0,66.844209
688400,130,-0.002937,0,60.742521
688401,130,0.005895,1,6.558912
688402,130,0.000853,1,49.911758


        date      resp action    weight
579319   100 -0.002914      0  8.901968
579320   100 -0.017439      0  0.827245
579321   100  0.021681      1  0.239052
579322   100 -0.000452      0  0.000000
579323   100  0.009154      1  0.493212
...      ...       ...    ...       ...
582471   100 -0.003054      0  0.000000
582472   100 -0.000199      0  0.000000
582473   100  0.007916      1  0.000000
582474   100 -0.003850      0  0.000000
582475   100 -0.001398      0  0.000000

[3157 rows x 4 columns]
        date      resp action     weight
582476   101  0.007985      1   0.121027
582477   101 -0.004715      0   3.790635
582478   101 -0.013039      0   0.000000
582479   101  0.047805      1   0.277944
582480   101  0.003788      1   0.199900
...      ...       ...    ...        ...
586593   101 -0.001045      0   0.000000
586594   101  0.002197      1   0.000000
586595   101  0.004010      1   0.000000
586596   101  0.000575      1   0.000000
586597   101  0.003852      1  42.072853

[4

array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130], dtype=int16)