In [190]:
import numpy as np
import pandas as pd
from time import time
from io import StringIO
import math

import matplotlib.pyplot as plt

from datetime import timedelta
import sagemaker_pyspark, boto3
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType, StringType, TimestampType, StructType, StructField
import json
from IPython.display import display # Allows the use of display() for displaying DataFrames


from sagemaker import get_execution_role
BUCKET = 'innovationday-467664929633/OutOfSample'
OVERRIDE_PATH = None
DATA_LOCATION = 's3a://{}'.format(BUCKET)
role = get_execution_role()
pd.set_option('display.max_columns', 500)
region = boto3.Session().region_name
classpath = ":".join(sagemaker_pyspark.classpath_jars())
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath).getOrCreate()

In [182]:
#cbbc_daily_summary.csv
cbbc_daily_summary = pd.read_csv(f'{DATA_LOCATION}/cbbc_daily_summary_OOS.csv')
display(cbbc_daily_summary.head())
cbbc_daily_summary.info()

cbbc_daily_summary['date'] =  pd.to_datetime(cbbc_daily_summary['date'])



#2019-05-23
# cbbc_daily_summary = cbbc_daily_summary[cbbc_daily_summary['date'] < '2019-05-23']
#2019-06-13
# cbbc_daily_summary = cbbc_daily_summary[cbbc_daily_summary['date'] < '2019-06-13']
#2019-07-05
cbbc_daily_summary = cbbc_daily_summary[cbbc_daily_summary['date'] < '2019-07-05']


Columns (34) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,OutstandingPct,TotalIssueSize,TradingCurrency,DayHigh,DayLow,ClosingPrice,Volume,Turnover,Issuer,Underlying,BullBear,CBBCType,CBBCCategory,ListingDate,LastTradingDate,MaturityDate,MCE,Strike_CallCurrency,StrikeLevel,CallLevel,EntRatio,DelistingDate,ldt,Unnamed: 33,Unnamed: 34
0,5/2/2019,57003.hk,57003,UB#HSI RC2010B,5/2/2019,0.0,0.0,0.0,0.0,20000.0,0.01,200000000.0,HKD,0.0,0.0,0.45,0.0,0.0,UB,HSI,Bull,Standard,R,11/6/2018,,10/29/2020,N,-,24038.0,24138.0,12000,,2019-05-03D19:30:13.011137000,,
1,5/2/2019,57008.hk,57008,GS#TENCTRC1906O,5/2/2019,100000.0,0.53,0.0,0.0,9320000.0,9.32,100000000.0,HKD,0.53,0.53,0.53,100000.0,53000.0,GS,700,Bull,Standard,R,3/7/2019,,6/28/2019,N,HKD,336.98,338.98,100,,2019-05-03D19:30:13.011137000,,
2,5/2/2019,57010.hk,57010,BP#TENCTRC1907T,5/2/2019,100000.0,0.57,-100000.0,-0.57,1170000.0,2.34,50000000.0,HKD,0.57,0.57,0.59,200000.0,114000.0,BP,700,Bull,Standard,R,3/7/2019,,7/30/2019,N,HKD,332.5,335.0,100,,2019-05-03D19:30:13.011137000,,
3,5/2/2019,57016.hk,57016,JP#HSI RC2009V,5/2/2019,0.0,0.0,0.0,0.0,0.0,0.0,200000000.0,HKD,0.0,0.0,0.47,0.0,0.0,JP,HSI,Bull,Standard,R,11/6/2018,,9/29/2020,N,-,24458.0,24558.0,10000,,2019-05-03D19:30:13.011137000,,
4,5/2/2019,57017.hk,57017,JP#HSI RC2010E,5/2/2019,0.0,0.0,0.0,0.0,120000.0,0.06,200000000.0,HKD,0.0,0.0,0.57,0.0,0.0,JP,HSI,Bull,Standard,R,11/6/2018,,10/29/2020,N,-,23358.0,23458.0,10000,,2019-05-03D19:30:13.011137000,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91025 entries, 0 to 91024
Data columns (total 35 columns):
date                   91025 non-null object
sym                    91025 non-null object
CBBCCode               91025 non-null int64
CBBCName               91025 non-null object
TradeDate              91025 non-null object
CBBCsBought            80843 non-null float64
AveragePriceBought     80843 non-null float64
CBBCsSold              80843 non-null float64
AveragePriceSold       80843 non-null float64
Outstanding            80843 non-null float64
OutstandingPct         80843 non-null float64
TotalIssueSize         86945 non-null float64
TradingCurrency        91025 non-null object
DayHigh                80859 non-null float64
DayLow                 80859 non-null float64
ClosingPrice           73033 non-null float64
Volume                 80859 non-null float64
Turnover               80859 non-null float64
Issuer                 91025 non-null object
Underlying             91

In [139]:
cbbc_daily_summary['CBBCCode'].unique()

array([57003, 57008, 57010, ..., 60323, 60337, 60339])

In [183]:
cbbc_daily_summary = cbbc_daily_summary.fillna(0)
# add few column
trimData = cbbc_daily_summary.copy()

# add net sales column
trimData['target'] = trimData.apply(lambda row: -1 * ((row['CBBCsBought'] + row['CBBCsSold']) * row['ClosingPrice']) / row['EntRatio'], axis = 1)

#add the days left from the maturity date and the days past the listing date columns
trimData['DaysPastListingDate'] = (pd.to_datetime(trimData['TradeDate']) - pd.to_datetime(trimData['ListingDate'])).dt.days
trimData['DaysFromMaturityDate'] = (pd.to_datetime(trimData['MaturityDate']) - pd.to_datetime(trimData['TradeDate'])).dt.days
trimData['MaturityDaysFromIssuance'] = (pd.to_datetime(trimData['MaturityDate']) - pd.to_datetime(trimData['ListingDate'])).dt.days

# spread
trimData['Spread'] = trimData['AveragePriceSold'] - trimData['AveragePriceBought']
trimData['Spread'] = trimData['Spread'].abs()

# call level relative to strike level
trimData['callVsStrike'] = (trimData['CallLevel'] - trimData['StrikeLevel']) / trimData['StrikeLevel']
trimData['callVsStrike'] = trimData['callVsStrike'].abs()

# print
display(trimData.head())

Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,OutstandingPct,TotalIssueSize,TradingCurrency,DayHigh,DayLow,ClosingPrice,Volume,Turnover,Issuer,Underlying,BullBear,CBBCType,CBBCCategory,ListingDate,LastTradingDate,MaturityDate,MCE,Strike_CallCurrency,StrikeLevel,CallLevel,EntRatio,DelistingDate,ldt,Unnamed: 33,Unnamed: 34,target,DaysPastListingDate,DaysFromMaturityDate,MaturityDaysFromIssuance,Spread,callVsStrike
0,2019-05-02,57003.hk,57003,UB#HSI RC2010B,5/2/2019,0.0,0.0,0.0,0.0,20000.0,0.01,200000000.0,HKD,0.0,0.0,0.45,0.0,0.0,UB,HSI,Bull,Standard,R,11/6/2018,0,10/29/2020,N,-,24038.0,24138.0,12000,0,2019-05-03D19:30:13.011137000,0.0,0,-0.0,177,546,723,0.0,0.00416
1,2019-05-02,57008.hk,57008,GS#TENCTRC1906O,5/2/2019,100000.0,0.53,0.0,0.0,9320000.0,9.32,100000000.0,HKD,0.53,0.53,0.53,100000.0,53000.0,GS,700,Bull,Standard,R,3/7/2019,0,6/28/2019,N,HKD,336.98,338.98,100,0,2019-05-03D19:30:13.011137000,0.0,0,-530.0,56,57,113,0.53,0.005935
2,2019-05-02,57010.hk,57010,BP#TENCTRC1907T,5/2/2019,100000.0,0.57,-100000.0,-0.57,1170000.0,2.34,50000000.0,HKD,0.57,0.57,0.59,200000.0,114000.0,BP,700,Bull,Standard,R,3/7/2019,0,7/30/2019,N,HKD,332.5,335.0,100,0,2019-05-03D19:30:13.011137000,0.0,0,-0.0,56,89,145,1.14,0.007519
3,2019-05-02,57016.hk,57016,JP#HSI RC2009V,5/2/2019,0.0,0.0,0.0,0.0,0.0,0.0,200000000.0,HKD,0.0,0.0,0.47,0.0,0.0,JP,HSI,Bull,Standard,R,11/6/2018,0,9/29/2020,N,-,24458.0,24558.0,10000,0,2019-05-03D19:30:13.011137000,0.0,0,-0.0,177,516,693,0.0,0.004089
4,2019-05-02,57017.hk,57017,JP#HSI RC2010E,5/2/2019,0.0,0.0,0.0,0.0,120000.0,0.06,200000000.0,HKD,0.0,0.0,0.57,0.0,0.0,JP,HSI,Bull,Standard,R,11/6/2018,0,10/29/2020,N,-,23358.0,23458.0,10000,0,2019-05-03D19:30:13.011137000,0.0,0,-0.0,177,546,723,0.0,0.004281


In [185]:
# feature aggregation
trainData = trimData.groupby(
    [
        'CBBCCode',
        'TotalIssueSize',
        'Underlying',
        'BullBear',
        'StrikeLevel',
        'callVsStrike',
        'EntRatio',
        'MaturityDaysFromIssuance'
    ], as_index = True
).agg(
    {
         # find the first trade date
        'TradeDate': [min],
        'TotalIssueSize': 'first',
        'Underlying': 'first',
        'BullBear': 'first',
        'StrikeLevel': 'first',
        'callVsStrike': 'first',
        'EntRatio': 'first',
        'MaturityDaysFromIssuance': 'first',
         # target as time series
        'target': lambda x: list(x)
    }
)
trainData.columns = ["_".join(x) for x in trainData.columns.ravel()]

# column renaming
trainData.rename(
    {
        'TradeDate_min': 'start',
        'target_<lambda>': 'target',
        'TotalIssueSize_first': 'TotalIssueSize',
        'Underlying_first': 'Underlying',
        'BullBear_first': 'BullBear',
        'StrikeLevel_first': 'StrikeLevel',
        'callVsStrike_first': 'callVsStrike',
        'EntRatio_first': 'EntRatio',
        'MaturityDaysFromIssuance_first': 'MaturityDaysFromIssuance'
    },
    axis=1, inplace=True
)

def categoriseTotalIssueSize(row):
    return math.floor(row['TotalIssueSize'] / math.pow(10, 9) * 2)

def categoriseUnderlying(row):
    return 1 if row['Underlying'] == 'HSI' else 0

def categoriseBullBear(row):
    return 1 if row['BullBear'] == 'Bull' else 0

def categoriseStrike(row):
    if row['StrikeLevel'] < 500:
        return math.floor(row['StrikeLevel'] / 100)
    elif row['StrikeLevel'] < 10000:
        return 5
    else:
        return 4 + math.floor(row['StrikeLevel']/5000)
    
def categoriseCallVstrike(row):
    if row['callVsStrike'] < 15:
        return math.floor(row['callVsStrike'] / 2.5)
    else:
        return 3 + math.floor(row['callVsStrike']/5)
    
def categoriseEntRatio(row):
    if row['EntRatio'] < 1000:
        return math.floor(row['EntRatio'] / 500)
    elif row['EntRatio'] < 10000:
        return 2
    elif row['EntRatio'] < 17500:
        return math.floor(row['EntRatio']/2500) - 1
    else:
        return 6

def categoriseMaturityPeriod(row):
    if row['MaturityDaysFromIssuance'] < 100:
        return 0
    elif row['MaturityDaysFromIssuance'] < 300:
        return math.floor(row['MaturityDaysFromIssuance'] / 50) - 1
    else:
        return math.floor(row['MaturityDaysFromIssuance']/100) + 2


# convert the grouped columns into categories ie. values from 0 to x
trainData['TotalIssueSize'].fillna(0, inplace=True)
trainData['TotalIssueSize'] = trainData.apply(categoriseTotalIssueSize, axis = 1)

trainData['Underlying'] = trainData.apply(categoriseUnderlying, axis = 1)

trainData['BullBear'] = trainData.apply(categoriseBullBear, axis = 1)

trainData['StrikeLevel'] = trainData.apply(categoriseStrike, axis = 1)

trainData['callVsStrike'] = trainData.apply(categoriseCallVstrike, axis = 1)

trainData['EntRatio'] = trainData.apply(categoriseEntRatio, axis=1)

trainData['MaturityDaysFromIssuance'] = trainData.apply(categoriseMaturityPeriod, axis=1)


# convert all the grouped categories into the arry
trainData['cat'] = trainData.apply(lambda row: [
    row['TotalIssueSize'], row['Underlying'], row['BullBear'], row['StrikeLevel'],
    row['callVsStrike'], row['EntRatio'], row['MaturityDaysFromIssuance']
], axis = 1)

trainData = trainData.fillna(0)
# drop the grouped columns
trainData.drop(columns = [
    'TotalIssueSize',
    'Underlying',
    'BullBear',
    'StrikeLevel',
    'callVsStrike',
    'EntRatio',
    'MaturityDaysFromIssuance'
], inplace=True)

# date transformation
trainData['start'] = pd.to_datetime(trainData['start'])
trainData['start'] = trainData['start'].dt.strftime('%Y-%m-%d %H:%M:%S')

trainData.info()
trainData.iloc[[890, 3243, 2754, 275, 3582, 327, 360, 2586, 1033, 2045, 908, 86, 1793, 645, 1606, 3547, 1884, 2736, 1973, 255, 211, 3300, 3600, 1605, 2101, 2877, 3431, 29, 1324, 2506, 3438, 32, 1767, 3816, 623, 1711, 2171, 104, 1304, 2965, 309, 3766, 3718, 2296, 1714, 3815, 4054, 3447, 2500, 1172]]

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7242 entries, (50000, 200000000.0, HSI, Bear, 29058.0, 0.003441393075917131, 12000, 154) to (69998, 120000000.0, HSI, Bull, 23958.0, 0.004173971116119876, 10000, 437)
Data columns (total 3 columns):
start     7242 non-null object
target    7242 non-null object
cat       7242 non-null object
dtypes: object(3)
memory usage: 336.7+ KB


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,start,target,cat
CBBCCode,TotalIssueSize,Underlying,BullBear,StrikeLevel,callVsStrike,EntRatio,MaturityDaysFromIssuance,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
58478,120000000.0,HSI,Bull,27758.0,0.003603,10000,648,2019-05-10 00:00:00,"[-0.0, -0.0, -41.44, -3.84, -0.0, 7.353, -8.19...","[0, 1, 1, 9, 0, 3, 8]"
62381,120000000.0,HSI,Bull,24958.0,0.004007,10000,687,2019-05-10 00:00:00,"[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0....","[0, 1, 1, 8, 0, 3, 8]"
61430,0.0,HSI,Bull,25805.0,0.003875,10000,682,2019-06-13 00:00:00,"[-0.0, -0.0]","[0, 1, 1, 9, 0, 3, 8]"
57002,0.0,HSI,Bear,27488.0,0.003638,12000,259,2019-06-13 00:00:00,"[-0.0, -0.0]","[0, 1, 0, 9, 0, 3, 4]"
63085,0.0,HSI,Bull,25300.0,0.003953,10000,686,2019-06-13 00:00:00,"[-0.0, -0.0]","[0, 1, 1, 9, 0, 3, 8]"
57095,250000000.0,HSI,Bull,29008.0,0.003447,10000,736,2019-05-02 00:00:00,"[17.891999999999996, 8.363999999999999, 3.82]","[0, 1, 1, 9, 0, 3, 9]"
57189,0.0,700,Bear,427.5,0.005848,100,217,2019-06-13 00:00:00,"[-0.0, -0.0]","[0, 0, 0, 4, 0, 0, 3]"
61157,200000000.0,HSI,Bear,27638.0,0.003618,10000,285,2019-06-19 00:00:00,[-0.0],"[0, 1, 0, 9, 0, 3, 4]"
58705,200000000.0,HSI,Bull,28258.0,0.003539,10000,891,2019-05-21 00:00:00,[-0.0],"[0, 1, 1, 9, 0, 3, 10]"
60245,0.0,HSI,Bear,28348.0,0.003528,10000,161,2019-06-13 00:00:00,"[-0.0, -0.0]","[0, 1, 0, 9, 0, 3, 2]"


In [184]:
# test = trainData[:2]
import json
import sagemaker
sagemaker_session = sagemaker.Session()
predictor = sagemaker.predictor.RealTimePredictor('hackathonEndPoint')
request = json.dumps({"instances": trainData.to_dict(orient='records'), 
"configuration": {"output_types": ["mean"], "num_samples": 100}})
# print(request)
# request = json.dumps({"instances": [
#     {
#         "start": "2019-05-10 00:00:00",
#         "target": [-2117.5000000000005, 295.0, -630.0, -0.0, 1953.0, -200.0, -0.0, -44.5, 2268.0000000000005, -0.0, -56.00000000000001, -0.0, -0.0, -0.0],
#         "cat": [0]
#     }, {
#         "start": "2019-05-10 00:00:00",
#         "target": [670.0, -840.0, -0.0, -0.0],
#         "cat": [0]
#     }
# ], "configuration": {"output_types": ["mean"], "num_samples": 100}})
result = predictor.predict(request)
print(result)

b'{"predictions":[{"mean":[-0.0787139013,0.000854061,-0.076974228,-0.0756392404,0.0239129756]},{"mean":[451.0969848633,-475.852935791,-1225.5933837891,377.3759460449,161.0537109375]},{"mean":[225.2730407715,-258.4789428711,302.2981872559,-548.7396850586,67.1028594971]},{"mean":[-0.2762190998,-0.6410415173,-2.4402461052,-0.3980470002,-1.1821898222]},{"mean":[0.1152480692,-0.0297712684,-0.175951004,-1.052995801,1.1475377083]},{"mean":[0.0011363761,-0.0047468115,-0.0032654419,-0.00233012,-0.0014936881]},{"mean":[-0.9931823611,-1.0237674713,-0.4619675577,-1.0960855484,-1.259750843]},{"mean":[-0.6540664434,0.1655372828,0.1733155698,-0.2352447361,0.430518806]},{"mean":[-1.1145169735,-0.8199790716,-1.0307254791,-1.2262681723,-1.1244331598]},{"mean":[0.0001566287,-0.0008517688,-0.001264903,-0.0007266037,-0.0001557987]},{"mean":[4.9458479881,-11.0298728943,48.9676704407,-95.7456741333,167.0205078125]},{"mean":[-1.6396173239,-1.5691417456,-1.3048442602,0.4033091366,-2.9593417645]},{"mean":[-0.19

In [None]:
max_pred = 0
for i in result.predictions:
    

In [119]:
import json
import sagemaker

In [120]:
sagemaker_session = sagemaker.Session()
predictor = sagemaker.predictor.RealTimePredictor('predict')

In [127]:
request = json.dumps({"instances": [ {"start": "2016-01-15 00:00:00", "cat": [1], "target": [8.371208085491508, 8.38437885371535, 8.860699073980985, 8.047195011672134, 9.42771383264719, 8.02120332304575, 9.839234913116105, 9.237618947392374, 8.214949470821212, 9.814497679561292, 9.052164695305954, 8.102437854966766, 8.928941871965348, 9.844116398312188, 9.221646100693144, 8.853571486995326, 8.560903044968434, 8.240263518568812, 9.221323908588538, 9.448381346299827, 9.996678314417732, 8.520757726306975, 9.978841260562627, 9.196420806291513, 9.587904493744922, 9.367880938747199, 9.606228859687628, 9.277298500001638, 8.694011829622228, 8.264125277439893]}], 
"configuration": {"output_types": ["mean"], "num_samples": 100}})
print(request)

{"instances": [{"start": "2016-01-15 00:00:00", "cat": [1], "target": [8.371208085491508, 8.38437885371535, 8.860699073980985, 8.047195011672134, 9.42771383264719, 8.02120332304575, 9.839234913116105, 9.237618947392374, 8.214949470821212, 9.814497679561292, 9.052164695305954, 8.102437854966766, 8.928941871965348, 9.844116398312188, 9.221646100693144, 8.853571486995326, 8.560903044968434, 8.240263518568812, 9.221323908588538, 9.448381346299827, 9.996678314417732, 8.520757726306975, 9.978841260562627, 9.196420806291513, 9.587904493744922, 9.367880938747199, 9.606228859687628, 9.277298500001638, 8.694011829622228, 8.264125277439893]}], "configuration": {"output_types": ["mean"], "num_samples": 100}}


In [122]:
prediction = predictor.predict(request)
print(prediction)

b'{"predictions":[{"mean":[1.1462774277,0.7114387751]}]}'


## Preparing Test Data

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from math import sqrt
import sys
import boto3
from sagemaker import get_execution_role
# role = get_execution_role()
# bucket='innovationday-467664929633/OutOfSample'
# data_location = 's3://{}'.format(bucket)
# #cbbc_daily_summary.csv
# cbbc_daily_summary = pd.read_csv(f'{data_location}/cbbc_daily_summary_OOS.csv')
# #UnderlierIntradayOHLC.csv
# underlier_intraday = pd.read_csv(f'{data_location}/UnderlierIntradayOHLC_OOS.csv')
# #productTurnOver.csv
# product_turn_over = pd.read_csv(f'{data_location}/productTurnOver_OOS.csv')
# #underlying_stats.csv
# underlying_stats = pd.read_csv(f'{data_location}/underlying_stats_OOS.csv')



Columns (34) have mixed types. Specify dtype option on import or set low_memory=False.



FileNotFoundError: innovationday-467664929633/OutOfSample/productTurnOver_OOS.csv

## Join Tables

In [95]:
def make_frame(filename):
    role = get_execution_role()
    bucket='innovationday-467664929633'
    data_location = 's3://{}'.format(bucket)
    return pd.read_csv(f'{data_location}/{filename}')

In [96]:
#change file names for outOfSample
# iidata = make_frame("cbbc_daily_summary.csv")
iidata = make_frame("OutOfSample/cbbc_daily_summary_OOS.csv")
iidata = iidata.loc[:,~iidata.columns.str.match('Unnamed')]

# intraday = make_frame("UnderlierIntradayOHLC.csv")
intraday = make_frame("OutOfSample/UnderlierIntradayOHLC_OOS.csv")

# underlier = make_frame("underlying_stats.csv")
underlier = make_frame("OutOfSample/underlying_statsOOS.csv")


Columns (34) have mixed types. Specify dtype option on import or set low_memory=False.



In [97]:
#filter non important ones
iidata = iidata[iidata['MCE'] == 'N']
iidata = iidata[iidata['ClosingPrice'].notnull()]
iidata = iidata[iidata['Turnover'] != 0]

#Clean data
underlier['date'] = underlier['vd'].map(lambda x: x.replace(".", "-"))
intraday['date'] = intraday['time'].map(lambda x: x[:10])
intraday['time'] = intraday['time'].map(lambda x: x[11:])
intraday['index'] = intraday['id'].map(lambda x: ".HSI" if x == 'hsi' else '0700.HK')
iidata["index"] = iidata["Underlying"].map(lambda x: ".HSI" if x == 'hsi' else '0700.HK')
iiodata = iidata.merge(underlier, left_on = ['index', 'date'], right_on = ['id','date'])
output = iiodata.merge(intraday, left_on = ['index', 'date'], right_on = ['index','date'])

In [98]:
output.head()

Unnamed: 0,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,OutstandingPct,...,rawvs,id_y,time,op,lp,hp,ccp,intvs,date,index


In [103]:
iidata.columns

Index(['date', 'sym', 'CBBCCode', 'CBBCName', 'TradeDate', 'CBBCsBought',
       'AveragePriceBought', 'CBBCsSold', 'AveragePriceSold', 'Outstanding',
       'OutstandingPct', 'TotalIssueSize', 'TradingCurrency', 'DayHigh',
       'DayLow', 'ClosingPrice', 'Volume', 'Turnover', 'Issuer', 'Underlying',
       'BullBear', 'CBBCType', 'CBBCCategory', 'ListingDate',
       'LastTradingDate', 'MaturityDate', 'MCE', 'Strike_CallCurrency',
       'StrikeLevel', 'CallLevel', 'EntRatio', 'DelistingDate', 'ldt',
       'index'],
      dtype='object')

In [76]:
temp = make_frame("cbbc_daily_summary.csv")
train_cols  = temp.columns
temp2 = make_frame("OutOfSample/cbbc_daily_summary_OOS.csv")
sample_cols = temp2.columns


Columns (34) have mixed types. Specify dtype option on import or set low_memory=False.



In [87]:
zz = make_frame("OutOfSample/cbbc_daily_summary_OOS.csv")
zz.loc[:,~zz.columns.str.match('Unnamed')]


Columns (34) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,...,ListingDate,LastTradingDate,MaturityDate,MCE,Strike_CallCurrency,StrikeLevel,CallLevel,EntRatio,DelistingDate,ldt
0,5/2/2019,57003.hk,57003,UB#HSI RC2010B,5/2/2019,0.0,0.000000,0.0,0.000000,20000.0,...,11/6/2018,,10/29/2020,N,-,24038.00,24138.00,12000,,2019-05-03D19:30:13.011137000
1,5/2/2019,57008.hk,57008,GS#TENCTRC1906O,5/2/2019,100000.0,0.530000,0.0,0.000000,9320000.0,...,3/7/2019,,6/28/2019,N,HKD,336.98,338.98,100,,2019-05-03D19:30:13.011137000
2,5/2/2019,57010.hk,57010,BP#TENCTRC1907T,5/2/2019,100000.0,0.570000,-100000.0,-0.570000,1170000.0,...,3/7/2019,,7/30/2019,N,HKD,332.50,335.00,100,,2019-05-03D19:30:13.011137000
3,5/2/2019,57016.hk,57016,JP#HSI RC2009V,5/2/2019,0.0,0.000000,0.0,0.000000,0.0,...,11/6/2018,,9/29/2020,N,-,24458.00,24558.00,10000,,2019-05-03D19:30:13.011137000
4,5/2/2019,57017.hk,57017,JP#HSI RC2010E,5/2/2019,0.0,0.000000,0.0,0.000000,120000.0,...,11/6/2018,,10/29/2020,N,-,23358.00,23458.00,10000,,2019-05-03D19:30:13.011137000
5,5/2/2019,57019.hk,57019,JP#HSI RC2009Z,5/2/2019,0.0,0.000000,0.0,0.000000,10000.0,...,11/6/2018,,9/29/2020,N,-,23158.00,23258.00,10000,,2019-05-03D19:30:13.011137000
6,5/2/2019,57023.hk,57023,JP#HSI RC2010F,5/2/2019,0.0,0.000000,0.0,0.000000,0.0,...,11/6/2018,,10/29/2020,N,-,22958.00,23058.00,10000,,2019-05-03D19:30:13.011137000
7,5/2/2019,57025.hk,57025,JP#HSI RC2009W,5/2/2019,0.0,0.000000,0.0,0.000000,0.0,...,11/6/2018,,9/29/2020,N,-,22858.00,22958.00,10000,,2019-05-03D19:30:13.011137000
8,5/2/2019,57027.hk,57027,JP#HSI RC2010C,5/2/2019,0.0,0.000000,0.0,0.000000,0.0,...,11/6/2018,,10/29/2020,N,-,22658.00,22758.00,10000,,2019-05-03D19:30:13.011137000
9,5/2/2019,57028.hk,57028,JP#HSI RC2011S,5/2/2019,30000.0,0.790000,0.0,0.000000,50000.0,...,11/6/2018,,11/27/2020,N,-,20900.00,21000.00,10000,,2019-05-03D19:30:13.011137000


In [85]:
temp2.head()

Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,...,MaturityDate,MCE,Strike_CallCurrency,StrikeLevel,CallLevel,EntRatio,DelistingDate,ldt,Unnamed: 33,Unnamed: 34
0,5/2/2019,57003.hk,57003,UB#HSI RC2010B,5/2/2019,0.0,0.0,0.0,0.0,20000.0,...,10/29/2020,N,-,24038.0,24138.0,12000,,2019-05-03D19:30:13.011137000,,
1,5/2/2019,57008.hk,57008,GS#TENCTRC1906O,5/2/2019,100000.0,0.53,0.0,0.0,9320000.0,...,6/28/2019,N,HKD,336.98,338.98,100,,2019-05-03D19:30:13.011137000,,
2,5/2/2019,57010.hk,57010,BP#TENCTRC1907T,5/2/2019,100000.0,0.57,-100000.0,-0.57,1170000.0,...,7/30/2019,N,HKD,332.5,335.0,100,,2019-05-03D19:30:13.011137000,,
3,5/2/2019,57016.hk,57016,JP#HSI RC2009V,5/2/2019,0.0,0.0,0.0,0.0,0.0,...,9/29/2020,N,-,24458.0,24558.0,10000,,2019-05-03D19:30:13.011137000,,
4,5/2/2019,57017.hk,57017,JP#HSI RC2010E,5/2/2019,0.0,0.0,0.0,0.0,120000.0,...,10/29/2020,N,-,23358.0,23458.0,10000,,2019-05-03D19:30:13.011137000,,


In [53]:
iiodata.head()

Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,...,Unnamed: 33,Unnamed: 34,index,id,vd,rawop,rawhp,rawlp,rawtp,rawvs


In [61]:
iidata.merge(underlier, left_on = ['index', 'date'], right_on = ['id','date'])

Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,...,DelistingDate,ldt,index,id,vd,rawop,rawhp,rawlp,rawtp,rawvs
0,2016-01-04,60018.hk,60018,CS#HSI RP1601Y,2016-01-04,0.000000e+00,0.000000,-2.000000e+04,-0.225500,90490000.0,...,2016-01-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
1,2016-01-04,60023.hk,60023,CS#HSI RP1602C,2016-01-04,8.300000e+05,0.183771,-5.100000e+05,-0.197137,130960000.0,...,2016-02-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
2,2016-01-04,60046.hk,60046,SG#HSI RP1602Z,2016-01-04,1.000000e+04,0.249000,-2.800000e+05,-0.246714,290000.0,...,2016-02-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
3,2016-01-04,60051.hk,60051,SG#HSI RP1602A,2016-01-04,2.670000e+06,0.227225,-1.620000e+06,-0.219426,1210000.0,...,2016-02-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
4,2016-01-04,60122.hk,60122,UB#HSI RP1601H,2016-01-04,1.000000e+04,0.250000,-1.100000e+05,-0.230000,4560000.0,...,2016-01-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
5,2016-01-04,60123.hk,60123,UB#HSI RP1601Q,2016-01-04,3.100000e+05,0.274355,-1.100000e+05,-0.245000,1370000.0,...,2016-01-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
6,2016-01-04,60135.hk,60135,BP#HSI RP1603K,2016-01-04,1.780000e+06,0.242433,-1.000000e+04,-0.241000,1610000.0,...,2016-03-31,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
7,2016-01-04,60136.hk,60136,BP#HSI RP1604H,2016-01-04,2.300000e+05,0.237435,-2.800000e+05,-0.265001,940000.0,...,2016-04-29,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
8,2016-01-04,60139.hk,60139,BP#HSI RP1603L,2016-01-04,7.080000e+06,0.254149,-2.000000e+06,-0.250000,1090000.0,...,2016-03-31,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213
9,2016-01-04,60157.hk,60157,HS#HSI RP1608D,2016-01-04,2.750000e+06,0.210396,-1.000000e+05,-0.217000,12710000.0,...,,2019-03-11D19:21:06.181120000,0700.HK,0700.HK,2016.01.04,151.4,152.0,148.7,149.1,16047213


In [60]:
iidata

Unnamed: 0,date,sym,CBBCCode,CBBCName,TradeDate,CBBCsBought,AveragePriceBought,CBBCsSold,AveragePriceSold,Outstanding,...,LastTradingDate,MaturityDate,MCE,Strike_CallCurrency,StrikeLevel,CallLevel,EntRatio,DelistingDate,ldt,index
0,2016-01-04,60018.hk,60018,CS#HSI RP1601Y,2016-01-04,0.0,0.000000,-20000.0,-0.225500,90490000.0,...,2016-01-27,2016-01-28,N,-,23800.0,23600.0,10000,2016-01-29,2019-03-11D19:21:06.181120000,0700.HK
1,2016-01-04,60023.hk,60023,CS#HSI RP1602C,2016-01-04,830000.0,0.183771,-510000.0,-0.197137,130960000.0,...,2016-02-25,2016-02-26,N,-,23725.0,23475.0,12000,2016-02-29,2019-03-11D19:21:06.181120000,0700.HK
3,2016-01-04,60046.hk,60046,SG#HSI RP1602Z,2016-01-04,10000.0,0.249000,-280000.0,-0.246714,290000.0,...,2016-02-25,2016-02-26,N,-,23788.0,23688.0,10000,2016-02-29,2019-03-11D19:21:06.181120000,0700.HK
4,2016-01-04,60051.hk,60051,SG#HSI RP1602A,2016-01-04,2670000.0,0.227225,-1620000.0,-0.219426,1210000.0,...,2016-02-25,2016-02-26,N,-,23588.0,23488.0,10000,2016-02-29,2019-03-11D19:21:06.181120000,0700.HK
6,2016-01-04,60122.hk,60122,UB#HSI RP1601H,2016-01-04,10000.0,0.250000,-110000.0,-0.230000,4560000.0,...,2016-01-27,2016-01-28,N,-,23850.0,23650.0,10000,2016-01-29,2019-03-11D19:21:06.181120000,0700.HK
7,2016-01-04,60123.hk,60123,UB#HSI RP1601Q,2016-01-04,310000.0,0.274355,-110000.0,-0.245000,1370000.0,...,2016-01-27,2016-01-28,N,-,24000.0,23800.0,10000,2016-01-29,2019-03-11D19:21:06.181120000,0700.HK
8,2016-01-04,60135.hk,60135,BP#HSI RP1603K,2016-01-04,1780000.0,0.242433,-10000.0,-0.241000,1610000.0,...,2016-03-29,2016-03-30,N,-,23700.0,23500.0,10000,2016-03-31,2019-03-11D19:21:06.181120000,0700.HK
9,2016-01-04,60136.hk,60136,BP#HSI RP1604H,2016-01-04,230000.0,0.237435,-280000.0,-0.265001,940000.0,...,2016-04-27,2016-04-28,N,-,23800.0,23600.0,10000,2016-04-29,2019-03-11D19:21:06.181120000,0700.HK
11,2016-01-04,60139.hk,60139,BP#HSI RP1603L,2016-01-04,7080000.0,0.254149,-2000000.0,-0.250000,1090000.0,...,2016-03-29,2016-03-30,N,-,23900.0,23700.0,10000,2016-03-31,2019-03-11D19:21:06.181120000,0700.HK
12,2016-01-04,60157.hk,60157,HS#HSI RP1608D,2016-01-04,2750000.0,0.210396,-100000.0,-0.217000,12710000.0,...,,2016-08-30,N,-,24188.0,23988.0,15000,,2019-03-11D19:21:06.181120000,0700.HK


## Writing Merged dataframe to S3

In [67]:
from io import StringIO # python3; python2: BytesIO 
import boto3

csv_buffer = StringIO()
output.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object('innovationday-467664929633', 'mergedTable.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'CF9FEE55AEFCC9AE',
  'HostId': '02GGnN+lHy6SqwRsMuXATWomSy8m8IPO3yBgY/EI9wsh/0DzgNvfOP2cjvcoBOZS8uYnLEiMo2o=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '02GGnN+lHy6SqwRsMuXATWomSy8m8IPO3yBgY/EI9wsh/0DzgNvfOP2cjvcoBOZS8uYnLEiMo2o=',
   'x-amz-request-id': 'CF9FEE55AEFCC9AE',
   'date': 'Wed, 17 Jul 2019 06:44:10 GMT',
   'etag': '"755bb7da8c60495d0c051c2db48ad006"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"755bb7da8c60495d0c051c2db48ad006"'}