# Set Up the Environment

In [1]:
import sys
import os

import sagemaker
from sagemaker import get_execution_role

# Add the parent directory to the sys.path
# (inserted at position 0 so it is searched before every other entry).
sys.path.insert(0, os.path.abspath(".."))

# Define IAM role
role = get_execution_role()
# NOTE(review): this bare expression is not the last statement of the cell,
# so the notebook does not display the role here.
role

# Establish S3 bucket connection
import boto3

# `s3` is the client used at the end of the notebook to upload the pickle;
# `bucket` is the destination bucket name for that upload.
s3 = boto3.client("s3")
bucket = "capstone-bucket-4-friends"

# Take a look at current dir
print(os.getcwd())

# Local helper module; presumably resolved through the parent directory added
# to sys.path above — confirm. `s3_download` is not used in the cells shown here.
from file_utilities import s3_download

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
/home/sagemaker-user/capstone-2024-summer/src/rachel


# Import Libraries

In [2]:
# standard libraries
import numpy as np
import pandas as pd
import calendar

# visualization
import matplotlib.pyplot as plt
!pip install seaborn -q
import seaborn as sns

# model
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# to save the data as pickle
import pickle

2024-07-24 02:35:13.568497: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Dataset

In [3]:
# Read the training and validation frames (in that order) from local parquet files.
train_df, val_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_train.parquet",
        "/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_val.parquet",
    )
)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,...,day_of_week_x,day_of_week_y,day_of_month_x,day_of_month_y,day_of_year_x,day_of_year_y,month_of_year_x,month_of_year_y,week_of_year_x,week_of_year_y
0,2018-01-10,10104,ORACLE CORP,N,51,0.33319,1.0,0.0,0.16283,0.140955,...,1.0,0.3568959,0.948902,0.279803,0.585647,0.99261,0.75,0.933013,0.617495,0.985999
1,2018-01-11,10104,ORACLE CORP,N,51,0.359834,1.0,0.0,0.135664,0.143305,...,0.615957,5.5511150000000004e-17,0.895388,0.193947,0.594114,0.991063,0.75,0.933013,0.617495,0.985999
2,2018-01-12,10104,ORACLE CORP,N,51,0.386453,1.0,0.0,0.200822,0.152075,...,0.0,0.0,0.825686,0.120621,0.602553,0.98937,0.75,0.933013,0.617495,0.985999
3,2018-01-16,10104,ORACLE CORP,N,51,0.355195,1.0,0.0,0.218657,0.153328,...,0.862937,0.8019377,0.449416,0.002565,0.63598,0.981155,0.75,0.933013,0.674177,0.968682
4,2018-01-17,10104,ORACLE CORP,N,51,0.393682,1.0,0.0,0.310242,0.163978,...,1.0,0.3568959,0.350318,0.02293,0.644243,0.978743,0.75,0.933013,0.674177,0.968682


In [4]:
# List every column of the training frame; the feature lists defined later
# (NUMERICALS / CATEGORICALS) are taken from these names.
train_df.columns

Index(['date', 'permno_id', 'company_name', 'primary_exchange', 'naics_sector',
       'return_scaled', 'shares_outstanding_scaled', 'num_trades_scaled',
       'volume_scaled', 'close_price_scaled', 'market_cap_scaled',
       'volatility_7_scaled', 'sector_weighted_avg_return_scaled',
       'sector_simple_avg_return_scaled', 'day_of_week_x', 'day_of_week_y',
       'day_of_month_x', 'day_of_month_y', 'day_of_year_x', 'day_of_year_y',
       'month_of_year_x', 'month_of_year_y', 'week_of_year_x',
       'week_of_year_y'],
      dtype='object')

# Select only 200 permnos

In [5]:
# Fix the global NumPy seed so the same subset is drawn on every run.
np.random.seed(1234)

# Draw 200 distinct permnos from those present in the training frame.
available_permnos = train_df['permno_id'].unique()
permnos_ids = np.random.choice(available_permnos, size=200, replace=False)
display(permnos_ids)

# Restrict both splits to the sampled permnos.
train_keep = train_df['permno_id'].isin(permnos_ids)
val_keep = val_df['permno_id'].isin(permnos_ids)
train_df = train_df[train_keep]
val_df = val_df[val_keep]

array(['25785', '21178', '11674', '12084', '92157', '24205', '76226',
       '18143', '14277', '14882', '17700', '84381', '52978', '12558',
       '22206', '59328', '80286', '48725', '89866', '84262', '75694',
       '52230', '87541', '22103', '85663', '13964', '77178', '42200',
       '15408', '30681', '58683', '16649', '15488', '17307', '87055',
       '89626', '43350', '19751', '23393', '87137', '77037', '89070',
       '18411', '44329', '80320', '19393', '92614', '87445', '18312',
       '63467', '12476', '86783', '42906', '92221', '86996', '90227',
       '86339', '67598', '43449', '22779', '89017', '42585', '14702',
       '85631', '18940', '11403', '79323', '53613', '91103', '19561',
       '90454', '87432', '83443', '77702', '81736', '24053', '84769',
       '81774', '79678', '14579', '90880', '86964', '11762', '12308',
       '52695', '59300', '66157', '16678', '23579', '84129', '13407',
       '91277', '14714', '66384', '88436', '21371', '76795', '79785',
       '75828', '862

# Prepare rolling windows of length WINDOW_SIZE

In [6]:
# Number of consecutive rows that make up one input window in prepare_data.
WINDOW_SIZE = 128
# Unique permno ids of the (already sub-sampled) training frame; prepare_data
# iterates over this list for both the training and the validation split.
PERMNO_IDs = train_df['permno_id'].unique()

In [7]:
# Columns that are windowed over time and fed to the model as sequences.
NUMERICALS = [
    # per-security scaled columns
    'return_scaled',
    'shares_outstanding_scaled',
    'num_trades_scaled',
    'volume_scaled',
    'close_price_scaled',
    'market_cap_scaled',
    'volatility_7_scaled',
    # sector-level average returns
    'sector_weighted_avg_return_scaled',
    'sector_simple_avg_return_scaled',
    # calendar columns, each stored as an _x / _y pair
    'day_of_week_x',
    'day_of_week_y',
    'day_of_month_x',
    'day_of_month_y',
    'day_of_year_x',
    'day_of_year_y',
    'month_of_year_x',
    'month_of_year_y',
    'week_of_year_x',
    'week_of_year_y',
]

# Columns for which a single value per sample is kept (no time dimension).
CATEGORICALS = [
    'permno_id',
    'company_name',
    'primary_exchange',
    'naics_sector',
]

In [8]:
# Prepare the data using rolling window
def prepare_data(df, window_size, permno_ids=None, numerical_cols=None,
                 categorical_cols=None, target_col='return_scaled'):
    """Build rolling-window inputs and next-row targets, one permno at a time.

    For every permno, each run of ``window_size`` consecutive rows becomes one
    sample and the ``target_col`` value of the row right after the window
    becomes its target. A permno with ``n`` rows therefore contributes
    ``n - window_size`` samples (none if ``n <= window_size``).

    Rows are used in the order they appear in ``df``; nothing is sorted here.
    NOTE(review): this assumes each permno's rows are already in chronological
    order — confirm for the input frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'permno_id'``, ``target_col`` and every column listed in
        ``numerical_cols`` / ``categorical_cols``.
    window_size : int
        Number of consecutive rows per sample; must be at least 1.
    permno_ids : iterable, optional
        Permnos to process, in output order. Defaults to the notebook-level
        ``PERMNO_IDs``.
    numerical_cols : list of str, optional
        Columns to window. Defaults to the notebook-level ``NUMERICALS``.
    categorical_cols : list of str, optional
        Columns for which only the value at the last row of each window is
        kept. Defaults to the notebook-level ``CATEGORICALS``.
    target_col : str, optional
        Column providing the target value (default ``'return_scaled'``).

    Returns
    -------
    numerical_x : dict[str, numpy.ndarray]
        One array of shape ``(n_samples, window_size, 1)`` per numerical column.
    categorical_x : dict[str, numpy.ndarray]
        One array of shape ``(n_samples, 1)`` per categorical column.
    y : numpy.ndarray
        Targets with shape ``(n_samples, 1)``.

    When no permno has more than ``window_size`` rows, every requested column
    is still present in the dictionaries, mapped to an array with zero rows.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Fall back to the notebook-level configuration when not given explicitly.
    if permno_ids is None:
        permno_ids = PERMNO_IDs
    if numerical_cols is None:
        numerical_cols = NUMERICALS
    if categorical_cols is None:
        categorical_cols = CATEGORICALS

    # Per-column accumulators; keys are created up front so the output always
    # exposes every requested column, even when no sample is produced.
    numerical_parts = {col: [] for col in numerical_cols}
    categorical_parts = {col: [] for col in categorical_cols}
    targets = []

    for permno in permno_ids:
        df_permno = df[df['permno_id'] == permno]
        n_samples = len(df_permno) - window_size
        if n_samples <= 0:
            # Not enough rows for one full window plus a following target row.
            continue

        for col in numerical_cols:
            # Go through Python scalars so NumPy infers the dtype from the
            # values (e.g. a float32 column yields float64 windows).
            values = np.array(df_permno[col].tolist())
            windows = np.lib.stride_tricks.sliding_window_view(values, window_size)
            # The final window has no following row to predict, so drop it.
            numerical_parts[col].append(windows[:n_samples])

        for col in categorical_cols:
            # Use the category closest to the target: the last row of each window.
            categorical_parts[col].extend(df_permno[col].tolist()[window_size - 1:-1])

        # Next-row target for each window.
        targets.extend(df_permno[target_col].tolist()[window_size:])

    numerical_x = {}
    for col, parts in numerical_parts.items():
        if parts:
            stacked = np.concatenate(parts, axis=0)
        else:
            stacked = np.empty((0, window_size))
        # Add a trailing feature axis: (n_samples, window_size, 1).
        numerical_x[col] = stacked.reshape((stacked.shape[0], stacked.shape[1], 1))

    categorical_x = {}
    for col, labels in categorical_parts.items():
        labels = np.array(labels)
        categorical_x[col] = labels.reshape((labels.shape[0], 1))

    y = np.array(targets)
    y = y.reshape((y.shape[0], 1))

    return numerical_x, categorical_x, y

# Window both splits with the same window length.
train_numerical_x, train_categorical_x, train_y = prepare_data(df=train_df, window_size=WINDOW_SIZE)
val_numerical_x, val_categorical_x, val_y = prepare_data(df=val_df, window_size=WINDOW_SIZE)

In [9]:
# Summarise the training arrays: feature counts, per-feature shapes, target shape.
print("---- training data ----")
print("--> num_numerical_features:", len(train_numerical_x))
for feature_name, feature_values in train_numerical_x.items():
    print(feature_name, feature_values.shape)
print("--> num_categorical_features:", len(train_categorical_x))
for feature_name, feature_values in train_categorical_x.items():
    print(feature_name, feature_values.shape)
print("--> target_shape:", train_y.shape)

---- training data ----
--> num_numerical_features: 19
return_scaled (219323, 128, 1)
shares_outstanding_scaled (219323, 128, 1)
num_trades_scaled (219323, 128, 1)
volume_scaled (219323, 128, 1)
close_price_scaled (219323, 128, 1)
market_cap_scaled (219323, 128, 1)
volatility_7_scaled (219323, 128, 1)
sector_weighted_avg_return_scaled (219323, 128, 1)
sector_simple_avg_return_scaled (219323, 128, 1)
day_of_week_x (219323, 128, 1)
day_of_week_y (219323, 128, 1)
day_of_month_x (219323, 128, 1)
day_of_month_y (219323, 128, 1)
day_of_year_x (219323, 128, 1)
day_of_year_y (219323, 128, 1)
month_of_year_x (219323, 128, 1)
month_of_year_y (219323, 128, 1)
week_of_year_x (219323, 128, 1)
week_of_year_y (219323, 128, 1)
--> num_categorical_features: 4
permno_id (219323, 1)
company_name (219323, 1)
primary_exchange (219323, 1)
naics_sector (219323, 1)
--> target_shape: (219323, 1)


In [10]:
# Summarise the validation arrays: feature counts, per-feature shapes, target shape.
print("---- validation data ----")
print("--> num_numerical_features:", len(val_numerical_x))
for feature_name, feature_values in val_numerical_x.items():
    print(feature_name, feature_values.shape)
print("--> num_categorical_features:", len(val_categorical_x))
for feature_name, feature_values in val_categorical_x.items():
    print(feature_name, feature_values.shape)
print("--> target_shape:", val_y.shape)

---- validation data ----
--> num_numerical_features: 19
return_scaled (49498, 128, 1)
shares_outstanding_scaled (49498, 128, 1)
num_trades_scaled (49498, 128, 1)
volume_scaled (49498, 128, 1)
close_price_scaled (49498, 128, 1)
market_cap_scaled (49498, 128, 1)
volatility_7_scaled (49498, 128, 1)
sector_weighted_avg_return_scaled (49498, 128, 1)
sector_simple_avg_return_scaled (49498, 128, 1)
day_of_week_x (49498, 128, 1)
day_of_week_y (49498, 128, 1)
day_of_month_x (49498, 128, 1)
day_of_month_y (49498, 128, 1)
day_of_year_x (49498, 128, 1)
day_of_year_y (49498, 128, 1)
month_of_year_x (49498, 128, 1)
month_of_year_y (49498, 128, 1)
week_of_year_x (49498, 128, 1)
week_of_year_y (49498, 128, 1)
--> num_categorical_features: 4
permno_id (49498, 1)
company_name (49498, 1)
primary_exchange (49498, 1)
naics_sector (49498, 1)
--> target_shape: (49498, 1)


# Combine the data into one dictionary

In [11]:
# Each split groups its arrays under 'numerical_x', 'categorical_x' and 'y';
# the target is wrapped in a dict so all three groups are dicts of arrays.
train_dict = {
    'numerical_x': train_numerical_x,
    'categorical_x': train_categorical_x,
    'y': {'y': train_y},
}

val_dict = {
    'numerical_x': val_numerical_x,
    'categorical_x': val_categorical_x,
    'y': {'y': val_y},
}

# Top-level container holding both splits.
merge_dict = {
    'train_dict': train_dict,
    'val_dict': val_dict,
}

# Save the merged dictionary as pkl for future use

In [12]:
# Serialize the nested train/val dictionary to a local pickle file
# (binary write mode; the file is closed when the `with` block exits).
with open('/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_dict.pkl', 'wb') as file:
  pickle.dump(merge_dict, file)

In [13]:
# Read the pickle back from the same path to check the round trip.
# NOTE: despite its name, `merged_df` holds whatever object was pickled at this
# path (the nested dictionary written by the previous cell), not a DataFrame.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_dict.pkl', 'rb') as file:
  merged_df = pickle.load(file)

In [14]:
# Check that the reloaded object matches the in-memory one level by level:
# same keys at each of the three nesting levels, then same shape and values.
assert merge_dict.keys() == merged_df.keys()

for split_name, expected_split in merge_dict.items():
    loaded_split = merged_df[split_name]
    assert expected_split.keys() == loaded_split.keys()
    for group_name, expected_group in expected_split.items():
        loaded_group = loaded_split[group_name]
        assert expected_group.keys() == loaded_group.keys()
        for array_name, expected_array in expected_group.items():
            loaded_array = loaded_group[array_name]
            assert expected_array.shape == loaded_array.shape
            assert np.array_equal(expected_array, loaded_array)

In [15]:
# Upload the local pickle to the project bucket under the CRSP/ prefix.
s3.upload_file(
    Filename="/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_dict.pkl",
    Bucket=bucket,
    Key="CRSP/crsp_rachel_dict.pkl",
)