# Setup the Env

In [1]:
import sys
import os

import sagemaker
from sagemaker import get_execution_role

# Add the parent directory to the sys.path
sys.path.insert(0, os.path.abspath(".."))

# Define IAM role
role = get_execution_role()
role

# Establish S3 bucket connection
import boto3

s3 = boto3.client("s3")
bucket = "capstone-bucket-4-friends"

# Take a look at current dir
print(os.getcwd())

from file_utilities import s3_download

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
/home/sagemaker-user/capstone-2024-summer/src/rachel


# Import Libraries

In [2]:
# standard libraries
import numpy as np
import pandas as pd
import calendar

# visualization
import matplotlib.pyplot as plt
!pip install seaborn -q
import seaborn as sns

# model
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

2024-07-22 19:47:22.385206: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Dataset

In [3]:
df = pd.read_parquet("/home/sagemaker-user/capstone-2024-summer/data/crsp_2018-2023_clean.parquet")

df.head()

Unnamed: 0,date,permno_id,ncusip_id,cusip_id,ticker,share_code,share_code_type,share_code_detail,exchange_code,company_name,...,close_price,return,volume,shares_outstanding,num_trades,factor_to_adjust_price,factor_to_adjust_shares,value_weighted_return,equal_weighted_return,return_on_SP_index
0,2018-01-02,10026,46603210,46603210,JJSF,11.0,1,1,3,J & J SNACK FOODS CORP,...,149.17999,-0.017454,190618,18668,1036,1.0,1.0,0.008505,0.011232,0.008303
1,2018-01-03,10026,46603210,46603210,JJSF,11.0,1,1,3,J & J SNACK FOODS CORP,...,147.69,-0.009988,63693,18668,1283,1.0,1.0,0.005856,0.005103,0.006399
2,2018-01-04,10026,46603210,46603210,JJSF,11.0,1,1,3,J & J SNACK FOODS CORP,...,149.73,0.013813,127552,18668,1138,1.0,1.0,0.004001,0.005272,0.004029
3,2018-01-05,10026,46603210,46603210,JJSF,11.0,1,1,3,J & J SNACK FOODS CORP,...,148.3,-0.00955,44647,18668,745,1.0,1.0,0.005804,0.00388,0.007034
4,2018-01-08,10026,46603210,46603210,JJSF,11.0,1,1,3,J & J SNACK FOODS CORP,...,148.41,0.000742,55014,18668,990,1.0,1.0,0.001816,0.000584,0.001662


In [4]:
df.columns

Index(['date', 'permno_id', 'ncusip_id', 'cusip_id', 'ticker', 'share_code',
       'share_code_type', 'share_code_detail', 'exchange_code', 'company_name',
       'primary_exchange', 'trading_status', 'security_status', 'naics',
       'naics_sector', 'naics_sector_name', 'naics_secondary',
       'ask_or_high_price', 'ask_price', 'bid_or_low_price', 'bid_price',
       'open_price', 'close_price', 'return', 'volume', 'shares_outstanding',
       'num_trades', 'factor_to_adjust_price', 'factor_to_adjust_shares',
       'value_weighted_return', 'equal_weighted_return', 'return_on_SP_index'],
      dtype='object')

## Keep only stocks in S&P500

In [5]:
security_df = pd.read_csv("/home/sagemaker-user/capstone-2024-summer/data/security_master.csv")
security_df.head()

Unnamed: 0,cusip,PERMNO,PERMCO,HSHRCD,DLSTCD,HTICK,HCOMNAM,HTSYMBOL,HNAICS,HPRIMEXC,...,NUMDEL,NUMNDI,BEGDAT,ENDDAT,BEGPRC,ENDPRC,BEGRET,ENDRET,BEGVOL,ENDVOL
0,00130H10,76712,10996,11,100,AES,A E S CORP,AES,221118,N,...,1,68,06/26/1991,12/29/2023,06/26/1991,12/29/2023,06/26/1991,12/29/2023,06/26/1991,12/29/2023
1,00206R10,66093,21645,11,100,T,A T & T INC,T,517312,N,...,1,0,02/16/1984,12/29/2023,02/16/1984,12/29/2023,02/16/1984,12/29/2023,02/16/1984,12/29/2023
2,00507V10,79678,12499,11,233,,ACTIVISION BLIZZARD INC,ATVI,513210,Q,...,1,1968,10/22/1993,10/12/2023,10/22/1993,10/12/2023,10/22/1993,10/12/2023,10/22/1993,10/12/2023
3,00724F10,75510,8476,11,100,ADBE,ADOBE INC,ADBE,511210,Q,...,1,2185,08/13/1986,12/29/2023,08/13/1986,12/29/2023,08/13/1986,12/29/2023,08/13/1986,12/29/2023
4,00971T10,87299,17300,11,100,AKAM,AKAMAI TECHNOLOGIES INC,AKAM,511210,Q,...,1,1722,10/29/1999,12/29/2023,10/29/1999,12/29/2023,10/29/1999,12/29/2023,10/29/1999,12/29/2023


In [6]:
security = security_df["PERMNO"].astype(str).unique()
df = df[df["permno_id"].isin(security)]

In [7]:
print(len(df))
print(df["permno_id"].nunique())

809883
554


# Clean the data

## Columns of interest:

Numerical Features - window_size=128:
1. return
3. shares_outstanding
4. num_trades
5. volume
6. close_price
7. market_cap
8. volatility
9. sector_weighted_avg_return
10. sector_simple_avg_return

Categorical Features:
1. permno_id
2. company_name
3. primary_exchange
4. naics_sector

Time Features:
1. day_of_week
2. day_of_month
3. day_of_year
4. month_of_year
5. week_of_year


In [8]:
WINDOW = 128

In [9]:
exist_cols = ["date", "permno_id", "company_name", "primary_exchange", "naics_sector", "return", "shares_outstanding", "num_trades", "volume", "close_price"]

In [10]:
df = df[exist_cols]
df.head()

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price
9276,2018-01-02,10104,ORACLE CORP,N,51,-0.013748,4139602,99,25380018,46.63
9277,2018-01-03,10104,ORACLE CORP,N,51,0.023161,4139602,99,24165000,47.71
9278,2018-01-04,10104,ORACLE CORP,N,51,0.009851,4139602,99,19815057,48.18
9279,2018-01-05,10104,ORACLE CORP,N,51,0.006019,4139602,99,14496427,48.47
9280,2018-01-08,10104,ORACLE CORP,N,51,0.010522,4139602,99,15075007,48.98


In [11]:
df.columns

Index(['date', 'permno_id', 'company_name', 'primary_exchange', 'naics_sector',
       'return', 'shares_outstanding', 'num_trades', 'volume', 'close_price'],
      dtype='object')

In [12]:
assert len(df[df.isnull().any(axis=1)]) == 0 # confirm no missing values

## Calculate the market_cap

In [13]:
df["market_cap"] = (df["shares_outstanding"] * df["close_price"]).round(6)

## Calculate the returns of the sector corresponding to each stock, calculated using the market cap grouped by sector and date

In [14]:
# First, group the data by sector & date
grouped = df.groupby(["naics_sector", "date"]).agg({"market_cap": "sum"}).reset_index()

# Then, calculate the log return for each sector
grouped["market_cap_shifted"] = grouped.groupby("naics_sector")["market_cap"].shift(1)
grouped["sector_weighted_avg_return"] = (grouped["market_cap"] / grouped["market_cap_shifted"]).round(6)

grouped.head()

Unnamed: 0,naics_sector,date,market_cap,market_cap_shifted,sector_weighted_avg_return
0,11,2018-01-02,26645463.7,,
1,11,2018-01-03,26879460.69,26645463.7,1.008782
2,11,2018-01-04,26660560.28,26879460.69,0.991856
3,11,2018-01-05,26622818.83,26660560.28,0.998584
4,11,2018-01-08,26796429.5,26622818.83,1.006521


In [15]:
# Merge the sector returns back to the original dataframe
df = df.merge(
    grouped[["date", "naics_sector", "sector_weighted_avg_return"]],
    on=["date", "naics_sector"],
    how="left",
)

## Calculate the returns of the sector corresponding to each stock, calculated using the arithmetic average stock prices in the sector

In [16]:
# First, group the data by sector & date
grouped = (
    df.groupby(["naics_sector", "date"])
    .agg({"close_price": "mean"})
    .reset_index()
    .rename(columns={"close_price": "avg_price"})
)

# Then, calculate the return for each sector
grouped["avg_price_Shifted"] = grouped.groupby("naics_sector")["avg_price"].shift(1)
grouped["sector_simple_avg_return"] = (grouped["avg_price"] / grouped["avg_price_Shifted"]).round(6)

grouped.head()

Unnamed: 0,naics_sector,date,avg_price,avg_price_Shifted,sector_simple_avg_return
0,11,2018-01-02,35.3,,
1,11,2018-01-03,35.61,35.3,1.008782
2,11,2018-01-04,35.32,35.61,0.991856
3,11,2018-01-05,35.27,35.32,0.998584
4,11,2018-01-08,35.5,35.27,1.006521


In [17]:
# Merge the sector returns back to the original dataframe
df = df.merge(
    grouped[["date", "naics_sector", "sector_simple_avg_return"]],
    on=["date", "naics_sector"],
    how="left",
)

In [18]:
df.head()

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,market_cap,sector_weighted_avg_return,sector_simple_avg_return
0,2018-01-02,10104,ORACLE CORP,N,51,-0.013748,4139602,99,25380018,46.63,193029600.0,,
1,2018-01-03,10104,ORACLE CORP,N,51,0.023161,4139602,99,24165000,47.71,197500400.0,1.006893,1.013053
2,2018-01-04,10104,ORACLE CORP,N,51,0.009851,4139602,99,19815057,48.18,199446000.0,1.004791,1.003169
3,2018-01-05,10104,ORACLE CORP,N,51,0.006019,4139602,99,14496427,48.47,200646500.0,1.010298,1.009548
4,2018-01-08,10104,ORACLE CORP,N,51,0.010522,4139602,99,15075007,48.98,202757700.0,1.002166,1.006254


## Calculate volatility for the past 7 days

In [19]:
VOLATILITY_DAYS = 7

In [20]:
# Calculate rolling standard deviation (volatility) and annualize it
df[f"volatility_{VOLATILITY_DAYS}"] = (
        df.groupby("permno_id")["return"].rolling(window=VOLATILITY_DAYS).std().reset_index(0, drop=True)
    )
df[f"volatility_{VOLATILITY_DAYS}"] = df[f"volatility_{VOLATILITY_DAYS}"] * np.sqrt(252)

In [21]:
df.head(10)

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,market_cap,sector_weighted_avg_return,sector_simple_avg_return,volatility_7
0,2018-01-02,10104,ORACLE CORP,N,51,-0.013748,4139602,99,25380018,46.63,193029600.0,,,
1,2018-01-03,10104,ORACLE CORP,N,51,0.023161,4139602,99,24165000,47.71,197500400.0,1.006893,1.013053,
2,2018-01-04,10104,ORACLE CORP,N,51,0.009851,4139602,99,19815057,48.18,199446000.0,1.004791,1.003169,
3,2018-01-05,10104,ORACLE CORP,N,51,0.006019,4139602,99,14496427,48.47,200646500.0,1.010298,1.009548,
4,2018-01-08,10104,ORACLE CORP,N,51,0.010522,4139602,99,15075007,48.98,202757700.0,1.002166,1.006254,
5,2018-01-09,10104,ORACLE CORP,N,51,0.005512,4139602,99,14153666,49.06,203088900.0,0.997675,0.998602,
6,2018-01-10,10104,ORACLE CORP,N,51,-0.0053,4139602,99,13476601,48.8,202012600.0,0.996841,0.995893,0.188118
7,2018-01-11,10104,ORACLE CORP,N,51,0.003074,4139602,99,11687801,48.95,202633500.0,1.005552,1.003784,0.137399
8,2018-01-12,10104,ORACLE CORP,N,51,0.01144,4139602,99,15978201,49.51,204951700.0,1.004325,1.007441,0.091938
9,2018-01-16,10104,ORACLE CORP,N,51,0.001616,4139602,99,17152564,49.59,205282900.0,0.994055,0.996894,0.090238


## Drop all rows with NaN (which should be around num_stocks * volatility_days = ~ 500 * 7)

In [22]:
og_len = len(df)
og_len

809883

In [23]:
num_permno = df["permno_id"].nunique()
num_permno

554

In [24]:
df = df.dropna()
new_len = len(df)
new_len

806557

In [25]:
num_permno * VOLATILITY_DAYS

3878

In [26]:
og_len - new_len

3326

## Calculate the time features

In [27]:
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_month'] = df['date'].dt.day
df['day_of_year'] = df['date'].dt.dayofyear
df['month_of_year'] = df['date'].dt.month
df['week_of_year'] = df['date'].dt.isocalendar().week
df['year'] = df['date'].dt.year

In [28]:
df.head()

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,market_cap,sector_weighted_avg_return,sector_simple_avg_return,volatility_7,day_of_week,day_of_month,day_of_year,month_of_year,week_of_year,year
6,2018-01-10,10104,ORACLE CORP,N,51,-0.0053,4139602,99,13476601,48.8,202012600.0,0.996841,0.995893,0.188118,2,10,10,1,2,2018
7,2018-01-11,10104,ORACLE CORP,N,51,0.003074,4139602,99,11687801,48.95,202633500.0,1.005552,1.003784,0.137399,3,11,11,1,2,2018
8,2018-01-12,10104,ORACLE CORP,N,51,0.01144,4139602,99,15978201,49.51,204951700.0,1.004325,1.007441,0.091938,4,12,12,1,2,2018
9,2018-01-16,10104,ORACLE CORP,N,51,0.001616,4139602,99,17152564,49.59,205282900.0,0.994055,0.996894,0.090238,1,16,16,1,3,2018
10,2018-01-17,10104,ORACLE CORP,N,51,0.013712,4139602,99,23183110,50.27,208097800.0,1.008528,1.007648,0.105486,2,17,17,1,3,2018


## Apply sin() cos() transformation to time features

In [29]:
# helper functions
def get_days_in_month(row):
    return calendar.monthrange(row["year"], row["month_of_year"])[1]

def get_days_in_year(row):
    return 366 if calendar.isleap(row["year"]) else 365

df["days_in_month"] = df.apply(get_days_in_month, axis=1)
df["days_in_year"] = df.apply(get_days_in_year, axis=1)

def transform_time_features(df, demoninator, feature):
    df[f"{feature}_x"] = np.sin(np.deg2rad(360 / demoninator) * df[feature])
    scaler = MinMaxScaler()
    df[f"{feature}_x"] = scaler.fit_transform(df[f"{feature}_x"].values.reshape(-1, 1))

    df[f"{feature}_y"] = np.cos(np.deg2rad(360 / demoninator) * df[feature])
    scaler = MinMaxScaler()
    df[f"{feature}_y"] = scaler.fit_transform(df[f"{feature}_y"].values.reshape(-1, 1))

    return df

# calculate sin() & cos() of day_of_week
df = transform_time_features(df, 7, "day_of_week")

# calculate sin() & cos() of day_of_month
df = transform_time_features(df, df["days_in_month"], "day_of_month")

# calculate sin() & cos() of day_of_year
df = transform_time_features(df, df["days_in_year"], "day_of_year")

# calculate sin() & cos() of month_of_year
df = transform_time_features(df, 12, "month_of_year")

# calculate sin() & cos() of week_of_year
df = transform_time_features(df, 53, "week_of_year")

## Look at each features to confirm validity

In [30]:
assert len(df[df.isnull().any(axis=1)]) == 0 # confirm no missing values

In [31]:
numerical_features = ["return", "shares_outstanding", "num_trades", "volume", "close_price", "market_cap", "volatility_7", "sector_weighted_avg_return", "sector_simple_avg_return"]
categorical_features = ["permno_id", "company_name", "primary_exchange", "naics_sector"]
day_cols = ["day_of_week", "day_of_month", "day_of_year", "month_of_year", "week_of_year", "year"]
day_features = ["day_of_week_x", "day_of_week_y",
                        "day_of_month_x", "day_of_month_y",
                        "day_of_year_x", "day_of_year_y",
                        "month_of_year_x", "month_of_year_y",
                        "week_of_year_x", "week_of_year_y"]

In [32]:
df[numerical_features].describe()

Unnamed: 0,return,shares_outstanding,num_trades,volume,close_price,market_cap,volatility_7,sector_weighted_avg_return,sector_simple_avg_return
count,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0
mean,0.000611,596818.8,16574.67,4547092.0,155.767178,59589380.0,0.301147,1.000576,1.000437
std,0.023593,1113347.0,62020.55,9955856.0,284.262076,146605300.0,0.226581,0.024332,0.020359
min,-0.538647,3179.0,19.0,0.0,0.2839,33787.83,0.006599,0.061346,0.330964
25%,-0.009607,140082.0,99.0,920711.0,50.9,13456830.0,0.168278,0.99359,0.993418
50%,0.0008,282598.0,99.0,1895319.0,92.48,24307250.0,0.244257,1.000889,1.000959
75%,0.010881,579798.0,14708.0,4279256.0,169.99001,51344340.0,0.359902,1.008025,1.008122
max,0.877212,17102540.0,2970000.0,536739400.0,7024.81982,3081156000.0,6.119483,11.277719,11.750482


In [33]:
df[categorical_features].describe() # confirm unique counts is reasonable

Unnamed: 0,permno_id,company_name,primary_exchange,naics_sector
count,806557,806557,806557,806557
unique,554,603,3,24
top,93436,LIBERTY MEDIA CORP 3RD NEW,N,33
freq,1503,6012,524321,175866


In [34]:
df[day_cols].describe() # confirm unique counts, min, & max are reasonable

Unnamed: 0,day_of_week,day_of_month,day_of_year,month_of_year,week_of_year,year
count,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0
mean,2.027223,15.782888,184.515235,6.569492,26.751564,2020.515497
std,1.398556,8.750984,104.270674,3.414558,14.901383,1.69817
min,0.0,1.0,2.0,1.0,1.0,2018.0
25%,1.0,8.0,94.0,4.0,14.0,2019.0
50%,2.0,16.0,186.0,7.0,27.0,2021.0
75%,3.0,23.0,275.0,10.0,40.0,2022.0
max,4.0,31.0,366.0,12.0,53.0,2023.0


In [35]:
df[day_features].describe() # confirm values between 0 & 1

Unnamed: 0,day_of_week_x,day_of_week_y,day_of_month_x,day_of_month_y,day_of_year_x,day_of_year_y,month_of_year_x,month_of_year_y,week_of_year_x,week_of_year_y
count,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0,806557.0
mean,0.564145,0.4237819,0.501686,0.494918,0.497512,0.494476,0.493744,0.493681,0.497287,0.486028
std,0.365978,0.4057138,0.35324,0.353826,0.356345,0.350691,0.354385,0.352609,0.359132,0.348081
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.307979,5.5511150000000004e-17,0.137604,0.120621,0.141158,0.145687,0.066987,0.066987,0.130921,0.140492
50%,0.615957,0.3568959,0.5,0.474675,0.491417,0.493545,0.5,0.5,0.470367,0.455124
75%,0.862937,0.8019377,0.862396,0.844483,0.855832,0.845087,0.75,0.75,0.869079,0.837349
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Preprocess the data

## Split the data to train vs validation

- Train: < 1/1/23
- Validation: >= 1/1/23 (use the last half year of 2022 to predict 2023)

In [36]:
# Split df to train/validation so that data prior to 2023 are used for training and data in 2023 are used for validation
train_df = df[df['date'] < '2023-01-01'].reset_index(drop=True)
train_df = train_df.sort_values(by=['permno_id','date']).reset_index(drop=True)

val_df = df[df['date'] >= '2022-07-01'].reset_index(drop=True)
val_df = val_df.sort_values(by=['permno_id','date']).reset_index(drop=True)

# Print the number of rows in the train and validation sets
print(f"Number of rows in train set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")

Number of rows in train set: 672127
Number of rows in validation set: 202732


## Drop permno if only exist in train but not in val (as they are no longer traded and no longer relevant). Also drop permnos if only exist in val but not in train (as they don't have training data to generate results)

In [37]:
permnos_to_drop = set(train_df['permno_id'].unique()) - set(val_df['permno_id'].unique())
train_df = train_df[~train_df['permno_id'].isin(permnos_to_drop)]

permnos_to_drop = set(val_df['permno_id'].unique()) - set(train_df['permno_id'].unique())
val_df = val_df[~val_df['permno_id'].isin(permnos_to_drop)]

# Print the number of rows in the train and validation sets after dropping
print(f"Number of rows in train set after dropping: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")

assert train_df['permno_id'].nunique() == val_df['permno_id'].nunique()

Number of rows in train set after dropping: 663301
Number of rows in validation set: 202489


## Apply MinMaxScaler to numerical features

In [38]:
permno_ids = train_df["permno_id"].unique()

In [39]:
scaled_numerical_features = [f'{col}_scaled' for col in numerical_features]

In [40]:
sample_train = train_df[train_df["permno_id"]=="10104"]
sample_val = val_df[val_df["permno_id"]=="10104"]

sample_scaler = MinMaxScaler()
sample_train[scaled_numerical_features] = sample_scaler.fit_transform(sample_train[numerical_features].values)
sample_val[scaled_numerical_features] = sample_scaler.transform(sample_val[numerical_features].values)

display(sample_train.head())
display(sample_val.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_train[scaled_numerical_features] = sample_scaler.fit_transform(sample_train[numerical_features].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_train[scaled_numerical_features] = sample_scaler.fit_transform(sample_train[numerical_features].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,...,week_of_year_y,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,market_cap_scaled,volatility_7_scaled,sector_weighted_avg_return_scaled,sector_simple_avg_return_scaled
0,2018-01-10,10104,ORACLE CORP,N,51,-0.0053,4139602,99,13476601,48.8,...,0.985999,0.33319,1.0,0.0,0.16283,0.140955,0.477564,0.087153,0.664672,0.744593
1,2018-01-11,10104,ORACLE CORP,N,51,0.003074,4139602,99,11687801,48.95,...,0.985999,0.359834,1.0,0.0,0.135664,0.143305,0.481552,0.059969,0.69034,0.765986
2,2018-01-12,10104,ORACLE CORP,N,51,0.01144,4139602,99,15978201,49.51,...,0.985999,0.386453,1.0,0.0,0.200822,0.152075,0.496443,0.035603,0.686724,0.7759
3,2018-01-16,10104,ORACLE CORP,N,51,0.001616,4139602,99,17152564,49.59,...,0.968682,0.355195,1.0,0.0,0.218657,0.153328,0.49857,0.034692,0.656463,0.747307
4,2018-01-17,10104,ORACLE CORP,N,51,0.013712,4139602,99,23183110,50.27,...,0.968682,0.393682,1.0,0.0,0.310242,0.163978,0.516651,0.042865,0.699109,0.776461


Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,...,week_of_year_y,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,market_cap_scaled,volatility_7_scaled,sector_weighted_avg_return_scaled,sector_simple_avg_return_scaled
0,2022-07-01,10104,ORACLE CORP,N,51,0.014312,2664926,99,7805555,70.87,...,0.0,0.395591,0.0,0.0,0.076704,0.486609,0.393101,0.170303,0.712372,0.802512
1,2022-07-05,10104,ORACLE CORP,N,51,0.015804,2664926,99,9043000,71.99,...,0.0,0.400339,0.0,0.0,0.095497,0.50415,0.412273,0.169969,0.719517,0.822382
2,2022-07-06,10104,ORACLE CORP,N,51,-0.004584,2664926,99,6043679,71.66,...,0.0,0.335468,0.0,0.0,0.049947,0.498982,0.406625,0.112269,0.692951,0.766688
3,2022-07-07,10104,ORACLE CORP,N,51,0.002372,2664926,99,7814621,71.83,...,0.0,0.357601,0.0,0.0,0.076842,0.501644,0.409535,0.110174,0.700906,0.801799
4,2022-07-08,10104,ORACLE CORP,N,51,0.000557,2664926,99,4927307,71.87,...,0.0,0.351826,0.0,0.0,0.032993,0.502271,0.410219,0.050265,0.660688,0.743484


In [41]:
scalers = {}
for col in numerical_features:

  # Initialize a dictionary to store scalers for each permno
  permno_scalers = {}

  # Group the training DataFrame by 'permno_id'
  for permno, group in train_df.groupby('permno_id'):
    scaler = MinMaxScaler()
    permno_scalers[permno] = scaler

    # Fit and transform the data for the current permno
    train_df.loc[group.index, f'{col}_scaled'] = scaler.fit_transform(group[col].values.reshape(-1, 1))

  # Store the dictionary of permno scalers in the main scalers dictionary
  scalers[col] = permno_scalers

  # Transform the validation data using the corresponding scaler for each permno
  for permno in val_df['permno_id'].unique():
    if permno in permno_scalers:
      scaler = permno_scalers[permno]
      val_df.loc[val_df['permno_id'] == permno, f'{col}_scaled'] = scaler.transform(val_df.loc[val_df['permno_id'] == permno, col].values.reshape(-1, 1))

In [42]:
train_df.head()

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,...,week_of_year_y,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,market_cap_scaled,volatility_7_scaled,sector_weighted_avg_return_scaled,sector_simple_avg_return_scaled
0,2018-01-10,10104,ORACLE CORP,N,51,-0.0053,4139602,99,13476601,48.8,...,0.985999,0.33319,1.0,0.0,0.16283,0.140955,0.477564,0.087153,0.664672,0.744593
1,2018-01-11,10104,ORACLE CORP,N,51,0.003074,4139602,99,11687801,48.95,...,0.985999,0.359834,1.0,0.0,0.135664,0.143305,0.481552,0.059969,0.69034,0.765986
2,2018-01-12,10104,ORACLE CORP,N,51,0.01144,4139602,99,15978201,49.51,...,0.985999,0.386453,1.0,0.0,0.200822,0.152075,0.496443,0.035603,0.686724,0.7759
3,2018-01-16,10104,ORACLE CORP,N,51,0.001616,4139602,99,17152564,49.59,...,0.968682,0.355195,1.0,0.0,0.218657,0.153328,0.49857,0.034692,0.656463,0.747307
4,2018-01-17,10104,ORACLE CORP,N,51,0.013712,4139602,99,23183110,50.27,...,0.968682,0.393682,1.0,0.0,0.310242,0.163978,0.516651,0.042865,0.699109,0.776461


In [43]:
val_df.head()

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return,shares_outstanding,num_trades,volume,close_price,...,week_of_year_y,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,market_cap_scaled,volatility_7_scaled,sector_weighted_avg_return_scaled,sector_simple_avg_return_scaled
0,2022-07-01,10104,ORACLE CORP,N,51,0.014312,2664926,99,7805555,70.87,...,0.0,0.395591,0.0,0.0,0.076704,0.486609,0.393101,0.170303,0.712372,0.802512
1,2022-07-05,10104,ORACLE CORP,N,51,0.015804,2664926,99,9043000,71.99,...,0.0,0.400339,0.0,0.0,0.095497,0.50415,0.412273,0.169969,0.719517,0.822382
2,2022-07-06,10104,ORACLE CORP,N,51,-0.004584,2664926,99,6043679,71.66,...,0.0,0.335468,0.0,0.0,0.049947,0.498982,0.406625,0.112269,0.692951,0.766688
3,2022-07-07,10104,ORACLE CORP,N,51,0.002372,2664926,99,7814621,71.83,...,0.0,0.357601,0.0,0.0,0.076842,0.501644,0.409535,0.110174,0.700906,0.801799
4,2022-07-08,10104,ORACLE CORP,N,51,0.000557,2664926,99,4927307,71.87,...,0.0,0.351826,0.0,0.0,0.032993,0.502271,0.410219,0.050265,0.660688,0.743484


## Look at each features to confirm validity

In [44]:
assert len(train_df[train_df.isnull().any(axis=1)]) == 0 # confirm no missing values
assert len(val_df[val_df.isnull().any(axis=1)]) == 0 # confirm no missing values

In [45]:
train_df[scaled_numerical_features].describe() # confirm values between 0 & 1

Unnamed: 0,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,market_cap_scaled,volatility_7_scaled,sector_weighted_avg_return_scaled,sector_simple_avg_return_scaled
count,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0
mean,0.501372,0.496149,0.055997,0.118863,0.461112,0.463761,0.153699,0.487245,0.573413
std,0.104852,0.347072,0.10655,0.10524,0.250024,0.251942,0.127156,0.128617,0.108847
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.436078,0.162966,0.0,0.047307,0.252663,0.253772,0.074926,0.43185,0.506638
50%,0.503044,0.485891,0.0,0.092369,0.45,0.45519,0.120847,0.48489,0.569286
75%,0.567411,0.835539,0.079411,0.157502,0.663222,0.669319,0.190498,0.546394,0.623822
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
val_df[numerical_features].describe() # original val_numerical

Unnamed: 0,return,shares_outstanding,num_trades,volume,close_price,market_cap,volatility_7,sector_weighted_avg_return,sector_simple_avg_return
count,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0
mean,0.000679,644753.8,22462.43,4729965.0,168.083514,68764300.0,0.286588,1.000729,1.000546
std,0.021079,1224429.0,82633.91,10479260.0,315.513035,178698300.0,0.179728,0.021602,0.013845
min,-0.396707,3179.0,99.0,0.0,0.2839,57201.02,0.006599,0.292597,0.725691
25%,-0.009686,147897.0,99.0,931008.0,54.15,15227640.0,0.174755,0.993143,0.992846
50%,0.00056,292050.0,99.0,1915997.0,99.69,28576260.0,0.244753,1.000447,1.000499
75%,0.010694,611873.0,21313.0,4342689.0,190.82001,57708360.0,0.348438,1.007786,1.008039
max,0.573321,16095380.0,2630000.0,364261200.0,7024.81982,3081156000.0,5.872012,2.310367,1.119866


In [47]:
val_df[scaled_numerical_features].describe() # scaled val_numerical using training data to scale

Unnamed: 0,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,market_cap_scaled,volatility_7_scaled,sector_weighted_avg_return_scaled,sector_simple_avg_return_scaled
count,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0
mean,0.501478,0.518711,37.412488,0.118112,0.559355,0.578131,0.141423,0.486975,0.573908
std,0.100059,2.369091,1263.513286,0.11803,0.30415,0.287836,0.101536,0.130445,0.10394
min,-0.786041,-21.649718,-0.072322,-0.022368,-0.578832,-0.432881,-0.112605,-2.01693,-0.413532
25%,0.43695,0.0,0.0,0.047382,0.347025,0.377294,0.076477,0.432055,0.509529
50%,0.502394,0.235911,0.0,0.08775,0.582218,0.590876,0.117488,0.483496,0.568364
75%,0.56611,0.969275,0.144597,0.150528,0.770155,0.773484,0.177915,0.544308,0.622976
max,2.332557,76.428571,147156.0,4.63069,1.953374,1.929882,2.159781,5.904366,1.183219


In [48]:
train_df[categorical_features].describe() # confirm unique counts is reasonable

Unnamed: 0,permno_id,company_name,primary_exchange,naics_sector
count,663301,663301,663301,663301
unique,539,580,3,24
top,93436,LIBERTY MEDIA CORP 3RD NEW,N,33
freq,1253,5012,437205,143871


In [49]:
val_df[categorical_features].describe() # confirm unique counts is reasonable

Unnamed: 0,permno_id,company_name,primary_exchange,naics_sector
count,202489,202489,202489,202489
unique,539,544,3,22
top,93436,LIBERTY MEDIA CORP 3RD NEW,N,33
freq,377,1508,130065,43710


In [50]:
train_df[day_features].describe() # confirm values between 0 & 1

Unnamed: 0,day_of_week_x,day_of_week_y,day_of_month_x,day_of_month_y,day_of_year_x,day_of_year_y,month_of_year_x,month_of_year_y,week_of_year_x,week_of_year_y
count,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0,663301.0
mean,0.563966,0.4249654,0.501495,0.494972,0.497206,0.494164,0.493274,0.49353,0.496948,0.485777
std,0.365689,0.406064,0.353271,0.353797,0.356257,0.350773,0.354213,0.35277,0.359014,0.34819
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.307979,5.5511150000000004e-17,0.137604,0.120621,0.138938,0.145687,0.066987,0.066987,0.130921,0.140492
50%,0.615957,0.3568959,0.5,0.474675,0.491417,0.493545,0.5,0.5,0.470367,0.455124
75%,0.862937,0.8019377,0.862396,0.844483,0.855832,0.839637,0.75,0.75,0.869079,0.837349
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
val_df[day_features].describe() # confirm values between 0 & 1

Unnamed: 0,day_of_week_x,day_of_week_y,day_of_month_x,day_of_month_y,day_of_year_x,day_of_year_y,month_of_year_x,month_of_year_y,week_of_year_x,week_of_year_y
count,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0,202489.0
mean,0.562869,0.4180007,0.501666,0.493275,0.390474,0.496986,0.390907,0.52426,0.388828,0.486356
std,0.368457,0.4041651,0.3526,0.354439,0.338789,0.351028,0.338075,0.351023,0.341323,0.348271
min,0.0,0.0,0.0,0.0,3.7e-05,1.9e-05,0.0,0.0,0.0,0.0
25%,0.307979,5.5511150000000004e-17,0.137604,0.120621,0.079687,0.145687,0.066987,0.25,0.077016,0.140492
50%,0.615957,0.3568959,0.5,0.474675,0.299254,0.493545,0.25,0.5,0.29835,0.455124
75%,0.862937,0.8019377,0.862396,0.844483,0.696797,0.845087,0.75,0.933013,0.70165,0.837349
max,1.0,1.0,1.0,1.0,0.999889,0.999926,1.0,1.0,1.0,0.996487


# Clean-up train_df and val_df & save for future use

In [52]:
cols_keep = ["date"] + categorical_features + scaled_numerical_features + day_features
train_df = train_df[cols_keep]
val_df = val_df[cols_keep]

display(len(train_df))
display(len(val_df))

display(train_df.columns)
display(val_df.columns)

display(train_df.head())
display(val_df.head())

663301

202489

Index(['date', 'permno_id', 'company_name', 'primary_exchange', 'naics_sector',
       'return_scaled', 'shares_outstanding_scaled', 'num_trades_scaled',
       'volume_scaled', 'close_price_scaled', 'market_cap_scaled',
       'volatility_7_scaled', 'sector_weighted_avg_return_scaled',
       'sector_simple_avg_return_scaled', 'day_of_week_x', 'day_of_week_y',
       'day_of_month_x', 'day_of_month_y', 'day_of_year_x', 'day_of_year_y',
       'month_of_year_x', 'month_of_year_y', 'week_of_year_x',
       'week_of_year_y'],
      dtype='object')

Index(['date', 'permno_id', 'company_name', 'primary_exchange', 'naics_sector',
       'return_scaled', 'shares_outstanding_scaled', 'num_trades_scaled',
       'volume_scaled', 'close_price_scaled', 'market_cap_scaled',
       'volatility_7_scaled', 'sector_weighted_avg_return_scaled',
       'sector_simple_avg_return_scaled', 'day_of_week_x', 'day_of_week_y',
       'day_of_month_x', 'day_of_month_y', 'day_of_year_x', 'day_of_year_y',
       'month_of_year_x', 'month_of_year_y', 'week_of_year_x',
       'week_of_year_y'],
      dtype='object')

Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,...,day_of_week_x,day_of_week_y,day_of_month_x,day_of_month_y,day_of_year_x,day_of_year_y,month_of_year_x,month_of_year_y,week_of_year_x,week_of_year_y
0,2018-01-10,10104,ORACLE CORP,N,51,0.33319,1.0,0.0,0.16283,0.140955,...,1.0,0.3568959,0.948902,0.279803,0.585647,0.99261,0.75,0.933013,0.617495,0.985999
1,2018-01-11,10104,ORACLE CORP,N,51,0.359834,1.0,0.0,0.135664,0.143305,...,0.615957,5.5511150000000004e-17,0.895388,0.193947,0.594114,0.991063,0.75,0.933013,0.617495,0.985999
2,2018-01-12,10104,ORACLE CORP,N,51,0.386453,1.0,0.0,0.200822,0.152075,...,0.0,0.0,0.825686,0.120621,0.602553,0.98937,0.75,0.933013,0.617495,0.985999
3,2018-01-16,10104,ORACLE CORP,N,51,0.355195,1.0,0.0,0.218657,0.153328,...,0.862937,0.8019377,0.449416,0.002565,0.63598,0.981155,0.75,0.933013,0.674177,0.968682
4,2018-01-17,10104,ORACLE CORP,N,51,0.393682,1.0,0.0,0.310242,0.163978,...,1.0,0.3568959,0.350318,0.02293,0.644243,0.978743,0.75,0.933013,0.674177,0.968682


Unnamed: 0,date,permno_id,company_name,primary_exchange,naics_sector,return_scaled,shares_outstanding_scaled,num_trades_scaled,volume_scaled,close_price_scaled,...,day_of_week_x,day_of_week_y,day_of_month_x,day_of_month_y,day_of_year_x,day_of_year_y,month_of_year_x,month_of_year_y,week_of_year_x,week_of_year_y
0,2022-07-01,10104,ORACLE CORP,N,51,0.395591,0.0,0.0,0.076704,0.486609,...,0.0,0.0,0.600649,0.989765,0.504304,1.9e-05,0.25,0.066987,0.529633,0.0
1,2022-07-05,10104,ORACLE CORP,N,51,0.400339,0.0,0.0,0.095497,0.50415,...,0.862937,0.8019377,0.924322,0.764482,0.469893,0.000907,0.25,0.066987,0.470367,0.0
2,2022-07-06,10104,ORACLE CORP,N,51,0.335468,0.0,0.0,0.049947,0.498982,...,1.0,0.3568959,0.968876,0.673653,0.461306,0.001499,0.25,0.066987,0.470367,0.0
3,2022-07-07,10104,ORACLE CORP,N,51,0.357601,0.0,0.0,0.076842,0.501644,...,0.615957,5.5511150000000004e-17,0.994234,0.575714,0.452731,0.002239,0.25,0.066987,0.470367,0.0
4,2022-07-08,10104,ORACLE CORP,N,51,0.351826,0.0,0.0,0.032993,0.502271,...,0.0,0.0,0.999358,0.474675,0.44417,0.003127,0.25,0.066987,0.470367,0.0


In [53]:
display(train_df.dtypes)
display(val_df.dtypes)

date                                 datetime64[ns]
permno_id                                    object
company_name                                 object
primary_exchange                             object
naics_sector                                 object
return_scaled                               float64
shares_outstanding_scaled                   float64
num_trades_scaled                           float64
volume_scaled                               float64
close_price_scaled                          float64
market_cap_scaled                           float64
volatility_7_scaled                         float64
sector_weighted_avg_return_scaled           float64
sector_simple_avg_return_scaled             float64
day_of_week_x                               float64
day_of_week_y                               float64
day_of_month_x                              float64
day_of_month_y                              float64
day_of_year_x                               float64
day_of_year_

date                                 datetime64[ns]
permno_id                                    object
company_name                                 object
primary_exchange                             object
naics_sector                                 object
return_scaled                               float64
shares_outstanding_scaled                   float64
num_trades_scaled                           float64
volume_scaled                               float64
close_price_scaled                          float64
market_cap_scaled                           float64
volatility_7_scaled                         float64
sector_weighted_avg_return_scaled           float64
sector_simple_avg_return_scaled             float64
day_of_week_x                               float64
day_of_week_y                               float64
day_of_month_x                              float64
day_of_month_y                              float64
day_of_year_x                               float64
day_of_year_

In [54]:
assert len(train_df[train_df.isnull().any(axis=1)]) == 0
assert len(val_df[val_df.isnull().any(axis=1)]) == 0

In [55]:
assert len(train_df[np.isinf(train_df[scaled_numerical_features + day_features]).any(axis=1)]) == 0
assert len(val_df[np.isinf(val_df[scaled_numerical_features + day_features]).any(axis=1)]) == 0

# Save the data for future use

In [56]:
train_df.to_parquet("/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_train.parquet")
val_df.to_parquet("/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_val.parquet")

In [57]:
s3.upload_file(
    "/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_train.parquet",
    bucket,
    "CRSP/crsp_rachel_train.parquet",
)

s3.upload_file(
    "/home/sagemaker-user/capstone-2024-summer/data/crsp_rachel_val.parquet",
    bucket,
    "CRSP/crsp_rachel_val.parquet",
)