In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [247]:
# Load the full stock dataset from the Excel workbook.
# The workbook is large, so this cell may take a few seconds to run.
df = pd.read_excel(io='stock_data_super_large.xlsx')

In [249]:
# Preview the first five rows (the dataset has around 4000 companies and 188 columns)
df.head(n=5)

Unnamed: 0,Identifier (RIC),Company Name,Industry,ESG_FY0,ESG_FY1,ESG_FY2,ESG_FY3,ESG_FY4,ESG_FY5,ESG_FY6,...,MCAP_FQ22,MCAP_FQ23,MCAP_FQ24,MCAP_FQ25,MCAP_FQ26,MCAP_FQ27,MCAP_FQ28,MCAP_FQ29,MCAP_FQ30,MCAP_FQ31
0,B,Barnes Group Inc,Machinery,47.280059,39.716448,39.364964,33.635145,24.153344,25.935442,18.761357,...,2758021000.0,2551881000.0,2183471000.0,1778151000.0,1880405000.0,1942959000.0,1971735000.0,2136121000.0,2218038000.0,2015078000.0
1,ITT.N,ITT Inc,Machinery,78.612516,70.72285,68.409017,59.747878,58.30117,54.710253,58.075415,...,3637016000.0,3401874000.0,3211264000.0,2881398000.0,3313802000.0,3250640000.0,2988642000.0,3732128000.0,3607767000.0,3706136000.0
2,GTLS.N,Chart Industries Inc,Machinery,56.790911,43.96815,41.893694,16.687256,21.707709,24.478215,,...,1073148000.0,1102080000.0,1004369000.0,738185900.0,664365700.0,548579300.0,586763300.0,1091904000.0,1071162000.0,1042462000.0
3,PKOH.OQ,Park Ohio Holdings Corp,Machinery,11.351879,12.015005,12.092304,12.603972,12.932632,2.580297,,...,451467100.0,527716200.0,457563000.0,347386500.0,538197000.0,465400100.0,364180200.0,605506300.0,658141800.0,788570800.0
4,ZWS.N,Zurn Elkay Water Solutions Corp,Building Products,61.015179,59.600678,42.460351,22.93073,21.765882,22.323763,,...,2371729000.0,2013093000.0,2199966000.0,1996711000.0,2049765000.0,1818622000.0,1703909000.0,2434572000.0,2714624000.0,2868755000.0


In [222]:
# fiscal year 7 has the least number of ESG observations
#df.loc[:,'ESG_FY0':'ESG_FY7'].count(axis=0)
# drop ESG_FY7 to have more non-NA data
#df.drop('ESG_FY7', axis=1, inplace=True)
# if we drop FY7, then we can get 18 more observations (417 vs ~300) after dropping NaNs
# (note: dropna(inplace=True) returns None, so assign the result of a plain dropna() instead)
#df_drop = df.dropna()

In [232]:
# Number of non-missing values in each column
df.count(axis=0)

Identifier (RIC)    4243
Company Name        4243
Industry            4243
ESG_FY0             1926
ESG_FY1             1837
                    ... 
MCAP_FY3            1669
MCAP_FY4            1636
MCAP_FY5            1590
MCAP_FY6            1561
MCAP_FY7            1531
Length: 188, dtype: int64

In [250]:
# Per-company averages over time. Each entry maps the new column name to the
# (first, last) labels of the column range that gets averaged row-wise:
#   avg_mcap  - market cap,            MCAP_FQ0  ~ MCAP_FQ31
#   avg_esg   - ESG score,             ESG_FY0   ~ ESG_FY6
#   avg_price - price,                 Price_CM0 ~ Price_CM135
#   avg_btm   - book-to-market ratio,  BTM_FQ0   ~ BTM_FQ31
avg_column_ranges = {
    'avg_mcap': ('MCAP_FQ0', 'MCAP_FQ31'),
    'avg_esg': ('ESG_FY0', 'ESG_FY6'),
    'avg_price': ('Price_CM0', 'Price_CM135'),
    'avg_btm': ('BTM_FQ0', 'BTM_FQ31'),
}
for avg_name, (first_label, last_label) in avg_column_ranges.items():
    # .loc label slices include both end labels and every column positioned between them
    df[avg_name] = df.loc[:, first_label:last_label].mean(axis=1)

In [251]:
def _split_at_median(frame, column):
    """Split ``frame`` into two independent frames around the median of ``column``.

    Returns a ``(below, at_or_above)`` tuple: rows whose value is strictly below
    the median, and rows whose value is greater than or equal to it. Rows where
    ``column`` is NaN satisfy neither comparison and therefore appear in neither
    frame. Both results are explicit copies, so adding columns to them later does
    not raise SettingWithCopyWarning and never touches ``frame``.
    """
    cutoff = frame[column].median()  # computed once and shared by both comparisons
    below = frame[frame[column] < cutoff].copy()
    at_or_above = frame[frame[column] >= cutoff].copy()
    return below, at_or_above


# produce two dataframes, one with average market cap lower than the median,
# one with average market cap at or above the median
df_mcap_low, df_mcap_high = _split_at_median(df, 'avg_mcap')

# produce two dataframes, one with average book-to-market ratio lower than the median,
# one with average book-to-market ratio at or above the median
df_BM_low, df_BM_high = _split_at_median(df, 'avg_btm')

# now for the average ESG score
df_esg_low, df_esg_high = _split_at_median(df, 'avg_esg')

In [252]:
# Sanity check: (rows, columns) of the below-median market-cap group
n_rows, n_cols = df_mcap_low.shape
print((n_rows, n_cols))

(2105, 219)


In [254]:
# Find the "return" of each company from the monthly price columns Price_CM0 ~ Price_CM135.
# The return is computed as Price_CM135 - Price_CM0 and stored in a group-specific column.
# NOTE(review): this is a raw price difference, not a percentage return, and its sign
# depends on whether CM0 is the earliest or the latest month -- TODO confirm.
# .assign() builds a new frame instead of writing into a filtered slice of df, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.

# for low market cap group:
df_mcap_low = df_mcap_low.assign(
    mcap_low_return=df_mcap_low['Price_CM135'] - df_mcap_low['Price_CM0']
)

# for high market cap group:
df_mcap_high = df_mcap_high.assign(
    mcap_high_return=df_mcap_high['Price_CM135'] - df_mcap_high['Price_CM0']
)

# for low book-to-market group:
df_BM_low = df_BM_low.assign(
    BM_low_return=df_BM_low['Price_CM135'] - df_BM_low['Price_CM0']
)

# for high book-to-market group:
df_BM_high = df_BM_high.assign(
    BM_high_return=df_BM_high['Price_CM135'] - df_BM_high['Price_CM0']
)

# for low ESG score group:
df_esg_low = df_esg_low.assign(
    esg_low_return=df_esg_low['Price_CM135'] - df_esg_low['Price_CM0']
)

# for high ESG score group:
df_esg_high = df_esg_high.assign(
    esg_high_return=df_esg_high['Price_CM135'] - df_esg_high['Price_CM0']
)


In [256]:
# calculate the returns
# For every group the return is calculated as Price_CM135 - Price_CM0.
# NOTE(review): this is a price difference rather than a percentage return -- TODO confirm
# that this is the intended definition.
def _price_change(frame):
    """Return ``Price_CM135 - Price_CM0`` for every row of ``frame``."""
    return frame['Price_CM135'] - frame['Price_CM0']


# DataFrame.assign evaluates a callable on the frame it is called on and returns a new
# frame, so no column is written into a filtered slice (no SettingWithCopyWarning) and
# the cell can be re-run safely.

# for market cap:
# for low market cap group:
df_mcap_low = df_mcap_low.assign(mcap_low_return=_price_change)
# for high market cap group:
df_mcap_high = df_mcap_high.assign(mcap_high_return=_price_change)

# for book-to-market:
# for low book-to-market group:
df_BM_low = df_BM_low.assign(BM_low_return=_price_change)
# for high book-to-market group:
df_BM_high = df_BM_high.assign(BM_high_return=_price_change)

# for ESG score:
# for low ESG score group:
df_esg_low = df_esg_low.assign(esg_low_return=_price_change)
# for high ESG score group:
df_esg_high = df_esg_high.assign(esg_high_return=_price_change)


In [None]:
# now calculate the return spread for each group
# for market cap:


In [203]:
# Rank the companies by market cap (rank 1 = largest) and find the median rank.
# NOTE(review): df_avg is not created in the cells shown here -- confirm it is defined
# earlier in the notebook so this cell survives Restart & Run All.
df_avg['Rank'] = df_avg['Mkt. Cap (M)'].rank(ascending=False)
median_rank = df_avg['Rank'].median()

# Split into two groups: a rank numerically above the median rank means a market cap
# below the median, and vice versa.
is_below_median_cap = df_avg['Rank'] > median_rank
is_above_median_cap = df_avg['Rank'] <= median_rank
df_avg_low = df_avg[is_below_median_cap]
df_avg_high = df_avg[is_above_median_cap]

# count the number of companies in each group
print('Number of companies with a market cap above the median: ', df_avg_high.shape[0])
print('Number of companies with a market cap below the median: ', df_avg_low.shape[0])


Number of companies with a market cap above the median:  5356
Number of companies with a market cap below the median:  5355
