In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [247]:
# Load the raw stock dataset from the Excel workbook into `df`.
# May take a few seconds to run since the dataset is large.
df = pd.read_excel('stock_data_super_large.xlsx')

In [249]:
# Preview the first rows. We have around 4000 companies in the dataset and 188 columns.
df.head()

Unnamed: 0,Identifier (RIC),Company Name,Industry,ESG_FY0,ESG_FY1,ESG_FY2,ESG_FY3,ESG_FY4,ESG_FY5,ESG_FY6,...,MCAP_FQ22,MCAP_FQ23,MCAP_FQ24,MCAP_FQ25,MCAP_FQ26,MCAP_FQ27,MCAP_FQ28,MCAP_FQ29,MCAP_FQ30,MCAP_FQ31
0,B,Barnes Group Inc,Machinery,47.280059,39.716448,39.364964,33.635145,24.153344,25.935442,18.761357,...,2758021000.0,2551881000.0,2183471000.0,1778151000.0,1880405000.0,1942959000.0,1971735000.0,2136121000.0,2218038000.0,2015078000.0
1,ITT.N,ITT Inc,Machinery,78.612516,70.72285,68.409017,59.747878,58.30117,54.710253,58.075415,...,3637016000.0,3401874000.0,3211264000.0,2881398000.0,3313802000.0,3250640000.0,2988642000.0,3732128000.0,3607767000.0,3706136000.0
2,GTLS.N,Chart Industries Inc,Machinery,56.790911,43.96815,41.893694,16.687256,21.707709,24.478215,,...,1073148000.0,1102080000.0,1004369000.0,738185900.0,664365700.0,548579300.0,586763300.0,1091904000.0,1071162000.0,1042462000.0
3,PKOH.OQ,Park Ohio Holdings Corp,Machinery,11.351879,12.015005,12.092304,12.603972,12.932632,2.580297,,...,451467100.0,527716200.0,457563000.0,347386500.0,538197000.0,465400100.0,364180200.0,605506300.0,658141800.0,788570800.0
4,ZWS.N,Zurn Elkay Water Solutions Corp,Building Products,61.015179,59.600678,42.460351,22.93073,21.765882,22.323763,,...,2371729000.0,2013093000.0,2199966000.0,1996711000.0,2049765000.0,1818622000.0,1703909000.0,2434572000.0,2714624000.0,2868755000.0


In [222]:
# fiscal year 7 has the least number of ESG observations
#df.loc[:,'ESG_FY0':'ESG_FY7'].count(axis=0)
# drop ESG_FY7 to have more non-NA data
#df.drop('ESG_FY7', axis=1, inplace=True)
# if we drop FY7, then we keep more observations (417 vs ~300, i.e. roughly 100+ more) after dropping NaNs
#df_drop = df.dropna(inplace=True)

In [232]:
# Number of non-null observations in every column
df.count()

Identifier (RIC)    4243
Company Name        4243
Industry            4243
ESG_FY0             1926
ESG_FY1             1837
                    ... 
MCAP_FY3            1669
MCAP_FY4            1636
MCAP_FY5            1590
MCAP_FY6            1561
MCAP_FY7            1531
Length: 188, dtype: int64

In [250]:
# Per-company (row-wise) averages over each block of period columns.
# Every entry maps the new column name to the (first, last) labels of the block;
# .loc label slices include both end labels.
_AVERAGE_SPANS = {
    'avg_mcap': ('MCAP_FQ0', 'MCAP_FQ31'),       # market cap, column labels MCAP_FQ0 ~ MCAP_FQ31
    'avg_esg': ('ESG_FY0', 'ESG_FY6'),           # ESG score (the slice stops at ESG_FY6)
    'avg_price': ('Price_CM0', 'Price_CM135'),   # price
    'avg_btm': ('BTM_FQ0', 'BTM_FQ31'),          # book to market ratio
}
for _avg_col, (_first_col, _last_col) in _AVERAGE_SPANS.items():
    df[_avg_col] = df.loc[:, _first_col:_last_col].mean(axis=1)

In [284]:
# Split the companies into a "low" and a "high" group at the median of each
# per-company average. Rows whose average is NaN compare False on both sides
# and therefore end up in neither group.
# Every group is an explicit .copy(): columns are added to these frames later,
# and writing into a boolean-mask slice of `df` raises SettingWithCopyWarning.

# market cap: lower than the median vs. at or above the median
mcap_median = df['avg_mcap'].median()
df_mcap_low = df[df['avg_mcap'] < mcap_median].copy()
df_mcap_high = df[df['avg_mcap'] >= mcap_median].copy()

# avg_btm (average of the BTM_FQ* columns, labelled "book to market ratio"
# where it is computed): lower than the median vs. at or above the median
btm_median = df['avg_btm'].median()
df_BM_low = df[df['avg_btm'] < btm_median].copy()
df_BM_high = df[df['avg_btm'] >= btm_median].copy()

# ESG score: lower than the median vs. at or above the median
esg_median = df['avg_esg'].median()
df_esg_low = df[df['avg_esg'] < esg_median].copy()
df_esg_high = df[df['avg_esg'] >= esg_median].copy()

In [252]:
# check the shape (rows, columns) of the low-market-cap dataframe
print(df_mcap_low.shape)

(2105, 219)


In [None]:
# find the period-over-period return of each company
# takes one dataframe, two integers, and three strings
def get_return(df, start, end, high_low, feature='Price', freq='CM'):
    '''
    Append simple period-over-period returns of `feature` to a copy of `df`.

    For every period i in start+1 .. end (inclusive) the column
    "<feature>_Return_<freq><i>_<high_low>" is computed as
    (value[i] - value[i-1]) / value[i-1], where value[k] is the column
    "<feature>_<freq><k>".

    @param
    df: dataframe holding the "<feature>_<freq><k>" columns for k in start .. end
    start: start period (no return is produced for it, it only serves as the base)
    end: end period (inclusive)
    high_low: 'high' or 'low'; used only as a suffix of the new column names
    feature: column name prefix, default 'Price'
    freq: 'CM' or 'FY' or 'FQ', default 'CM'
    @return
    a new dataframe: the columns of df followed by one return column per period.
    The input dataframe is left untouched.
    @raise
    KeyError if one of the required "<feature>_<freq><k>" columns is missing
    '''
    level_prefix = f"{feature}_{freq}"
    returns = {}
    for i in range(start + 1, end + 1):
        previous = df[f"{level_prefix}{i - 1}"]
        current = df[f"{level_prefix}{i}"]
        returns[f"{feature}_Return_{freq}{i}_{high_low}"] = (current - previous) / previous

    if not returns:
        # empty period range: nothing to add, still hand back an independent frame
        return df.copy()

    # If the function is re-applied to its own output, replace the old return
    # columns instead of producing duplicated column labels.
    base = df.drop(columns=[name for name in returns if name in df.columns])
    # Attach all new columns in one concat: inserting them one at a time
    # fragments the frame and writes into the caller's (possibly sliced) data.
    return pd.concat([base, pd.concat(returns, axis=1)], axis=1)

In [291]:
# Monthly price returns (periods CM1..CM135, based on Price_CM0..Price_CM135)
# for the low and the high group of every split.

# Market cap groups:
df_mcap_low_return = get_return(
    df=df_mcap_low, start=0, end=135, high_low='low', feature='Price', freq='CM')
df_mcap_high_return = get_return(
    df=df_mcap_high, start=0, end=135, high_low='high', feature='Price', freq='CM')

# ESG score groups:
df_esg_low_return = get_return(
    df=df_esg_low, start=0, end=135, high_low='low', feature='Price', freq='CM')
df_esg_high_return = get_return(
    df=df_esg_high, start=0, end=135, high_low='high', feature='Price', freq='CM')

# Book-to-market groups (split on avg_btm):
df_BM_low_return = get_return(
    df=df_BM_low, start=0, end=135, high_low='low', feature='Price', freq='CM')
df_BM_high_return = get_return(
    df=df_BM_high, start=0, end=135, high_low='high', feature='Price', freq='CM')

In [307]:
# function to only include the last n_periods columns (135 by default, the return columns)
# and the second and third columns (company name and industry)
def get_price_data(df, n_periods=135):
    '''
    Select columns by position and drop incomplete rows.

    @param
    df: dataframe whose last n_periods columns are the return columns and whose
        columns at positions 1 and 2 are the company name and the industry
    n_periods: number of trailing columns to keep, default 135
    @return
    a dataframe with only the columns at positions 1 and 2 plus the last
    n_periods columns, with every row containing a NaN dropped
    @raise
    ValueError if n_periods is negative
    IndexError if df has fewer columns than the selection requires
    '''
    if n_periods < 0:
        raise ValueError("n_periods must be non-negative")
    label_positions = [1, 2]
    # negative positions count from the right edge of the frame
    trailing_positions = list(range(-n_periods, 0))
    return df.iloc[:, label_positions + trailing_positions].dropna()

In [308]:
# Reduce every return frame to (company name, industry, return columns) and
# drop incomplete rows; the low and high frame of each split are handled as a pair.

# market cap
df_mcap_low_dataset, df_mcap_high_dataset = map(
    get_price_data, (df_mcap_low_return, df_mcap_high_return))

# ESG score
df_esg_low_dataset, df_esg_high_dataset = map(
    get_price_data, (df_esg_low_return, df_esg_high_return))

# book-to-market (avg_btm split)
df_BM_low_dataset, df_BM_high_dataset = map(
    get_price_data, (df_BM_low_return, df_BM_high_return))


In [309]:
# Inspect the low-market-cap return dataset (the notebook display truncates rows and columns)
df_mcap_low_dataset

Unnamed: 0,Company Name,Industry,Price_Return_CM1_low,Price_Return_CM2_low,Price_Return_CM3_low,Price_Return_CM4_low,Price_Return_CM5_low,Price_Return_CM6_low,Price_Return_CM7_low,Price_Return_CM8_low,...,Price_Return_CM126_low,Price_Return_CM127_low,Price_Return_CM128_low,Price_Return_CM129_low,Price_Return_CM130_low,Price_Return_CM131_low,Price_Return_CM132_low,Price_Return_CM133_low,Price_Return_CM134_low,Price_Return_CM135_low
7,Eastern Co,Machinery,-0.170253,0.154467,0.028457,-0.012621,0.044739,0.087059,0.008225,0.029197,...,0.400924,-0.175258,-0.040000,0.049479,-0.002481,-0.026368,0.037302,-0.100985,0.069041,-0.080472
9,Electro-Sensors Inc,"Electronic Equipment, Instruments & Components",0.065263,0.148221,0.039587,0.000000,-0.200331,0.012422,0.022495,0.030000,...,0.026506,-0.058685,-0.024938,0.007673,-0.015228,0.064433,-0.113801,-0.021858,0.189944,0.035211
15,Genasys Inc,Communications Equipment,-0.028070,0.054152,0.147260,-0.029851,0.183077,-0.183355,-0.124204,0.178182,...,0.038168,0.132353,-0.077922,0.000000,0.049296,0.194631,0.084270,-0.046632,0.222826,0.280000
50,Fonar Corp,Health Care Equipment & Supplies,-0.121118,-0.003534,0.028369,0.047586,0.061224,0.107320,0.039216,-0.068464,...,0.349057,-0.531469,-0.253731,-0.155000,-0.023669,0.072727,-0.028249,0.110465,-0.005236,0.042105
54,Frequency Electronics Inc,"Electronic Equipment, Instruments & Components",0.116505,0.128696,0.016949,0.106061,0.031507,0.077025,0.072750,0.049425,...,-0.024235,0.081046,-0.021765,0.051916,-0.099882,-0.014360,0.095364,0.031439,0.218054,0.007700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633,Nano Mobile Healthcare Inc,Health Care Technology,-0.142857,-0.333333,0.000000,0.000000,0.062500,-0.058824,0.250000,-0.200000,...,0.272727,0.000000,0.250000,-0.142857,-0.100000,2.444444,0.827957,-0.117647,-0.066667,1.285714
2634,Firsthand Technology Value Fund Inc,Capital Markets,0.015873,0.140625,0.116438,0.122699,0.601093,0.058020,0.119355,-0.005764,...,0.374673,0.503236,-0.368954,-0.166934,-0.299133,-0.015808,0.092179,-0.047954,0.044997,0.004499
2636,SandRidge Mississippian Trust I,"Oil, Gas & Consumable Fuels",0.050000,0.047619,0.022727,0.044444,0.106383,0.000000,0.053846,0.021898,...,0.196317,-0.082820,0.072843,0.062578,-0.085689,-0.130113,0.007775,-0.173769,0.158737,0.096316
2640,VOC Energy Trust,"Oil, Gas & Consumable Fuels",-0.176399,0.070901,0.044138,-0.142668,0.198767,-0.064267,-0.060440,0.014620,...,0.179769,-0.017770,-0.018544,-0.023041,0.046226,-0.064923,0.073770,-0.064661,0.072492,0.011638


In [381]:
# average the returns of a group, one value per CM period
def get_return_avg(df, high_low):
    '''
    Column-wise mean of the return columns of one group.

    @param
    df: dataframe containing the columns 'Price_Return_CM1_<high_low>'
        through 'Price_Return_CM135_<high_low>'
    high_low: 'high' or 'low'
    @return
    a one-row dataframe holding the average return of the group for each CM period
    '''
    first_col = f'Price_Return_CM1_{high_low}'
    last_col = f'Price_Return_CM135_{high_low}'
    # label slice: both end columns are included
    period_means = df.loc[:, first_col:last_col].mean(axis=0)
    as_column = period_means.to_frame()
    return as_column.T

def get_return_spread(df_low, df_high, prefix='SMB', n_periods=135):
    '''
    Low-minus-high difference of two frames, column position by column position.

    @param
    df_low: dataframe of the low group (e.g. one row of average returns per period)
    df_high: dataframe of the high group, same layout as df_low
    prefix: label prefix of the output columns, default 'SMB'
    n_periods: number of leading columns (periods) to compare, default 135
    @return
    a dataframe with the columns '<prefix>_1' .. '<prefix>_<n_periods>', where
    column k is (k-th column of df_low) - (k-th column of df_high); rows are
    matched on the index labels of the two frames. Empty if n_periods is 0.
    @raise
    IndexError if either frame has fewer than n_periods columns
    '''
    # Columns are paired by position, not by name, because the low and high
    # frames carry different column suffixes.
    spreads = {
        f'{prefix}_{period + 1}': df_low.iloc[:, period] - df_high.iloc[:, period]
        for period in range(n_periods)
    }
    if not spreads:
        return pd.DataFrame()
    # one concat instead of inserting the columns one at a time
    return pd.concat(spreads, axis=1)
    


In [385]:
# calculate the difference in average returns between each low and high group
# Every spread is (low-group average) - (high-group average) per CM period and is
# transposed with .T at the end of the line.
# NOTE(review): get_return_spread is called without a label argument, so the
# periods of all three spreads are labelled 'SMB_<n>', including the ESG and
# book-to-market ones.
# market cap
df_mcap_low_return_avg = get_return_avg(df_mcap_low_dataset, 'low')
df_mcap_high_return_avg = get_return_avg(df_mcap_high_dataset, 'high')
df_SMB_spread_mcap = get_return_spread(df_mcap_low_return_avg, df_mcap_high_return_avg).T

# ESG score
df_esg_low_return_avg = get_return_avg(df_esg_low_dataset, 'low')
df_esg_high_return_avg = get_return_avg(df_esg_high_dataset, 'high')
df_ESG_spread_esg = get_return_spread(df_esg_low_return_avg, df_esg_high_return_avg).T

# Book-to-market groups (split on avg_btm)
# NOTE(review): this is low minus high. If BTM really is book-to-market, the
# conventional HML factor is high minus low, i.e. the opposite sign. If it is
# price-to-book, low minus high is the conventional direction. TODO confirm.
df_BM_low_return_avg = get_return_avg(df_BM_low_dataset, 'low')
df_BM_high_return_avg = get_return_avg(df_BM_high_dataset, 'high')
df_HML_spread_BM = get_return_spread(df_BM_low_return_avg, df_BM_high_return_avg).T

In [378]:
# Display the whole book-to-market spread frame
df_HML_spread_BM

In [383]:
# First rows of the book-to-market spread frame.
# NOTE(review): the stored output below has one row and SMB_* columns, so it was
# presumably produced before the .T was added to the spread cell. Re-run to refresh.
df_HML_spread_BM.head()

Unnamed: 0,SMB_1,SMB_2,SMB_3,SMB_4,SMB_5,SMB_6,SMB_7,SMB_8,SMB_9,SMB_10,...,SMB_126,SMB_127,SMB_128,SMB_129,SMB_130,SMB_131,SMB_132,SMB_133,SMB_134,SMB_135
0,-0.254069,-0.261579,-0.475841,0.002976,0.028968,-0.043245,-0.2868,0.642968,-0.005768,-0.096862,...,0.040744,-0.019691,0.007909,0.009587,0.005856,-0.032412,-0.034395,-0.032997,0.008017,-0.011117


In [353]:
# now calculate the return spread for each group
# for market cap:
# return spread is calculated as (formula left unfinished in the original note;
# get_return_spread computes low-group minus high-group average return,
# presumably the same definition is meant here. TODO confirm)
# NOTE(review): `df_mcap_return_spread` is not assigned in any cell visible in this
# notebook, so this cell depends on leftover kernel state. The stored output is all NaN.
df_mcap_return_spread

Unnamed: 0,Price_Return_CM100_high,Price_Return_CM100_low,Price_Return_CM101_high,Price_Return_CM101_low,Price_Return_CM102_high,Price_Return_CM102_low,Price_Return_CM103_high,Price_Return_CM103_low,Price_Return_CM104_high,Price_Return_CM104_low,...,Price_Return_CM96_high,Price_Return_CM96_low,Price_Return_CM97_high,Price_Return_CM97_low,Price_Return_CM98_high,Price_Return_CM98_low,Price_Return_CM99_high,Price_Return_CM99_low,Price_Return_CM9_high,Price_Return_CM9_low
0,,,,,,,,,,,...,,,,,,,,,,


In [203]:
# Rank the companies by market cap (ascending=False, so rank 1 is the largest)
# and find the median rank.
# NOTE(review): `df_avg` is not created in any cell visible in this notebook.
df_avg['Rank'] = df_avg['Mkt. Cap (M)'].rank(ascending=False)
rank_median = df_avg['Rank'].median()

# Split the data into two groups around the median rank: companies ranked at or
# better than the median have a market cap above the median, the rest are below it.
is_above_median_cap = df_avg['Rank'] <= rank_median
is_below_median_cap = df_avg['Rank'] > rank_median
df_avg_low = df_avg[is_below_median_cap]
df_avg_high = df_avg[is_above_median_cap]

# count the number of companies in each group
print('Number of companies with a market cap above the median: ', len(df_avg_high))
print('Number of companies with a market cap below the median: ', len(df_avg_low))


Number of companies with a market cap above the median:  5356
Number of companies with a market cap below the median:  5355
