In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [410]:
# Load the full stock workbook; the file is large, so this can take a few seconds.
STOCK_DATA_FILE = 'stock_data_super_large.xlsx'
df = pd.read_excel(STOCK_DATA_FILE)

In [411]:
# Preview the first five rows to check the load and the column layout.
# NOTE(review): this cell used to state "around 4000 companies, 188 columns";
# a later cell prints 219 columns for a split made after four average columns
# are added, so the 188 figure looks stale. Confirm with df.shape.
df.head()

Unnamed: 0,Identifier (RIC),Company Name,Industry,ESG_FY0,ESG_FY1,ESG_FY2,ESG_FY3,ESG_FY4,ESG_FY5,ESG_FY6,...,MCAP_FQ22,MCAP_FQ23,MCAP_FQ24,MCAP_FQ25,MCAP_FQ26,MCAP_FQ27,MCAP_FQ28,MCAP_FQ29,MCAP_FQ30,MCAP_FQ31
0,B,Barnes Group Inc,Machinery,47.280059,39.716448,39.364964,33.635145,24.153344,25.935442,18.761357,...,2758021000.0,2551881000.0,2183471000.0,1778151000.0,1880405000.0,1942959000.0,1971735000.0,2136121000.0,2218038000.0,2015078000.0
1,ITT.N,ITT Inc,Machinery,78.612516,70.72285,68.409017,59.747878,58.30117,54.710253,58.075415,...,3637016000.0,3401874000.0,3211264000.0,2881398000.0,3313802000.0,3250640000.0,2988642000.0,3732128000.0,3607767000.0,3706136000.0
2,GTLS.N,Chart Industries Inc,Machinery,56.790911,43.96815,41.893694,16.687256,21.707709,24.478215,,...,1073148000.0,1102080000.0,1004369000.0,738185900.0,664365700.0,548579300.0,586763300.0,1091904000.0,1071162000.0,1042462000.0
3,PKOH.OQ,Park Ohio Holdings Corp,Machinery,11.351879,12.015005,12.092304,12.603972,12.932632,2.580297,,...,451467100.0,527716200.0,457563000.0,347386500.0,538197000.0,465400100.0,364180200.0,605506300.0,658141800.0,788570800.0
4,ZWS.N,Zurn Elkay Water Solutions Corp,Building Products,61.015179,59.600678,42.460351,22.93073,21.765882,22.323763,,...,2371729000.0,2013093000.0,2199966000.0,1996711000.0,2049765000.0,1818622000.0,1703909000.0,2434572000.0,2714624000.0,2868755000.0


In [412]:
# Per-company averages over each family of period columns.
# Every entry maps the new column name to the (first, last) labels of an
# inclusive, label-based column slice, so the period columns of a family must
# sit next to each other in df.
_AVERAGE_SPANS = {
    'avg_mcap': ('MCAP_FQ0', 'MCAP_FQ31'),     # market cap
    'avg_esg': ('ESG_FY0', 'ESG_FY6'),         # ESG score
    'avg_price': ('Price_CM0', 'Price_CM96'),  # price
    'avg_btm': ('BTM_FQ0', 'BTM_FQ31'),        # book to market ratio
}
for _avg_name, (_first_col, _last_col) in _AVERAGE_SPANS.items():
    # row-wise mean across the selected period columns
    df[_avg_name] = df.loc[:, _first_col:_last_col].mean(axis=1)

In [413]:
def _split_at_median(frame, column):
    """Split `frame` into (below-median, at-or-above-median) rows on `column`.

    The median is computed once and reused for both halves. Rows whose value
    in `column` is NaN fail both comparisons and therefore end up in neither
    half. Each half is returned as an explicit copy so that columns can be
    added to it later without pandas raising SettingWithCopyWarning.
    """
    threshold = frame[column].median()
    below = frame[frame[column] < threshold].copy()
    at_or_above = frame[frame[column] >= threshold].copy()
    return below, at_or_above


# market cap: below-median vs. at-or-above-median average market cap
df_mcap_low, df_mcap_high = _split_at_median(df, 'avg_mcap')

# avg_btm: below-median vs. at-or-above-median
# NOTE(review): this split was previously described as "Price / Book Value Per
# Share", but avg_btm is averaged from the BTM_* columns and is labelled
# "book to market ratio" where it is computed. Confirm which ratio the data holds.
df_BM_low, df_BM_high = _split_at_median(df, 'avg_btm')

# ESG score: below-median vs. at-or-above-median average ESG score
df_esg_low, df_esg_high = _split_at_median(df, 'avg_esg')

In [414]:
# Sanity check: print (rows, columns) of the below-median market-cap group
print(df_mcap_low.shape)

(2105, 219)


In [415]:
# compute each company's period-over-period return for every period from start+1 to end
# takes one dataframe, two integers (start, end) and three strings (high_low, feature, freq)
def get_return(df, start, end, high_low, feature='Price', freq='CM'):
    '''
    Compute period-over-period simple returns and return them appended to a
    copy of the input frame.

    For every i in start+1 .. end (inclusive) the return is
    (value_i - value_{i-1}) / value_{i-1}, read from the columns
    "<feature>_<freq><i>" and "<feature>_<freq><i-1>", and stored in a column
    named "<feature>_Return_<freq><i>_<high_low>".

    @param
    df: dataframe holding the "<feature>_<freq><k>" columns; it is NOT modified
    start: first period index (used as the base of the first return)
    end: last period index (inclusive)
    high_low: 'high' or 'low', used only as a suffix of the new column names
    feature: column prefix, default 'Price'
    freq: 'CM' or 'FY' or 'FQ', default 'CM'
    @return
    a new dataframe: the input columns followed by the return columns in
    period order (when end <= start this is just a copy of the input)
    @raise
    KeyError if one of the required "<feature>_<freq><k>" columns is missing
    '''
    source_prefix = feature + "_" + freq
    return_prefix = feature + "_" + 'Return_' + freq

    # Collect every return column first and attach them in one concat:
    # inserting ~100 columns one at a time fragments the frame, and writing
    # into the caller's (usually sliced) frame triggers SettingWithCopyWarning.
    returns = {}
    for i in range(start + 1, end + 1):
        previous = df[source_prefix + str(i - 1)]
        current = df[source_prefix + str(i)]
        returns[return_prefix + str(i) + "_" + high_low] = (current - previous) / previous

    if not returns:
        return df.copy()

    returns_frame = pd.concat(returns, axis=1)
    # If the frame already carries return columns with these names (e.g. the
    # cell is re-run on a previous result), replace them instead of duplicating.
    stale = [name for name in returns_frame.columns if name in df.columns]
    return pd.concat([df.drop(columns=stale), returns_frame], axis=1)

In [416]:
def _price_returns(group, label):
    """Run get_return on one group for Price columns CM0..CM96, suffixing new columns with `label`."""
    return get_return(group, 0, 96, label, 'Price', 'CM')


# Market cap groups
df_mcap_low_return = _price_returns(df_mcap_low, 'low')
df_mcap_high_return = _price_returns(df_mcap_high, 'high')

# ESG score groups
df_esg_low_return = _price_returns(df_esg_low, 'low')
df_esg_high_return = _price_returns(df_esg_high, 'high')

# avg_btm groups
df_BM_low_return = _price_returns(df_BM_low, 'low')
df_BM_high_return = _price_returns(df_BM_high, 'high')

In [417]:
# function that keeps only the second and third columns (company name and industry) plus the last 96 columns (the return columns appended by get_return), dropping rows that contain NaNs
def get_price_data(df):
    '''
    Reduce a frame to its identifying columns and its trailing 96 columns.

    Selection is purely positional: columns at positions 1 and 2, followed by
    the last 96 columns in their existing order. Any row containing a NaN in
    the selected columns is then removed.

    @param
    df: dataframe
    @return
    a new dataframe with the 2 + 96 selected columns and no NaN rows
    '''
    id_positions = [1, 2]
    trailing_positions = list(range(-96, 0))
    selected = df.iloc[:, id_positions + trailing_positions]
    return selected.dropna()

In [418]:
# Reduce every group to company name, industry and its return columns.
# Each pair is unpacked as (low, high).

# market cap
df_mcap_low_dataset, df_mcap_high_dataset = map(
    get_price_data, (df_mcap_low_return, df_mcap_high_return)
)

# ESG score
df_esg_low_dataset, df_esg_high_dataset = map(
    get_price_data, (df_esg_low_return, df_esg_high_return)
)

# avg_btm
df_BM_low_dataset, df_BM_high_dataset = map(
    get_price_data, (df_BM_low_return, df_BM_high_return)
)


In [419]:
# Preview the below-median market-cap dataset: company name, industry and the return columns
df_mcap_low_dataset.head()

Unnamed: 0,Company Name,Industry,Price_Return_CM1_low,Price_Return_CM2_low,Price_Return_CM3_low,Price_Return_CM4_low,Price_Return_CM5_low,Price_Return_CM6_low,Price_Return_CM7_low,Price_Return_CM8_low,...,Price_Return_CM87_low,Price_Return_CM88_low,Price_Return_CM89_low,Price_Return_CM90_low,Price_Return_CM91_low,Price_Return_CM92_low,Price_Return_CM93_low,Price_Return_CM94_low,Price_Return_CM95_low,Price_Return_CM96_low
7,Eastern Co,Machinery,-0.170253,0.154467,0.028457,-0.012621,0.044739,0.087059,0.008225,0.029197,...,0.055283,0.065593,0.044816,0.031525,0.00501,-0.019442,-0.038129,-0.093552,-0.019242,-0.029727
9,Electro-Sensors Inc,"Electronic Equipment, Instruments & Components",0.065263,0.148221,0.039587,0.0,-0.200331,0.012422,0.022495,0.03,...,0.0,0.059126,0.043689,0.027907,-0.081448,0.009852,-0.002439,-0.02445,-0.010025,-0.048101
15,Genasys Inc,Communications Equipment,-0.02807,0.054152,0.14726,-0.029851,0.183077,-0.183355,-0.124204,0.178182,...,-0.085561,0.163743,0.18593,0.050847,-0.052419,0.12766,0.128302,-0.09699,0.018519,-0.003636
50,Fonar Corp,Health Care Equipment & Supplies,-0.121118,-0.003534,0.028369,0.047586,0.061224,0.10732,0.039216,-0.068464,...,0.01782,0.089083,0.023168,0.046211,0.158127,0.015256,-0.237415,0.024631,0.139423,0.064979
54,Frequency Electronics Inc,"Electronic Equipment, Instruments & Components",0.116505,0.128696,0.016949,0.106061,0.031507,0.077025,0.07275,0.049425,...,-0.003575,0.011659,0.143617,-0.014729,0.097561,-0.122581,-0.068219,0.002192,-0.068241,0.065728


In [420]:
# this function finds the average return across companies for each CM period within one group
def get_return_avg(df, high_low):
    '''
    Average every return column of one group over its rows.

    @param
    df: dataframe containing the columns 'Price_Return_CM1_<high_low>' through
        'Price_Return_CM96_<high_low>' (label slice, both ends inclusive)
    high_low: 'high' or 'low', the suffix of the return columns to select
    @return
    a one-row dataframe (index 0) whose columns are the selected return
    columns and whose values are their column means
    '''
    first_col = 'Price_Return_CM1_' + high_low
    last_col = 'Price_Return_CM96_' + high_low
    column_means = df.loc[:, first_col:last_col].mean(axis=0)
    # Series -> single-column frame -> transpose, giving one row of averages
    return column_means.to_frame().T

def get_return_spread(df_low, df_high):
    '''
    Column-by-column difference between two frames of average returns.

    @param
    df_low: dataframe of the low group (at least 96 columns)
    df_high: dataframe of the high group (at least 96 columns)
    @return
    a dataframe with columns 'CM1' .. 'CM96', where column 'CM<k>' is the
    k-th column (by position) of df_low minus the k-th column of df_high,
    i.e. low minus high
    '''
    # Columns are paired by position, not by name, because the two inputs
    # carry different suffixes in their column labels.
    spread_by_period = {
        'CM' + str(position + 1): df_low.iloc[:, position] - df_high.iloc[:, position]
        for position in range(96)
    }
    return pd.DataFrame(spread_by_period)
    


In [421]:
def _factor_column(low_avg, high_avg, factor):
    """Low-minus-high average-return spread as a single column named `factor`, one row per CM period."""
    return get_return_spread(low_avg, high_avg).T.rename(columns={0: factor})


# For each sort: average the returns within the low and the high group, then
# take the low-minus-high difference per period.

# market cap -> SMB
df_mcap_low_return_avg = get_return_avg(df_mcap_low_dataset, 'low')
df_mcap_high_return_avg = get_return_avg(df_mcap_high_dataset, 'high')
df_SMB_spread_mcap = _factor_column(df_mcap_low_return_avg, df_mcap_high_return_avg, 'SMB')

# ESG score -> ESG
df_esg_low_return_avg = get_return_avg(df_esg_low_dataset, 'low')
df_esg_high_return_avg = get_return_avg(df_esg_high_dataset, 'high')
df_ESG_spread_esg = _factor_column(df_esg_low_return_avg, df_esg_high_return_avg, 'ESG')

# avg_btm -> HML
# NOTE(review): this column is also low minus high; if the sort variable is
# book-to-market, the conventional HML is high minus low. Confirm the intended sign.
df_BM_low_return_avg = get_return_avg(df_BM_low_dataset, 'low')
df_BM_high_return_avg = get_return_avg(df_BM_high_dataset, 'high')
df_HML_spread_BM = _factor_column(df_BM_low_return_avg, df_BM_high_return_avg, 'HML')

In [422]:
# Put the SMB, ESG and HML spread columns side by side in one dataframe
factor_spread_frames = [df_SMB_spread_mcap, df_ESG_spread_esg, df_HML_spread_BM]
df_spread = pd.concat(factor_spread_frames, axis='columns')

In [433]:
# Display the combined spreads: one row per CM period, columns SMB / ESG / HML
df_spread

Unnamed: 0,SMB,ESG,HML
CM1,0.341720,0.050614,-0.201218
CM2,0.580534,0.071926,-0.220284
CM3,1.221287,-0.004384,-0.841069
CM4,0.418568,0.027457,-0.170619
CM5,-0.001379,0.014261,0.031242
...,...,...,...
CM92,0.105564,0.007348,-0.022740
CM93,0.080234,0.012717,-0.009245
CM94,-0.016720,-0.015733,0.032470
CM95,0.121624,0.006406,-0.012250


In [432]:
# Load the risk-free rate workbook and the 'SPY_data' sheet of the SPY workbook
RISK_FREE_FILE = '10_year_yield_US.xls'
MARKET_FILE, MARKET_SHEET = 'SPY_data_final.xls', 'SPY_data'
df_risk_free = pd.read_excel(RISK_FREE_FILE)
df_mkt_return = pd.read_excel(MARKET_FILE, sheet_name=MARKET_SHEET)

In [425]:
# Rank the companies by market cap (rank 1 = largest) and split at the median rank.
# NOTE(review): df_avg and its 'Mkt. Cap (M)' column are not created by any cell
# visible above, so this cell depends on leftover kernel state. Confirm its origin.
df_avg['Rank'] = df_avg['Mkt. Cap (M)'].rank(ascending=False)
median_rank = df_avg['Rank'].median()
ranked_below_median_cap = df_avg['Rank'] > median_rank    # larger rank number = smaller market cap
ranked_above_median_cap = df_avg['Rank'] <= median_rank
df_avg_low = df_avg[ranked_below_median_cap]
df_avg_high = df_avg[ranked_above_median_cap]

# report the size of each group
print('Number of companies with a market cap above the median: ', len(df_avg_high))
print('Number of companies with a market cap below the median: ', len(df_avg_low))


Number of companies with a market cap above the median:  5356
Number of companies with a market cap below the median:  5355


In [None]:
# A palindrome reads the same from left to right as from right to left, "mom" for example.
# Change exactly one character of the string to another character in the range ascii[a-z] so that the string meets the following three conditions:
# 1. The new string is lower alphabetically than the initial string
# 2. The new string is the lowest value string that can be created from the original palindrome
# 3. The new string is not a palindrome
def breakPalindrome(palindromeStr):
    '''
    Break a palindrome by changing exactly one character.

    @param
    palindromeStr: a palindrome made of lowercase letters a-z
    @return
    the string obtained by changing exactly one character so that
    1. the new string is lower alphabetically than the initial string
    2. the new string is the lowest value string that can be created this way
    3. the new string is not a palindrome
    or 'IMPOSSIBLE' when no single change satisfies all three conditions
    (empty string, single character, or every character outside the exact
    middle is already 'a').
    '''
    length = len(palindromeStr)
    # Only the first half needs scanning. Lowering the leftmost non-'a'
    # character to 'a' gives the smallest possible result, and its mirror
    # position keeps the old (non-'a') character, so the result cannot be a
    # palindrome. The middle character of an odd-length string is skipped:
    # changing it always leaves a palindrome.
    for i in range(length // 2):
        if palindromeStr[i] != 'a':
            return palindromeStr[:i] + 'a' + palindromeStr[i + 1:]
    # Every changeable position already holds 'a', so nothing can be lowered.
    return 'IMPOSSIBLE'


In [None]:
def breakPalindrome(palindromeStr):
    """Brute-force search for the smallest non-palindrome reachable by lowering one character.

    Every position is tried with every letter from 'a' up to (but excluding)
    the current character; among the resulting strings that are not
    palindromes the lexicographically smallest is returned, or 'IMPOSSIBLE'
    when there is none.
    """
    candidates = []
    for position, current in enumerate(palindromeStr):
        # range is empty when the character is already 'a'
        for code in range(ord('a'), ord(current)):
            attempt = palindromeStr[:position] + chr(code) + palindromeStr[position + 1:]
            if attempt != attempt[::-1]:
                candidates.append(attempt)
    return min(candidates) if candidates else 'IMPOSSIBLE'

In [None]:
def shortestPalindrome(s):
    """Return the minimum number of character insertions that turn `s` into a palindrome.

    Despite the name, the result is a count (int), not the palindrome itself.
    An empty string needs no insertions and returns 0.

    Interval DP: num_insertion[l][h] is the answer for the substring s[l..h].
    Substrings are processed by increasing length (gap = h - l), so the
    shorter intervals each step relies on are already filled in.
    """
    n = len(s)
    if n == 0:
        # the table would be empty and the final lookup would raise IndexError
        return 0

    num_insertion = [[0] * n for _ in range(n)]
    for gap in range(1, n):
        for l in range(n - gap):
            h = l + gap
            if s[l] == s[h]:
                # matching ends cost nothing; for gap == 1 this reads the
                # (zero) cell below the diagonal, i.e. the empty interior
                num_insertion[l][h] = num_insertion[l + 1][h - 1]
            else:
                # mirror one end with an insertion, then solve the remainder
                num_insertion[l][h] = 1 + min(num_insertion[l][h - 1], num_insertion[l + 1][h])
    return num_insertion[0][n - 1]

In [435]:
# Complete the 'predictMissingHumidity' function below.
#
# The function is expected to return a FLOAT_ARRAY.
# The function accepts following parameters:
#  1. STRING startDate
#  2. STRING endDate
#  3. STRING_ARRAY knownTimestamps
#  4. FLOAT_ARRAY humidity
#  5. STRING_ARRAY timestamps
#
import datetime
# import regression
from sklearn.linear_model import LinearRegression

def predictMissingHumidity(startDate, endDate, knownTimestamps, humidity, timestamps):
    """Predict humidity for `timestamps` from a straight-line fit to the known readings.

    Parameters
    ----------
    startDate, endDate : str
        Accepted for interface compatibility; not used by the computation.
    knownTimestamps : list of str
        Timestamps of the known readings, 'YYYY-MM-DD HH:MM:SS'
        ('YYYY-MM-DD HH:MM' is accepted as well).
    humidity : list of float
        Known readings, aligned one-to-one with `knownTimestamps`.
    timestamps : list of str
        Timestamps to predict, same formats as `knownTimestamps`.

    Returns
    -------
    list of float
        Exactly one value per entry of `timestamps`, in order. A timestamp
        that has a known reading returns that reading unchanged; any other
        returns the fitted value clamped to [0, 100].

    Raises
    ------
    ValueError
        If a timestamp cannot be parsed, if `knownTimestamps` is empty, or if
        `knownTimestamps` and `humidity` differ in length.
    """
    def _parse(text):
        # `datetime` is the module here, so the parser is datetime.datetime.strptime
        for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M'):
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                continue
        raise ValueError('unrecognised timestamp format: ' + repr(text))

    if len(knownTimestamps) != len(humidity):
        raise ValueError('knownTimestamps and humidity must have the same length')
    if len(knownTimestamps) == 0:
        raise ValueError('at least one known reading is required')

    known_times = [_parse(text) for text in knownTimestamps]
    # dict lookup instead of scanning the list for every query; a repeated
    # timestamp keeps its last reading
    known_humidity = dict(zip(known_times, humidity))

    # Fit once, outside the loop. Time is measured in seconds from the first
    # known reading: small offsets keep the fit well conditioned and do not
    # depend on the machine's local time zone.
    origin = known_times[0]
    seconds = np.array([(moment - origin).total_seconds() for moment in known_times], dtype=float)
    readings = np.asarray(humidity, dtype=float)
    if seconds.max() == seconds.min():
        # all readings share one instant: no slope can be estimated
        slope, intercept = 0.0, float(readings.mean())
    else:
        slope, intercept = np.polyfit(seconds, readings, 1)

    predicted_humidity = []
    for text in timestamps:
        moment = _parse(text)
        if moment in known_humidity:
            predicted_humidity.append(float(known_humidity[moment]))
        else:
            estimate = intercept + slope * (moment - origin).total_seconds()
            predicted_humidity.append(float(min(100.0, max(0.0, estimate))))
    return predicted_humidity




In [None]:
# TODO: this call was left unfinished (an unterminated call is a syntax error).
# Supply the five arguments once the inputs are prepared, e.g.
# result = predictMissingHumidity(startDate, endDate, knownTimestamps, humidity, timestamps)

In [440]:
# generate random input for predict Missing Humidity
# import random
import random

# Reproducible random inputs for predictMissingHumidity.
HUMIDITY_INPUT_SEED = 0
HUMIDITY_INPUT_SIZE = 19
HUMIDITY_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Private generator: the fixed seed makes the cell repeatable without
# touching the global `random` state.
humidity_rng = random.Random(HUMIDITY_INPUT_SEED)


def random_hourly_timestamp(rng):
    """Random datetime on the hour, year 2013-2015 inclusive.

    The day is capped at 28 so that every month/day combination is valid.
    """
    return datetime.datetime(
        rng.randint(2013, 2015), rng.randint(1, 12), rng.randint(1, 28), rng.randint(0, 23), 0
    )


# known readings: a timestamp and an integer humidity in [0, 100] for each
timeStamps_known = []
humidity = []
for _ in range(HUMIDITY_INPUT_SIZE):
    timeStamps_known.append(random_hourly_timestamp(humidity_rng))
    humidity.append(humidity_rng.randint(0, 100))

# timestamps to predict
timeStamps = [random_hourly_timestamp(humidity_rng) for _ in range(HUMIDITY_INPUT_SIZE)]

# earliest and latest known timestamp
startDate = min(timeStamps_known)
endDate = max(timeStamps_known)

# string versions, since predictMissingHumidity takes its dates as strings
timeStamps_known_str = [moment.strftime(HUMIDITY_TIME_FORMAT) for moment in timeStamps_known]
timeStamps_str = [moment.strftime(HUMIDITY_TIME_FORMAT) for moment in timeStamps]
startDate_str = startDate.strftime(HUMIDITY_TIME_FORMAT)
endDate_str = endDate.strftime(HUMIDITY_TIME_FORMAT)

print(startDate)


2013-02-16 23:00:00
