In [1]:
import pandas as pd
import numpy as np
import datetime
import math
import quandl
pd.set_option('display.max_columns', 500)

## Read and Clean Data

In [2]:
def remove_foreign_reit_adr():
    '''
    @author: Wuding Li
    @The function returns a list of CUSIPs which not been filtered out
    @Reference: The function get the REIT and Foreign Stock filter through the Quandl database
    '''
    quandl.ApiConfig.api_key = "wJQ9TBnyt36w1a-dZjDg"
    CUSIPdic = quandl.get_table("SHARADAR/TICKERS", paginate=True)
    #get all domestic stock
    domestic = CUSIPdic[CUSIPdic['category'] == 'Domestic']
    #remove REIT
    keep = domestic[domestic['sicindustry'] != 'Real Estate Investment Trusts']
    #get CUSIP for all those keep ticker
    keep_list = keep['cusips'].values.tolist()
    #remove none in list
    res = [i for i in keep_list if i] 
    #seperate by space
    temp = " ".join(res).split(' ')
    #remove the last char in each elements to match CUSIP in dsf
    output = [x[:-1] for x in temp]
    return output

In [None]:
#read in crsp dsf data
crsp = pd.read_csv('mydata_dsf.csv')
#apply the first filter
keep = remove_foreign_reit_adr()
crsp = crsp[crsp['CUSIP'].isin(keep)]
crsp['DATE'] =  pd.to_datetime(crsp['DATE'])

### Apply more filters and remove missing data

In [3]:
def check_closed_end_fund(data):
    '''
    @author: Wuding Li
    @The function return a list of PERMNO with no closed end fund
    @Variable: data: the input dataset
    '''
    
    date = data['DATE'].values
    s_date = [date[0],date[1],date[2]]
    PERMNO_list_start = (data[data['DATE'].isin(s_date)])['PERMNO'].values.tolist()
    
    e_date = [date[-1],date[-2],date[-3]]
    PERMNO_list_end = (data[data['DATE'].isin(e_date)])['PERMNO'].values.tolist()
    
    temp = set(PERMNO_list_end) 
    output = [value for value in PERMNO_list_start if value in temp] 
    
    return output

In [4]:
def check_more_than_5(data):
    '''
    @author: Wuding Li
    @The function returns a list of PERMNO which it's price in larger than $5 in the sampling and testing period
    @Variable: data: the input dataset
    '''
    mask = (data['PRC'] < 5)
    output = list(set((data.loc[mask])['PERMNO'].values.tolist()))
    return output

In [5]:
def remove_missing(data):
    '''
    @author: Wuding Li
    @The function returns a list of PERMMO that has missing values
    @Variable: data: the input dataset
    '''
    return list(set(data[data.isnull().any(axis=1)]['PERMNO'].values))

Above are all steps and functions for the raw dataset wrangling. The function{check_closed_end_fund}, function{check_more_than_5} and function{remove_missing} are called in the function{rolling_time_window} to save computational time and increase computation efficiency

## Generating samples 

In [6]:
def rolling_time_window(Start_Month, Start_Year, Time_Period_Training, Time_Period_Testing, Gap, data):
    '''
    @author: Wuding Li
    @The function returns two cleaned datasets. One is the sampling period data and the other is testing period data
    @variables: Start_Month: the starting month of our rolling time period
                Start_Year: the starting year of our rolling time period
                Time_Period_Training: Span of sampling Period
                Time_Period_Testing: Span of testing Period
                Gap: Gap between sampling and testing
                Data: The input data dataset
    @Also this function apply the filter no closed end fund and stock price more than 5 dollars
    '''
    
    #get the start and end date of data
    start_date_train = str(pd.date_range(start=str(Start_Month)+'/1/'+str(Start_Year), periods=1, freq='D')[0].date())
    end_date_train = str(pd.date_range(start=str(Start_Month)+'/1/'+str(Start_Year), periods=2, freq=str(Time_Period_Training-1)+'M')[-1].date())
    mask_train = (data['DATE'] >= start_date_train) & (data['DATE'] <= end_date_train)
    
    
    start_month_testing = Start_Month+Time_Period_Training+Gap
    start_year_testing = Start_Year + math.floor(start_month_testing/12)
    start_month_testing = start_month_testing % 12    
    start_date_test = str(pd.date_range(start=str(start_month_testing)+'/1/'+str(start_year_testing), periods=1, freq='D')[0].date())
    end_date_test = str(pd.date_range(start=str(start_month_testing)+'/1/'+str(start_year_testing), periods=2, freq=str(Time_Period_Testing-1)+'M')[-1].date())
    mask_test = (data['DATE'] >= start_date_test) & (data['DATE'] <= end_date_test)
    
    #applying filter functions below on the master dataset
    mask_master = (data['DATE'] >= start_date_train) & (data['DATE'] <= end_date_test)
    data_master = data.loc[mask_master]
    
    #check closed end fund
    keep_list = check_closed_end_fund(data_master) 
    #check more than 5 and missing data
    delete_list = check_more_than_5(data_master) + remove_missing(data_master)
    
    #comprehend both list
    keep_list = [x for x in keep_list if x not in delete_list]
    
    data_master = data_master[data_master['PERMNO'].isin(keep_list)]
    #remove strings in columns
    data_master[['RET']] = data_master[['RET']].apply(pd.to_numeric, errors='coerce').fillna(0).astype(float).dropna()
    data_master[['RETX']] = data_master[['RETX']].apply(pd.to_numeric, errors='coerce').fillna(0).astype(float).dropna()

    
    return data_master.loc[mask_train],data_master.loc[mask_test]

In [7]:
def get_monthly_ret_dsf(data):
    '''
    @author: Wuding Li
    @The function returns a dataset that convert daily return to monthly return
    @Variable: data: the input dataset
    '''
    data['ret'] = data['RET'].astype(float).apply(lambda x: x+1)
    data['YearMonth'] = data['DATE'].apply(lambda x:x.strftime('%Y-%m'))
    temp = data.groupby(['PERMNO','YearMonth']).agg({'ret': 'prod',
                                          'VOL': 'sum'}).reset_index()
    temp['ret'] = temp['ret'] - 1
    return temp

## Test Fama French Factors' Exposure 
### Run regression

In [8]:
def get_weight(data):
    '''
    @author: Wuding Li
    @The function add a column to the data set that assigned the weight to each stock for each month
    @Variable: data: the input dataset (Monthly data)                    
    '''
    count = len(set(data['PERMNO'].values))
    data['weight']=data.groupby(['YearMonth'])['ret'].apply(lambda x: (x-x.mean())*(-1/count))
    return data

### check missing values

In [274]:
#missing values check
def percent_missing(df):
    data = pd.DataFrame(df)
    df_cols = list(pd.DataFrame(data))
    dict_x = {}
    for i in range(0,len(df_cols)):
        dict_x.update({df_cols[i]: round(data[df_cols[i]].isnull().mean()*100,2)})
    return dict_x
#features = train.iloc[:,-1:-6]
missing = percent_missing(train)
df_missing = sorted(missing.items(), key = lambda x: x[1], reverse = True)
print('percentage of missing data')
df_missing[0:10]

percentage of missing data


[('PERMNO', 0.0),
 ('DATE', 0.0),
 ('CUSIP', 0.0),
 ('PRC', 0.0),
 ('VOL', 0.0),
 ('ASKHI', 0.0),
 ('BIDLO', 0.0),
 ('RET', 0.0),
 ('RETX', 0.0),
 ('YEAR', 0.0)]

### create a sample to test

# Regression

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import seaborn as seabornInstance
pd.set_option('mode.chained_assignment', None)
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

In [110]:
# Reading dataset from fama french and tickers
ff_data = pd.read_csv("fama_french.csv")
ticker_data = pd.read_csv("sample.csv")
print(ff_data.shape)
print(ticker_data.shape)

# Getting and renaming the columns we need
ff_ret = ff_data[['YearMonth','Mkt-RF','SMB','HML','RF']]
ff_ret = ff_ret.rename(columns = {"Mkt-RF": "mkt_ret"})
# converting the factor exposures to percentage
ff_ret["mkt_ret"] = ff_ret["mkt_ret"]/100
ff_ret["SMB"] = ff_ret["SMB"]/100
ff_ret["HML"] = ff_ret["HML"]/100
ff_ret["RF"] = ff_ret["RF"]/100
ff_ret["YearMonth"] = ff_ret["YearMonth"].astype(str) #converting YearMonth to string
print(ff_ret.head())

# Getting and renaming the columns we need, includes proper formatting
ticker_data['YearMonth'] = ticker_data['YearMonth'].apply(lambda x: x[:4] + x[-2:])
ticker_ret = ticker_data[['YearMonth','PERMNO','ret']]
ticker_ret = ticker_ret.rename(columns = {"ret": "ticker_ret"})
ticker_ret["PERMNO"] = ticker_ret["PERMNO"].astype(str)
print(ticker_ret.head())

# Merging the two dataset into one combined return dataset, which will be used as regression input
combined_ret = pd.merge(ff_ret, ticker_ret, how='right', on=['YearMonth'])
# calculates the excess return of tickers by subtracting the returns by risk free rate
combined_ret['ticker_excess_ret'] = combined_ret['ticker_ret'] - combined_ret['RF']
print(combined_ret.head())

(1117, 5)
(14256, 6)
  YearMonth  mkt_ret     SMB     HML      RF
0    192607   0.0296 -0.0230 -0.0287  0.0022
1    192608   0.0264 -0.0140  0.0419  0.0025
2    192609   0.0036 -0.0132  0.0001  0.0023
3    192610  -0.0324  0.0004  0.0051  0.0032
4    192611   0.0253 -0.0020 -0.0035  0.0031
  YearMonth PERMNO  ticker_ret
0    198601  10137    0.056602
1    198602  10137    0.089285
2    198603  10137    0.041489
3    198604  10137   -0.003205
4    198605  10137    0.048230
  YearMonth  mkt_ret     SMB     HML      RF PERMNO  ticker_ret  \
0    198601   0.0065  0.0122  0.0053  0.0056  10137    0.056602   
1    198601   0.0065  0.0122  0.0053  0.0056  10225    0.038138   
2    198601   0.0065  0.0122  0.0053  0.0056  10364    0.115647   
3    198601   0.0065  0.0122  0.0053  0.0056  10401   -0.145000   
4    198601   0.0065  0.0122  0.0053  0.0056  10516    0.062499   

   ticker_excess_ret  
0           0.051002  
1           0.032538  
2           0.110047  
3          -0.150600  
4    

In [101]:
#temp = combined_ret.loc[combined_ret['PERMNO']=='10137']
#temp

Unnamed: 0,YearMonth,mkt_ret,SMB,HML,RF,PERMNO,ticker_ret,ticker_excess_ret
0,198601,0.0065,0.0122,0.0053,0.0056,10137,0.0566022,0.051002
396,198602,0.0713,-0.0065,-0.0094,0.0053,10137,0.08928543,0.083985
792,198603,0.0488,-0.0052,-0.0044,0.006,10137,0.04148926,0.035489
1188,198604,-0.0131,0.0284,-0.0285,0.0052,10137,-0.003205256,-0.008405
1584,198605,0.0462,-0.0132,-0.0011,0.0049,10137,0.0482298,0.04333
1980,198606,0.0103,-0.0091,0.014,0.0052,10137,0.1045981,0.099398
2376,198607,-0.0645,-0.0338,0.0478,0.0052,10137,0.07344654,0.068247
2772,198608,0.0607,-0.0417,0.0352,0.0046,10137,0.07368441,0.069084
3168,198609,-0.086,0.0228,0.0319,0.0045,10137,-0.09153421,-0.096034
3564,198610,0.0466,-0.0248,-0.0132,0.0046,10137,0.03561779,0.031018


In [91]:
def regression_model(combined_temp):
    '''
    @author: Robin Lam
    Given each stock i, this function runs the regression model.
    Input:
        combined_temp: dataframe which contains returns and factor exposures
            variables: YearMonth, mkt_ret, SMB, HML, RF, PERMNO, ticker_ret, ticker_excess_ret
    Output:
        combined_temp: same dataframe as the input, but with an extra column (residual)
        coef: list of coefficients from the regression for each stock i. should have 3 betas.
    '''
    output = {}
    x = combined_temp[['mkt_ret','SMB','HML']]
    y = combined_temp['ticker_excess_ret']
    
    # Linear regression model
    regressor = LinearRegression(fit_intercept=True)  
    regressor.fit(x, y) #training the algorithm
    
    # retrieving the intercept
    alpha = regressor.intercept_
    # retrieving the slope
    coef = regressor.coef_
    
    # calculating the residuals for each regression model
    combined_temp['residual'] = y - x['mkt_ret']*coef[0] + x['SMB']*coef[1] + x['HML']*coef[2]
    
    # making prediction
    #y_pred = regressor.predict(x_test)
    # prediction comparison
    #df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
    # plotting prediction comparison
    #plt.scatter(x_test, y_test,  color='gray')
    #plt.plot(x_test, y_pred, color='red', linewidth=2)
    #plt.show()
    
    return combined_temp, coef

In [81]:
combined_ret, coef= regression_model(temp)

[ 0.47949864 -0.89536562  0.39820026]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [86]:
dict_coef = {}
permno_list = list(set(combined_ret['PERMNO'].values.tolist()))
dict_coef = {permno_list[0]: list(coef)}
dict_coef

{'10137': [0.4794986369669874, -0.8953656230874513, 0.39820025670280446]}

In [102]:
def all_regression(df):
    '''
    @author: Robin Lam
    Given each stock i, this function runs the regression model.
    Input:
        df: dataframe which contains ALL of the stocks' returns and factor exposures
            variables: YearMonth, mkt_ret, SMB, HML, RF, PERMNO, ticker_ret, ticker_excess_ret
    Output:
        output: same dataframe as the input, but with an extra column (residual)
        dict_coef: dictionary of coefficients from all the regression for ALL stocks. should have 3 betas.
    '''
    dict_ceof = {}
    output = pd.DataFrame()
    permno_list = list(set(df['PERMNO'].values.tolist()))
    for i in permno_list:
        temp = df[df['PERMNO'] == i]
        combined_ret, coef = regression_model(temp)
        dict_coef[i] = coef
        output = output.append(combined_ret)
    return output, dict_coef

In [112]:
output, dict_coef = all_regression(combined_ret)

In [117]:
dict_coef

{'10137': array([ 0.47949864, -0.89536562,  0.39820026]),
 '52978': array([1.19258894, 1.40028496, 0.21945169]),
 '11260': array([ 2.11481494, -0.73453078,  1.13938927]),
 '45241': array([ 1.09492389,  0.18421524, -0.62576908]),
 '39765': array([1.71544959, 1.30631406, 1.77800304]),
 '16548': array([0.58902253, 0.66496816, 0.11003536]),
 '13928': array([1.1065545 , 0.40887869, 0.33542244]),
 '51721': array([ 1.32208747,  0.88954317, -0.33466089]),
 '14891': array([1.18909636, 0.45621785, 0.70961322]),
 '64531': array([1.08116225, 1.00856782, 0.96382299]),
 '20124': array([1.36519021, 0.51006578, 1.46159999]),
 '32563': array([ 0.91748744,  1.34242606, -0.68320071]),
 '15991': array([ 1.30342435, -0.53452372, -2.32990873]),
 '66181': array([ 1.6083566 ,  0.19941354, -0.33248264]),
 '21960': array([ 1.55683737, -0.07523271,  0.92075343]),
 '37197': array([ 0.6895618 , -0.56167004, -1.39688438]),
 '52265': array([ 0.93438265,  0.15716647, -0.28750555]),
 '51706': array([ 0.80411434,  0.27

In [118]:
output

Unnamed: 0,YearMonth,mkt_ret,SMB,HML,RF,PERMNO,ticker_ret,ticker_excess_ret,residual
272,198601,0.0065,0.0122,0.0053,0.0056,52978,6.590930e-02,0.060309,0.070804
668,198602,0.0713,-0.0065,-0.0094,0.0053,52978,1.418931e-01,0.136593,0.040397
1064,198603,0.0488,-0.0052,-0.0044,0.0060,52978,1.331355e-01,0.127136,0.060690
1460,198604,-0.0131,0.0284,-0.0285,0.0052,52978,2.432619e-02,0.019126,0.068263
1856,198605,0.0462,-0.0132,-0.0011,0.0049,52978,1.020610e-02,0.005306,-0.068517
2252,198606,0.0103,-0.0091,0.0140,0.0052,52978,1.717165e-01,0.166517,0.144563
2648,198607,-0.0645,-0.0338,0.0478,0.0052,52978,-5.091771e-02,-0.056118,-0.016036
3044,198608,0.0607,-0.0417,0.0352,0.0046,52978,-4.546209e-03,-0.009146,-0.132204
3440,198609,-0.0860,0.0228,0.0319,0.0045,52978,-1.826485e-02,-0.022765,0.118725
3836,198610,0.0466,-0.0248,-0.0132,0.0046,52978,-1.665880e-01,-0.171188,-0.264386
