In [17]:
import pandas as pd
import numpy as np
import datetime
import math
import quandl
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt  
import seaborn as seabornInstance
pd.set_option('mode.chained_assignment', None)
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

## Read and Clean Data

In [43]:
def remove_foreign_reit_adr():
    '''
    @author: Wuding Li
    @The function returns a list of CUSIPs (last character dropped) for the
     tickers that survive the filter: category 'Domestic' and a sicindustry
     other than 'Real Estate Investment Trusts'.
    @Reference: The REIT and foreign-stock filter comes from the Quandl
     SHARADAR/TICKERS table.
    @Note: The API key is read from the QUANDL_API_KEY environment variable
     rather than being hard-coded in the notebook. If the variable is not set,
     a key already configured on quandl.ApiConfig is used; if there is none,
     a RuntimeError is raised.
    '''
    import os  # local import: keeps this cell independent of the import cell

    api_key = os.environ.get("QUANDL_API_KEY")
    if api_key:
        quandl.ApiConfig.api_key = api_key
    elif not getattr(quandl.ApiConfig, "api_key", None):
        raise RuntimeError(
            "Quandl API key not configured: set the QUANDL_API_KEY environment variable"
        )

    tickers = quandl.get_table("SHARADAR/TICKERS", paginate=True)
    # keep domestic stocks only
    domestic = tickers[tickers['category'] == 'Domestic']
    # remove REITs
    kept = domestic[domestic['sicindustry'] != 'Real Estate Investment Trusts']

    # The 'cusips' field can hold several space-separated CUSIPs (or be empty).
    # Split every non-empty entry and drop the last character of each CUSIP so
    # it matches the CUSIP format used in the dsf file. Empty tokens are skipped
    # so no empty strings end up in the result.
    output = [
        cusip[:-1]
        for entry in kept['cusips'].values.tolist()
        if entry
        for cusip in entry.split()
    ]
    return output

In [5]:
# Load the CRSP daily stock file (dsf) extract from disk.
crsp = pd.read_csv('mydata_dsf.csv')
# First filter: keep only rows whose CUSIP is in the list returned by
# remove_foreign_reit_adr().
keep = remove_foreign_reit_adr()
crsp = crsp.loc[crsp['CUSIP'].isin(keep)]
# Parse the DATE column so later cells can compare and format dates.
crsp['DATE'] = pd.to_datetime(crsp['DATE'])

  interactivity=interactivity, compiler=compiler, result=result)


### Apply more filters and remove missing data

In [44]:
def check_closed_end_fund(data):
    '''
    @author: Wuding Li
    @The function returns the PERMNOs that are present at both ends of the
     dataset; the notebook uses this list as its "no closed end fund" filter.
    @Variable: data: the input dataset (needs DATE and PERMNO columns and at
     least three rows)
    @Return: list of PERMNO taken from the rows at the start dates, keeping
     only those that also occur at the end dates (duplicates are not removed)
    '''
    all_dates = data['DATE'].values

    # NOTE(review): the start/end dates are the DATE values of the first and
    # last three *rows*, so the result depends on row order -- confirm this is
    # the intended definition of "first/last three dates".
    first_dates = [all_dates[0], all_dates[1], all_dates[2]]
    last_dates = [all_dates[-1], all_dates[-2], all_dates[-3]]

    at_start = data.loc[data['DATE'].isin(first_dates), 'PERMNO'].values.tolist()
    at_end = set(data.loc[data['DATE'].isin(last_dates), 'PERMNO'].values.tolist())

    survivors = []
    for permno in at_start:
        if permno in at_end:
            survivors.append(permno)
    return survivors

In [45]:
def check_more_than_5(data):
    '''
    @author: Wuding Li
    @The function returns the list of PERMNOs to DROP: every PERMNO whose price
     is below $5 on at least one row of the input (sampling + testing period).
    @Variable: data: the input dataset (needs PRC and PERMNO columns)
    @Note: The absolute value of PRC is used. CRSP reports a negative PRC when
     the value is a bid/ask average rather than a closing price, so comparing
     the raw value with 5 would flag every such stock regardless of its price.
    '''
    below_five = data['PRC'].abs() < 5
    # set() removes the duplicates produced by multiple low-price days
    output = list(set(data.loc[below_five, 'PERMNO'].values.tolist()))
    return output

In [46]:
def remove_missing(data):
    '''
    @author: Wuding Li
    @The function returns a list of the distinct PERMNOs that have a missing
     value in any column on at least one row
    @Variable: data: the input dataset
    '''
    # True for every row that contains at least one null cell
    has_gap = data.isnull().any(axis=1)
    affected_permnos = data[has_gap]['PERMNO'].values
    return list(set(affected_permnos))

The cells above contain all the steps and functions used to wrangle the raw dataset. The functions `check_closed_end_fund`, `check_more_than_5` and `remove_missing` are called inside `rolling_time_window`, so the filters are applied only to the rows of the current window, which saves computation time.

## Generating samples 

In [47]:
def rolling_time_window(Start_Month, Start_Year, Time_Period_Training, Time_Period_Testing, Gap, data):
    '''
    @author: Wuding Li
    @The function returns two cleaned datasets. One is the sampling period data and the other is testing period data
    @variables: Start_Month: the starting month (1-12) of our rolling time period
                Start_Year: the starting year of our rolling time period
                Time_Period_Training: Span of sampling Period, in months
                Time_Period_Testing: Span of testing Period, in months
                Gap: Gap between sampling and testing, in months
                data: The input dataset (DATE must be datetime-like; PERMNO,
                      PRC, RET and RETX columns are used)
    @Also this function applies the closed end fund filter, the $5 price filter
     and the missing-data filter over the whole window (sampling start through
     testing end, gap included)

    @Updated: 10/7 fixed the end_date_test "frequency = 0" problem
    @Updated: window boundaries are now computed with month offsets, so
              - a testing window starting in December no longer becomes month 0
                of the following year,
              - Time_Period_Testing longer than one month is honoured,
              - Time_Period_Training = 1 no longer produces a '0M' frequency.
    '''
    one_day = pd.Timedelta(days=1)

    # Sampling window: first day of the start month .. last day of the final training month
    start_date_train = pd.Timestamp(year=int(Start_Year), month=int(Start_Month), day=1)
    end_date_train = start_date_train + pd.DateOffset(months=Time_Period_Training) - one_day

    # Testing window starts Gap months after the sampling window ends
    start_date_test = start_date_train + pd.DateOffset(months=Time_Period_Training + Gap)
    end_date_test = start_date_test + pd.DateOffset(months=Time_Period_Testing) - one_day

    # The filters below are applied on the master window (sampling start .. testing end)
    mask_master = (data['DATE'] >= start_date_train) & (data['DATE'] <= end_date_test)
    data_master = data.loc[mask_master]

    # PERMNOs present at both ends of the window
    keep_list = check_closed_end_fund(data_master)
    # PERMNOs with a price below $5 or with missing data; a set keeps the
    # membership test below O(1)
    delete_set = set(check_more_than_5(data_master)) | set(remove_missing(data_master))
    keep_list = [permno for permno in keep_list if permno not in delete_set]

    # .copy() so the column assignments below write to an independent frame
    data_master = data_master[data_master['PERMNO'].isin(keep_list)].copy()

    # RET / RETX may contain non-numeric codes: coerce them to NaN, then to 0
    for column in ('RET', 'RETX'):
        data_master[column] = (
            pd.to_numeric(data_master[column], errors='coerce').fillna(0).astype(float)
        )

    # Build the final masks on the filtered frame so their index matches it
    dates = data_master['DATE']
    mask_train = (dates >= start_date_train) & (dates <= end_date_train)
    mask_test = (dates >= start_date_test) & (dates <= end_date_test)

    return data_master.loc[mask_train], data_master.loc[mask_test]

In [55]:
def get_monthly_ret_dsf(data):
    '''
    @author: Wuding Li
    @The function compounds daily returns into monthly returns per PERMNO
    @Variable: data: the input dataset (daily rows with PERMNO, DATE, RET and
     VOL columns). Two helper columns, 'ret' and 'YearMonth', are added to it.
    @Return: dataset with columns YearMonth ('YYYYMM' string), PERMNO (string)
     and ret (compounded monthly return)
    '''
    # gross daily return (1 + r) so that a product compounds the returns
    data['ret'] = data['RET'].astype(float) + 1
    data['YearMonth'] = data['DATE'].apply(lambda day: day.strftime('%Y-%m'))

    monthly = (
        data.groupby(['PERMNO', 'YearMonth'])
            .agg({'ret': 'prod', 'VOL': 'sum'})
            .reset_index()
    )
    # back from gross to net return
    monthly['ret'] = monthly['ret'] - 1

    # 'YYYY-MM' -> 'YYYYMM'
    monthly['YearMonth'] = monthly['YearMonth'].apply(lambda ym: ym[:4] + ym[-2:])

    ticker_ret = monthly[['YearMonth', 'PERMNO', 'ret']].assign(
        PERMNO=lambda frame: frame['PERMNO'].astype(str)
    )
    return ticker_ret

## Test Fama French Factors' Exposure 
### Run regression

In [49]:
def get_weight(data):
    '''
    @author: Wuding Li
    @The function adds a 'weight' column to the dataset, assigning a weight to
     each stock for each month:
         weight = -(ret - mean ret of that YearMonth) / count
     where count is the number of distinct PERMNOs in the whole dataset.
     Stocks that beat the monthly mean therefore get a negative weight.
    @Variable: data: the input dataset (Monthly data with PERMNO, YearMonth
     and ret columns). It is modified in place and also returned.
    '''
    count = len(set(data['PERMNO'].values))
    if count == 0:
        # Empty input: nothing to weight, and -1/count would divide by zero
        data['weight'] = pd.Series(index=data.index, dtype=float)
        return data

    # transform() returns a result aligned with data's own index; groupby.apply
    # prepends the group key to the index on recent pandas versions, which
    # makes the column assignment fail.
    monthly_mean = data.groupby(['YearMonth'])['ret'].transform('mean')
    data['weight'] = (data['ret'] - monthly_mean) * (-1 / count)
    return data

In [50]:
def regression_model(combined_temp):
    '''
    @author: Robin Lam / Yulin Chen
    Given one stock i, this function runs the three-factor regression
        ticker_excess_ret = alpha + b1*mkt_ret + b2*SMB + b3*HML + residual
    and splits the rows into winner and loser observations.
    Input:
        combined_temp: dataframe which contains returns and factor exposures
            variables used: mkt_ret, SMB, HML, ticker_excess_ret, weight
    Output:
        winner: rows with weight < 0
        loser:  rows with weight > 0
        Both carry these extra/replaced columns:
            residual            - regression residual, y - (alpha + fitted factor part)
            mkt, SMB, HML       - factor value times its fitted coefficient
                                  (SMB and HML replace the raw factor columns)
            port_mkt, port_SMB, port_HML - the above multiplied by weight
    Note:
        The input frame is copied, so the caller's dataframe is left untouched.
    '''
    # Work on a copy: the caller usually passes a slice of a larger frame
    combined_temp = combined_temp.copy()
    x = combined_temp[['mkt_ret','SMB','HML']]
    y = combined_temp['ticker_excess_ret']

    # Linear regression model
    regressor = LinearRegression(fit_intercept=True)
    regressor.fit(x, y) #training the algorithm

    # retrieving the intercept
    alpha = regressor.intercept_
    # retrieving the slopes (order follows the columns of x)
    coef = regressor.coef_

    # contribution of each factor to the fitted excess return
    mkt_part = x['mkt_ret']*coef[0]
    smb_part = x['SMB']*coef[1]
    hml_part = x['HML']*coef[2]

    # Residual = actual - fitted. The whole fitted value is subtracted;
    # previously only the market term was subtracted and the SMB/HML terms
    # were added back, and the intercept was ignored.
    combined_temp['residual'] = y - (alpha + mkt_part + smb_part + hml_part)
    combined_temp['mkt'] = mkt_part
    combined_temp['SMB'] = smb_part
    combined_temp['HML'] = hml_part
    combined_temp['port_mkt'] = combined_temp['mkt']*combined_temp['weight']
    combined_temp['port_SMB'] = combined_temp['SMB']*combined_temp['weight']
    combined_temp['port_HML'] = combined_temp['HML']*combined_temp['weight']

    # negative weight marks the winner rows, positive weight the loser rows
    winner = combined_temp[combined_temp['weight'] < 0]
    loser = combined_temp[combined_temp['weight'] > 0]

    return winner, loser

In [65]:
def _sum_by_month(frame, column):
    '''Sum `column` within each YearMonth; a NaN in a group makes that group's sum NaN.'''
    return frame.groupby(['YearMonth'])[column].agg(lambda values: values.sum(skipna=False))


def all_regression(df):
    '''
    @author: Robin Lam
    Runs regression_model once per stock and aggregates the weighted factor
    contributions of the winner and loser rows by month.
    Input:
        df: dataframe which contains ALL of the stocks' returns and factor exposures
            variables: YearMonth, mkt_ret, SMB, HML, RF, PERMNO, ticker_ret,
                       ticker_excess_ret, weight
    Output (six Series indexed by YearMonth, in this order):
        mkt_winner, SMB_winner, HML_winner: monthly sums of port_mkt / port_SMB /
            port_HML over the winner rows
        mkt_loser, SMB_loser, HML_loser: the same sums over the loser rows
    '''
    winner_parts = []
    loser_parts = []
    for permno in df['PERMNO'].unique():
        result = regression_model(df[df['PERMNO'] == permno])
        # Only the first two items (winner rows, loser rows) are needed;
        # indexing instead of unpacking tolerates extra return values.
        winner_parts.append(result[0])
        loser_parts.append(result[1])

    # Concatenate once instead of growing a DataFrame inside the loop.
    # pd.concat rejects an empty list, so fall back to an empty frame that
    # still has the columns used below.
    empty = pd.DataFrame(columns=['YearMonth', 'port_mkt', 'port_SMB', 'port_HML'])
    output_winner = pd.concat(winner_parts) if winner_parts else empty
    output_loser = pd.concat(loser_parts) if loser_parts else empty

    # winner portfolio
    mkt_winner = _sum_by_month(output_winner, 'port_mkt')
    SMB_winner = _sum_by_month(output_winner, 'port_SMB')
    HML_winner = _sum_by_month(output_winner, 'port_HML')

    # loser portfolio (kept in its own variables so it no longer overwrites
    # the winner results)
    mkt_loser = _sum_by_month(output_loser, 'port_mkt')
    SMB_loser = _sum_by_month(output_loser, 'port_SMB')
    HML_loser = _sum_by_month(output_loser, 'port_HML')

    return mkt_winner, SMB_winner, HML_winner, mkt_loser, SMB_loser, HML_loser

### check missing values

In [26]:
#missing values check
def percent_missing(df):
    '''
    Return a dict mapping each column name of `df` to the percentage of
    missing values in that column, rounded to two decimals.
    '''
    frame = pd.DataFrame(df)
    return {
        column: round(frame[column].isnull().mean() * 100, 2)
        for column in list(frame)
    }
# NOTE: `train` must already exist in the kernel when this cell runs; it is
# assigned in the rolling-window loop cell further down the notebook.
missing = percent_missing(train)
# (column, percent) pairs, highest percentage of missing values first
df_missing = sorted(missing.items(), key=lambda item: item[1], reverse=True)
print('percentage of missing data')
df_missing[:10]

percentage of missing data


[('PERMNO', 0.0),
 ('YearMonth', 0.0),
 ('ret', 0.0),
 ('VOL', 0.0),
 ('weight', 0.0)]

### create a sample to test

### Read in Fama French

In [42]:
def format_FF():
    '''
    @author: Zhikang Wang
    @The function reads and cleans up the fama_french dataset pulled from Dr. French's website
    @Variable: no arguments; the data is read from "fama_french.csv"
    @Return: dataset with columns YearMonth (string), mkt_ret, SMB, HML, RF,
     where the four numeric columns are the file values divided by 100
    '''
    raw = pd.read_csv("fama_french.csv")

    # Keep only the needed columns and give the market factor its working name
    ff_ret = raw[['YearMonth', 'Mkt-RF', 'SMB', 'HML', 'RF']].rename(
        columns={"Mkt-RF": "mkt_ret"}
    )

    # Scale the values in the file down by 100
    for column in ("mkt_ret", "SMB", "HML", "RF"):
        ff_ret[column] = ff_ret[column] / 100

    # YearMonth is used as a string merge key
    ff_ret["YearMonth"] = ff_ret["YearMonth"].astype(str)

    return ff_ret

### Loop over six years window
Each window is built by calling `rolling_time_window(Start_Month, Start_Year, Time_Period_Training, Time_Period_Testing, Gap, data)`.

In [82]:
# Authors: Zhikang Wang / Yulin Chen
# Loop over rolling windows starting in January 1986 and collect, for every
# window, the monthly factor contributions of the winner and loser portfolios.
mkt_winner_output = []
SMB_winner_output = []
HML_winner_output = []
mkt_loser_output = []
SMB_loser_output = []
HML_loser_output = []

start = 1986

# The factor table is the same for every window, so it is read only once.
ff_ret = format_FF()

for i in range(1,3):
    # i counts window start months from January of `start`: 1 -> Jan, 12 -> Dec,
    # 13 -> Jan of the following year, ...
    year_offset, month_index = divmod(i - 1, 12)
    month = month_index + 1
    year = start + year_offset

    # 36-month sampling window, 1-month testing window, no gap
    train,test = rolling_time_window(month, year, 36, 1, 0, crsp)
    train = get_monthly_ret_dsf(train)
    train = get_weight(train)
    train = train.rename(columns = {"ret": "ticker_ret"})
    # Merging the two dataset into one combined return dataset, which will be used as regression input
    combined_ret = pd.merge(ff_ret, train, how='right', on=['YearMonth'])
    # calculates the excess return of tickers by subtracting the returns by risk free rate
    combined_ret['ticker_excess_ret'] = combined_ret['ticker_ret'] - combined_ret['RF']

    # Loser results get their own names; unpacking them into the winner names
    # would overwrite the winner SMB/HML series.
    mkt_winner,SMB_winner,HML_winner,mkt_loser,SMB_loser,HML_loser = all_regression(combined_ret)

    mkt_winner_output.append(mkt_winner)
    SMB_winner_output.append(SMB_winner)
    HML_winner_output.append(HML_winner)
    mkt_loser_output.append(mkt_loser)
    SMB_loser_output.append(SMB_loser)
    HML_loser_output.append(HML_loser)
    

In [88]:
# NOTE(review): `dic` is not assigned in any visible cell of this notebook --
# confirm where it comes from before relying on a fresh-kernel run.
# One row per key of `dic`, one column per position of its array value.
temp = pd.DataFrame(dic).transpose()

In [90]:
# Display the dictionary: keys are strings and values are 3-element arrays
# (see the output below).
dic

{'22680': array([ 1.38229092,  0.81682641, -0.41785007]),
 '37197': array([ 0.76290042, -0.61521479, -1.31481328]),
 '13056': array([1.23183184, 0.31376275, 0.43213889]),
 '15966': array([ 0.97599983, -0.07472138,  0.90561557]),
 '47706': array([0.83296346, 1.21273462, 0.1980477 ]),
 '55511': array([ 0.79470932, -0.90223241,  1.13928125]),
 '15684': array([1.37708238, 0.62948814, 0.51755717]),
 '68304': array([ 1.94036917, -0.11143479,  0.34701493]),
 '11691': array([ 0.95597734,  1.00541667, -0.22271264]),
 '45866': array([ 1.03189154,  1.52586665, -0.90180901]),
 '32563': array([ 0.91756663,  1.33327413, -0.68735052]),
 '84073': array([0.82060338, 1.93477938, 1.84031981]),
 '58334': array([ 0.72825009, -0.68337552,  0.68628551]),
 '81294': array([ 0.68324528,  0.19151826, -1.1084069 ]),
 '44194': array([ 1.17033446, -0.16002923, -2.0276569 ]),
 '71116': array([ 0.99269686, -0.15509063, -0.17421645]),
 '22250': array([1.0880089 , 0.71297971, 0.58739205]),
 '41179': array([1.29986544, 