<Header> This is is where we develop a baseline factor model that allows us to take certain factors and use them in multi-OLS regression to predict our excess return signals on a security level. These signals are absolute. </Header>

In [1]:
#First, import our packages for the database connection and dataframe access
import mysql.connector
import pandas as pd
import statsmodels.api as sm
import numpy as np
import warnings
from datetime import datetime, timedelta

In [2]:
#Ignore warnings and set max row display option
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 300)

In [3]:
# Connect to MySQL database
connection = mysql.connector.connect(
    host="ubctg.con7266gcvin.us-east-2.rds.amazonaws.com",
    user="admin",
    password="ubctgquant",
    database="ubctg"
)

Here, we pull monthly returns across the stock universe over 10 years from 2011 to 2021

In [4]:
# Create a cursor object to execute SQL queries
cursor = connection.cursor()

# Define the start and end dates
start_date = '2017-12-01'
end_date = '2021-01-31'

# SQL query to retrieve data from the "Volatility" table between two dates
sql_query = f"SELECT * FROM `Monthly Returns` mr WHERE Date BETWEEN '{start_date}' AND '{end_date}'"

# Execute the SQL query
cursor.execute(sql_query)

# Fetch all rows from the result set
universe_data = cursor.fetchall()

# Convert fetched data into a pandas DataFrame
columns = [i[0] for i in cursor.description]  # Extract column names from cursor description

#create new df
universe_df = pd.DataFrame(universe_data, columns=columns)

#close cursor and db connection
cursor.close()

True

Generate the number of observations per security in our table. This way we can remove those that do not have a full dataset

In [5]:
# Create a cursor object to execute SQL queries
cursor = connection.cursor()

# Define the start and end dates
start_date = '2017-12-01'
end_date = '2021-01-31'

# SQL query to retrieve data from the "Volatility" table between two dates
sql_query = f"SELECT COUNT(PERMNO), PERMNO FROM `Monthly Returns` mr WHERE Date BETWEEN '{start_date}' AND '{end_date}' GROUP BY PERMNO"

# Execute the SQL query
cursor.execute(sql_query)

# Fetch all rows from the result set
observationTable_data = cursor.fetchall()

# Convert fetched data into a pandas DataFrame
columns = [i[0] for i in cursor.description]  # Extract column names from cursor description

#create new df
observation_df = pd.DataFrame(observationTable_data, columns=columns)

#close cursor and db connection
cursor.close()

True

Here, we remove securities that do not have enough observations (in our case, we look for at least 120)

In [6]:
#Filter securities with at least 120 observations in the period
observation_df_filtered = observation_df[pd.to_numeric(observation_df['COUNT(PERMNO)']) >=38]

#Inner join our dataframes to only keep the securities we have data on
universe_df_filtered = pd.merge(universe_df, observation_df_filtered, on='PERMNO', how='inner')

We can now add a few factors to our model, in this case we will use GDP, CPI (inflation data), and the unemployment rate in each month. We then add it to our monthly returns dataframe for our OLS regression

In [7]:
#Introduce our factor data below as a CSV (pandas datareader not currently working for FRED api)
macro_factors = pd.read_csv("UBCTG Factor Model Example - Macro FRED Data.csv")

#These dates are at the beginning of month, so we will operate on our monthly return dataframe to convert our dates to the beginning of the month so that we can append on index
universe_df_filtered["date"] = pd.to_datetime(universe_df_filtered["date"]).dt.to_period('M').dt.to_timestamp()

#Convert macro factors date column to type datetime64 and drop the non-datetime64 column
macro_factors["date"] = pd.to_datetime(macro_factors["DATE"])
macro_factors = macro_factors.drop('DATE', axis=1)

#Inner-join macro factors dataframe with universe dataframe, using 'date' column as index
universe_df_with_external_factors = pd.merge(universe_df_filtered, macro_factors, on= 'date', how='inner')

#Ensure no errors in the returns column (there have been some instances where returns have taken on non-numeric values)
universe_df_with_external_factors["RET"] = pd.to_numeric(universe_df_with_external_factors["RET"], errors="coerce")

In [8]:
#Create a blank row at the end of our dataframe to accomodate shifted data
universe_df_with_external_factors_elongated = pd.concat([universe_df_with_external_factors, pd.DataFrame([[np.nan] * universe_df_with_external_factors.shape[1]], columns=universe_df_with_external_factors.columns)], ignore_index=True)

#Move into factor model
#Shift in -1 direction, rename to shifted return or similar
#Shift our returns data forward by one period. This way, we regress "t" factors to "t+1" returns, and our betas become forecasts
universe_df_with_external_factors_elongated['RET'] = universe_df_with_external_factors_elongated['RET'].shift(-1)

#We want to remove our minimum date so that we have an equal number of periods across securities. Here we find the minimum date and remove rows with that date from our dataframe
dtrmval = np.min(universe_df_with_external_factors_elongated.date)
dtrmvalrangemax = dtrmval + timedelta(days=1)

#Filter out our beginning date
universe_df_with_external_factors_filtered = universe_df_with_external_factors_elongated[(universe_df_with_external_factors_elongated["date"] >= dtrmvalrangemax)]
#colnames = universe_df_with_external_factors_filtered.columns[4:6]
#print(colnames)

#display(universe_df_with_external_factors_filtered.iloc[:,3:])

#Here, we select our required columns in proper order (see function documentation for more info) and get rid of extra columns that we will not use
universe_df_with_external_factors_filtered = universe_df_with_external_factors_filtered[["PERMNO","date","RET", "Annualized Percent Change of GDP from Preceding Period, Seasonally Adjusted","CPI (USACPALTT01CTGYM)","UNRATE"]]
display(universe_df_with_external_factors_filtered)

Unnamed: 0,PERMNO,date,RET,"Annualized Percent Change of GDP from Preceding Period, Seasonally Adjusted",CPI (USACPALTT01CTGYM),UNRATE
1,10026.0,2018-01-01,0.007743,2.590361,2.042188,4.0
2,10026.0,2018-02-01,-0.088191,3.685589,2.175117,4.1
3,10026.0,2018-03-01,-0.029688,1.852684,2.321362,4.0
4,10026.0,2018-04-01,0.019951,1.741385,2.406737,4.0
5,10026.0,2018-05-01,0.006224,1.782396,2.725657,3.8
...,...,...,...,...,...,...
234583,93436.0,2020-09-01,0.741452,10.259221,1.332004,7.8
234584,93436.0,2020-10-01,-0.139087,-3.677010,1.130982,6.8
234585,93436.0,2020-11-01,-0.095499,-4.401027,1.122155,6.7
234586,93436.0,2020-12-01,0.462736,3.579485,1.323012,6.7


In [9]:
#We generate our list of unique tickers, this can be done in multiple different ways (in this case, using a previous df with distinct tickers)
#listofPERMNO = observation_df[pd.to_numeric(observation_df['COUNT(PERMNO)']) >=37]
#uniqueTickerList = listofPERMNO['PERMNO']
#print(uniqueTickerList)
#uniquetickerlist1 = universe_df_with_external_factors_filtered.iloc[:,0].unique()

#universe_df_with_external_factors_filtered.columns
#universe_df_with_external_factors_filtered.iloc[:,0]
#print(universe_df_with_external_factors_filtered.iloc[:,0].unique())


In [9]:
#Defining the factor model that takes a dataframe of the required columns (unique identifier, date/index column, returns columns, and a set of factors)
def olsfactormodel(df_attached, lookbackwindow:int):

    #Assign passed dataframe to new dataframe
    df = df_attached.copy(deep=True)
    
    #We generate our list of unique tickers using the column in the unique identifier position (0, or leftmost column)
    uniqueTickerList = df.iloc[:,0].unique()
    
    #Initialize large df to drop results of regression for each security at each regression date
    containerdf = pd.DataFrame()
                
    #Set our lookback window to 24 periods (months in this case). This means that we will run a regression for each period (after the first 24 months) using the previous 24 months as data
    LookBack_Window=lookbackwindow
    
    #We will now create a new set of columns for the Beta and P-value for each of our factors. We will do this by iterating through each factor and columns for the beta coefficients
    factorlist = list(df.columns[3:])
    print(len(factorlist))
    
    #For each factor, define an empty column to hold the corresponding coefficient
    for factor in factorlist: 
        df["Beta_" + factor] = 0
    
    #Initialize global parameter(s) regardless of factor count
    df["R_squared"] = 0
    df["Constant B0"] = 0

    #For each identifier (ticker), generate a dataframe from the broader dataframe that 
    for ticker in uniqueTickerList:
        
        #For each unique identifier (ticker), we create a dataframe with observations from that particular identifier
        ticker_specific_universe_df = df.loc[df.iloc[:,0] == ticker]
        
        #We then sort our date/index column in position 1 to ensure our date is ascending from the earliest available
        ticker_specific_universe_df = ticker_specific_universe_df.sort_values(by= ticker_specific_universe_df.columns[1])

        #For each lookback window span, train an OLS and collect the results
        for x in range(0, len(ticker_specific_universe_df)-LookBack_Window):
            
            # Define the independent variables (X) and dependent variable (Y). X's are defined by our factor columns and our returns are defined in our third left-most column (position 2)
            X = ticker_specific_universe_df[factorlist][x:x+LookBack_Window]
            Y = ticker_specific_universe_df[ticker_specific_universe_df.columns[2]][x:x+LookBack_Window]
    
            #Add a constant term to the independent variables, check impact
            X = sm.add_constant(X)
        
            #Fit the linear regression model
            model = sm.OLS(Y, X)
            results = model.fit()
            results.params
            
            #Place our regression coefficients into their appropriate columns
            ticker_specific_universe_df.loc[ticker_specific_universe_df.index[x+1+LookBack_Window], "Constant B0"] = results.params[0]
            ticker_specific_universe_df.loc[ticker_specific_universe_df.index[x+1+LookBack_Window], "R_squared"] = results.rsquared
    
            #We do the same dynamically using our list of factors and placing into the appropriate factor column
    
            #Define an index (starting position for inserting regression params)
            i = 3 + len(factorlist) - 1
            
            for index in range(len(factorlist)):
                #increase our indices by one for each factor
                index+=1
                i+=1
                
                ticker_specific_universe_df.loc[ticker_specific_universe_df.index[x+1+LookBack_Window], ticker_specific_universe_df.columns[i]] = results.params[index]
        
         
        #Add ticker dataframe to larger container dataframe
        containerdf = pd.concat([containerdf, ticker_specific_universe_df], ignore_index=True)   
    return containerdf

In [10]:
returnsdf = olsfactormodel(universe_df_with_external_factors_filtered,35)

3


In [11]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.min_rows', 500)
display(returnsdf[(returnsdf != 0).all(1)])

Unnamed: 0,PERMNO,date,RET,"Annualized Percent Change of GDP from Preceding Period, Seasonally Adjusted",CPI (USACPALTT01CTGYM),UNRATE,"Beta_Annualized Percent Change of GDP from Preceding Period, Seasonally Adjusted",Beta_CPI (USACPALTT01CTGYM),Beta_UNRATE,R_squared,Constant B0
36,10026.0,2021-01-01,0.072598,8.270387,1.365916,6.4,0.001592,-0.027823,-1.098803e-02,0.306929,0.106571
73,10028.0,2021-01-01,0.125541,8.270387,1.365916,6.4,0.000635,-0.172604,-2.494255e-02,0.056904,0.517157
110,10032.0,2021-01-01,0.046848,8.270387,1.365916,6.4,0.002033,-0.008256,-2.289406e-04,0.181050,0.020729
147,10044.0,2021-01-01,-0.051522,8.270387,1.365916,6.4,0.001660,-0.044615,-2.511754e-02,0.425997,0.177845
184,10051.0,2021-01-01,-0.030851,8.270387,1.365916,6.4,,,,,
221,10065.0,2021-01-01,0.036571,8.270387,1.365916,6.4,0.001487,0.006392,2.400938e-03,0.247013,-0.016623
258,10066.0,2021-01-01,,8.270387,1.365916,6.4,,,,,
295,10104.0,2021-01-01,0.120755,8.270387,1.365916,6.4,0.000586,-0.003558,2.345004e-03,0.050451,0.000626
332,10107.0,2021-01-01,0.039006,8.270387,1.365916,6.4,0.000941,0.033038,9.319765e-03,0.147206,-0.082286
369,10113.0,2021-01-01,0.054937,8.270387,1.365916,6.4,0.002149,-0.026820,-2.525592e-03,0.363455,0.061271


In [13]:
list(range(0,10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]