# Using SimFin with Linear Regression

In [1]:
# Importing packages

import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

import simfin as sf
from simfin.names import *

pd.set_option('display.max_columns', None)

In [2]:
# Setting the api key and directory for simfin
sf.set_api_key(api_key='free')
sf.set_data_dir('~/SimFin/simfin_data/')

In [3]:
# Data for the US.
market = 'us'

# Trailing twelve months (TTM) Income Statements.
df_income_ttm = sf.load_income(variant='ttm', market=market)

# Quarterly Income Statements.
df_income_qrt = sf.load_income(variant='quarterly', market=market)

# TTM Balance Sheets.
df_balance_ttm = sf.load_balance(variant='ttm', market=market)

# Quarterly Balance Sheets.
df_balance_qrt = sf.load_balance(variant='quarterly', market=market)

# TTM Cash-Flow Statements.
df_cashflow_ttm = sf.load_cashflow(variant='ttm', market=market)

# Quarterly Cash-Flow Statements.
df_cashflow_qrt = sf.load_cashflow(variant='quarterly', market=market)

# Daily Share-Prices.
df_prices = sf.load_shareprices(variant='daily', market=market)

Dataset "us-income-ttm" on disk (7 days old).
- Loading from disk ... Done!
Dataset "us-income-quarterly" on disk (7 days old).
- Loading from disk ... Done!
Dataset "us-balance-ttm" on disk (7 days old).
- Loading from disk ... Done!
Dataset "us-balance-quarterly" on disk (7 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-ttm" on disk (7 days old).
- Loading from disk ... Done!
Dataset "us-cashflow-quarterly" on disk (7 days old).
- Loading from disk ... Done!
Dataset "us-shareprices-daily" on disk (7 days old).
- Loading from disk ... Done!


In [4]:
# Net profit margin
df_npm = df_income_ttm[NET_INCOME] / df_income_ttm[REVENUE]
df_npm

Ticker  Report Date
 AVLR   2018-06-30    -0.275378
        2018-09-30    -0.314095
        2018-12-31    -0.277657
        2019-03-31    -0.235191
        2019-06-30    -0.201018
                         ...   
low     2018-08-31     0.055792
        2018-11-30     0.051871
        2019-02-28     0.032450
        2019-05-31     0.033087
        2019-08-31     0.035212
Length: 55475, dtype: float64

In [5]:
# Function for calculating growth for a single company.
sales_growth = lambda x: x / x.shift(4) - 1

# Split the DataFrame into sub-groups for the tickers,
# then apply the sales-growth function to each group,
# and finally put the results back together.
df_growth = df_income_qrt[REVENUE].groupby(TICKER).apply(sales_growth)

# Show the result.
df_growth

Ticker  Report Date
 AVLR   2018-06-30          NaN
        2018-09-30          NaN
        2018-12-31          NaN
        2019-03-31          NaN
        2019-06-30     0.429249
                         ...   
low     2018-08-31     0.071454
        2018-11-30     0.038462
        2019-02-28     0.009810
        2019-05-31     0.021947
        2019-08-31     0.004979
Name: Revenue, Length: 53846, dtype: float64

In [6]:
# Calculate 1-year sales-growth.
df_growth = sf.rel_change(df=df_income_qrt[REVENUE], freq='q',
                          years=1, future=False)

# Show the result.
df_growth

Ticker  Report Date
 AVLR   2018-06-30          NaN
        2018-09-30          NaN
        2018-12-31          NaN
        2019-03-31          NaN
        2019-06-30     0.429249
                         ...   
low     2018-08-31     0.071454
        2018-11-30     0.038462
        2019-02-28     0.009810
        2019-05-31     0.021947
        2019-08-31     0.004979
Name: Revenue, Length: 53846, dtype: float64

In [7]:
# Data from Income Statements.
df1 = df_income_ttm[NET_INCOME]

# Data from Balance Sheets.
df2 = df_balance_ttm[TOTAL_EQUITY]

# Join into a single DataFrame.
df_join = pd.concat([df1, df2], axis=1)

# Show the result.
df_join

Unnamed: 0_level_0,Unnamed: 1_level_0,Net Income,Total Equity
Ticker,Report Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AVLR,2018-06-30,-65694000,1.421540e+08
AVLR,2018-09-30,-79532000,1.222490e+08
AVLR,2018-12-31,-75550000,1.095530e+08
AVLR,2019-03-31,-69544000,1.689730e+08
AVLR,2019-06-30,-64951000,4.521280e+08
...,...,...,...
low,2018-08-31,3934000000,5.781000e+09
low,2018-11-30,3691000000,5.394000e+09
low,2019-02-28,2314000000,3.644000e+09
low,2019-05-31,2372000000,3.236000e+09


In [8]:
# Function for calculating Return on Equity (ROE) for a single company.
roe = lambda df_grp: df_grp[NET_INCOME] / df_grp[TOTAL_EQUITY].shift(4)

# Split the DataFrame into sub-groups for the tickers,
# then apply the ROE function to each group,
# and finally glue the results back together.
df_roe = df_join.groupby(TICKER, group_keys=False).apply(roe)

# Show the result.
df_roe

Ticker  Report Date
 AVLR   2018-06-30          NaN
        2018-09-30          NaN
        2018-12-31          NaN
        2019-03-31          NaN
        2019-06-30    -0.456906
                         ...   
low     2018-08-31     0.710621
        2018-11-30     0.642807
        2019-02-28     0.394006
        2019-05-31     0.412881
        2019-08-31     0.437295
Length: 55475, dtype: float64

In [9]:
# Data from Income Statements.
df1 = df_income_ttm[[NET_INCOME, REVENUE]]

# Data from Balance Sheets.
df2 = df_balance_ttm[[TOTAL_ASSETS, TOTAL_EQUITY]]

# Combine the data into a single DataFrame.
df_join = pd.concat([df1, df2], axis=1)

In [10]:
def signals(df):
    """
    Calculate financial signals for a stock.
    
    df is a DataFrame with required data from
        Income Statements, Balance Sheets, etc.
        Assumed to be TTM-data.
        
    """
    
    # Create new DataFrame for the signals.
    df_signals = pd.DataFrame(index=df.index)

    # Net Profit Margin.
    df_signals[NET_PROFIT_MARGIN] = df[NET_INCOME] / df[REVENUE]
    
    # Return on Assets.
    df_signals[ROA] = df[NET_INCOME] / df[TOTAL_ASSETS].shift(4)
    
    # Return on Equity.
    df_signals[ROE] = df[NET_INCOME] / df[TOTAL_EQUITY].shift(4)

    return df_signals

In [11]:
df_fin_signals = sf.apply(df=df_join, func=signals)
df_fin_signals

Unnamed: 0_level_0,Unnamed: 1_level_0,Net Profit Margin,Return on Assets,Return on Equity
Ticker,Report Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AVLR,2018-06-30,-0.275378,,
AVLR,2018-09-30,-0.314095,,
AVLR,2018-12-31,-0.277657,,
AVLR,2019-03-31,-0.235191,,
AVLR,2019-06-30,-0.201018,-0.184146,-0.456906
...,...,...,...,...
low,2018-08-31,0.055792,0.107287,0.710621
low,2018-11-30,0.051871,0.100345,0.642807
low,2019-02-28,0.032450,0.065569,0.394006
low,2019-05-31,0.033087,0.062034,0.412881


In [12]:
# Add time-lag of 30 days.
df_fin_signals2 = sf.add_date_offset(df=df_fin_signals,
                                     date_index=REPORT_DATE,
                                     offset=pd.DateOffset(days=30))

# Show the result.
df_fin_signals2

Unnamed: 0_level_0,Unnamed: 1_level_0,Net Profit Margin,Return on Assets,Return on Equity
Ticker,Report Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AVLR,2018-07-30,-0.275378,,
AVLR,2018-10-30,-0.314095,,
AVLR,2019-01-30,-0.277657,,
AVLR,2019-04-30,-0.235191,,
AVLR,2019-07-30,-0.201018,-0.184146,-0.456906
...,...,...,...,...
low,2018-09-30,0.055792,0.107287,0.710621
low,2018-12-30,0.051871,0.100345,0.642807
low,2019-03-30,0.032450,0.065569,0.394006
low,2019-06-30,0.033087,0.062034,0.412881


In [13]:
# Reindex financial signals to the same days as share-price data.
df_fin_signals_daily = sf.reindex(df_src=df_fin_signals,
                                  df_target=df_prices,
                                  method='ffill')

# Show the result.
df_fin_signals_daily.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,Net Profit Margin,Return on Assets,Return on Equity
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AVLR,2019-07-01,-0.201018,-0.184146,-0.456906
AVLR,2019-07-02,-0.201018,-0.184146,-0.456906
AVLR,2019-07-03,-0.201018,-0.184146,-0.456906
AVLR,2019-07-05,-0.201018,-0.184146,-0.456906
AVLR,2019-07-08,-0.201018,-0.184146,-0.456906
...,...,...,...,...
low,2019-09-05,0.035212,0.068131,0.437295
low,2019-09-06,0.035212,0.068131,0.437295
low,2019-09-09,0.035212,0.068131,0.437295
low,2019-09-10,0.035212,0.068131,0.437295


In [14]:
df_fin_signals = sf.fin_signals(df_income_ttm=df_income_ttm,
                                df_balance_ttm=df_balance_ttm,
                                df_cashflow_ttm=df_cashflow_ttm)

df_fin_signals.dropna().head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Debt Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Inventory Turnover,Log Revenue,Net Acquisitions / Total Assets,Net Profit Margin,Quick Ratio,R&D / Gross Profit,R&D / Revenue,Return on Assets,Return on Equity,Return on Research Capital,Share Buyback / FCF
Ticker,Report Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,2010-07-31,0.171492,0.553297,0.622857,1.966061,0.404176,-0.0,0.542205,5.636364,7.318314,9.701999,0.115385,0.08282,1.065135,0.220879,0.119762,0.045824,0.148399,4.527363,0.171492
A,2010-10-31,0.18543,0.561469,0.564356,2.000973,0.380672,-0.0,0.538207,7.447368,7.603352,9.735918,0.114274,0.125643,1.141096,0.208874,0.112417,0.070545,0.211372,4.787582,0.18543
A,2011-01-31,0.365639,0.714818,0.561947,3.27027,0.265912,-0.0,0.536696,9.106667,7.214555,9.759668,0.136251,0.138783,2.483642,0.201555,0.108174,0.099204,0.238994,4.961415,0.365639
A,2011-04-30,0.168927,0.711759,0.590551,3.201005,0.24789,-0.0,0.532001,10.890411,7.216882,9.789299,0.140132,0.144574,2.445352,0.194504,0.103476,0.102902,0.224691,5.141287,0.168927
A,2011-07-31,0.18595,0.738375,0.603113,3.470432,0.247687,-0.0,0.53257,13.927536,7.205128,9.810434,0.012567,0.157048,2.669767,0.187391,0.099799,0.11596,0.241667,5.336434,0.18595


In [15]:
df_fin_signals = sf.fin_signals(df_prices=df_prices,
                                df_income_ttm=df_income_ttm,
                                df_balance_ttm=df_balance_ttm,
                                df_cashflow_ttm=df_cashflow_ttm,
                                fill_method='ffill')

df_fin_signals.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Debt Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Inventory Turnover,Log Revenue,Net Acquisitions / Total Assets,Net Profit Margin,Quick Ratio,R&D / Gross Profit,R&D / Revenue,Return on Assets,Return on Equity,Return on Research Capital,Share Buyback / FCF
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,2010-08-02,0.171492,0.553297,0.622857,1.966061,0.404176,-0.0,0.542205,5.636364,7.318314,9.701999,0.115385,0.08282,1.065135,0.220879,0.119762,0.045824,0.148399,4.527363,0.171492
A,2010-08-03,0.171492,0.553297,0.622857,1.966061,0.404176,-0.0,0.542205,5.636364,7.318314,9.701999,0.115385,0.08282,1.065135,0.220879,0.119762,0.045824,0.148399,4.527363,0.171492
A,2010-08-04,0.171492,0.553297,0.622857,1.966061,0.404176,-0.0,0.542205,5.636364,7.318314,9.701999,0.115385,0.08282,1.065135,0.220879,0.119762,0.045824,0.148399,4.527363,0.171492
A,2010-08-05,0.171492,0.553297,0.622857,1.966061,0.404176,-0.0,0.542205,5.636364,7.318314,9.701999,0.115385,0.08282,1.065135,0.220879,0.119762,0.045824,0.148399,4.527363,0.171492
A,2010-08-06,0.171492,0.553297,0.622857,1.966061,0.404176,-0.0,0.542205,5.636364,7.318314,9.701999,0.115385,0.08282,1.065135,0.220879,0.119762,0.045824,0.148399,4.527363,0.171492


In [16]:
df_fin_signals_2y = sf.fin_signals(df_prices=df_prices,
                                   df_income_ttm=df_income_ttm,
                                   df_balance_ttm=df_balance_ttm,
                                   df_cashflow_ttm=df_cashflow_ttm,
                                   func=sf.avg_ttm_2y,
                                   fill_method='ffill')

df_fin_signals_2y.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,(Dividends + Share Buyback) / FCF,Asset Turnover,CapEx / (Depr + Amor),Current Ratio,Debt Ratio,Dividends / FCF,Gross Profit Margin,Interest Coverage,Inventory Turnover,Log Revenue,Net Acquisitions / Total Assets,Net Profit Margin,Quick Ratio,R&D / Gross Profit,R&D / Revenue,Return on Assets,Return on Equity,Return on Research Capital,Share Buyback / FCF
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,2011-08-01,0.178721,0.645836,0.612985,2.718246,0.325931,-0.0,0.537387,9.78195,7.261721,9.756217,0.063976,0.119934,1.867451,0.204135,0.10978,0.080892,0.195033,4.931899,0.178721
A,2011-08-02,0.178721,0.645836,0.612985,2.718246,0.325931,-0.0,0.537387,9.78195,7.261721,9.756217,0.063976,0.119934,1.867451,0.204135,0.10978,0.080892,0.195033,4.931899,0.178721
A,2011-08-03,0.178721,0.645836,0.612985,2.718246,0.325931,-0.0,0.537387,9.78195,7.261721,9.756217,0.063976,0.119934,1.867451,0.204135,0.10978,0.080892,0.195033,4.931899,0.178721
A,2011-08-04,0.178721,0.645836,0.612985,2.718246,0.325931,-0.0,0.537387,9.78195,7.261721,9.756217,0.063976,0.119934,1.867451,0.204135,0.10978,0.080892,0.195033,4.931899,0.178721
A,2011-08-05,0.178721,0.645836,0.612985,2.718246,0.325931,-0.0,0.537387,9.78195,7.261721,9.756217,0.063976,0.119934,1.867451,0.204135,0.10978,0.080892,0.195033,4.931899,0.178721


In [17]:
# Select the data-columns to calculate growth rates for.
df = df_income_ttm[[REVENUE, NET_INCOME]]

# Dict mapping to the new column-names.
new_names = {REVENUE: SALES_GROWTH,
             NET_INCOME: EARNINGS_GROWTH}

# Calculate the growth-rates.
df_growth = sf.rel_change(df=df, freq='q', quarters=4,
                          future=False, annualized=False,
                          new_names=new_names)

# Show the result.
df_growth.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales Growth,Earnings Growth
Ticker,Report Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AVLR,2018-06-30,,
AVLR,2018-09-30,,
AVLR,2018-12-31,,
AVLR,2019-03-31,,
AVLR,2019-06-30,0.354428,-0.01131


In [18]:
# Reindex the growth-rates to the same dates as the share prices.
df_growth_daily = sf.reindex(df_src=df_growth,
                             df_target=df_prices, method='ffill')

# Show the result.
df_growth_daily.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales Growth,Earnings Growth
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AVLR,2019-07-01,0.354428,-0.01131
AVLR,2019-07-02,0.354428,-0.01131
AVLR,2019-07-03,0.354428,-0.01131
AVLR,2019-07-05,0.354428,-0.01131
AVLR,2019-07-08,0.354428,-0.01131


In [19]:
df_growth_signals = sf.growth_signals(df_income_ttm=df_income_ttm,
                      df_income_qrt=df_income_qrt,
                      df_balance_ttm=df_balance_ttm,
                      df_balance_qrt=df_balance_qrt,
                      df_cashflow_ttm=df_cashflow_ttm,
                      df_cashflow_qrt=df_cashflow_qrt)

df_growth_signals.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Assets Growth,Assets Growth QOQ,Assets Growth YOY,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,FCF Growth QOQ,FCF Growth YOY,Sales Growth,Sales Growth QOQ,Sales Growth YOY
Ticker,Report Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AVLR,2019-06-30,0.963194,0.763363,0.963194,-0.01131,0.425187,-0.258528,-0.813296,-1.57152,-3.930061,0.354428,0.074485,0.429249
A,2011-07-31,-0.038132,0.012025,-0.038132,1.434053,0.65,0.609756,1.155902,-0.327217,2.492063,0.283615,0.008348,0.221821
A,2011-10-31,-0.065903,0.034731,-0.065903,0.479532,-0.124242,-0.010274,0.804636,1.095455,0.359882,0.215099,0.021881,0.096447
A,2012-01-31,0.131154,0.004637,0.131154,0.314536,-0.204152,0.19171,0.632893,-0.774403,0.268293,0.170609,-0.053819,0.076366
A,2012-04-30,0.088334,0.034509,0.088334,0.240449,0.108696,0.275,0.357583,2.038462,-0.033639,0.102502,0.059939,0.033393


In [20]:
df_growth_signals = sf.growth_signals(df_prices=df_prices,
                      df_income_ttm=df_income_ttm,
                      df_income_qrt=df_income_qrt,
                      df_balance_ttm=df_balance_ttm,
                      df_balance_qrt=df_balance_qrt,
                      df_cashflow_ttm=df_cashflow_ttm,
                      df_cashflow_qrt=df_cashflow_qrt,
                      fill_method='ffill')

df_growth_signals.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Assets Growth,Assets Growth QOQ,Assets Growth YOY,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,FCF Growth QOQ,FCF Growth YOY,Sales Growth,Sales Growth QOQ,Sales Growth YOY
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AVLR,2019-07-01,0.963194,0.763363,0.963194,-0.01131,0.425187,-0.258528,-0.813296,-1.57152,-3.930061,0.354428,0.074485,0.429249
AVLR,2019-07-02,0.963194,0.763363,0.963194,-0.01131,0.425187,-0.258528,-0.813296,-1.57152,-3.930061,0.354428,0.074485,0.429249
AVLR,2019-07-03,0.963194,0.763363,0.963194,-0.01131,0.425187,-0.258528,-0.813296,-1.57152,-3.930061,0.354428,0.074485,0.429249
AVLR,2019-07-05,0.963194,0.763363,0.963194,-0.01131,0.425187,-0.258528,-0.813296,-1.57152,-3.930061,0.354428,0.074485,0.429249
AVLR,2019-07-08,0.963194,0.763363,0.963194,-0.01131,0.425187,-0.258528,-0.813296,-1.57152,-3.930061,0.354428,0.074485,0.429249


In [21]:
df_growth_signals_2y = sf.growth_signals(df_prices=df_prices,
                      df_income_ttm=df_income_ttm,
                      df_income_qrt=df_income_qrt,
                      df_balance_ttm=df_balance_ttm,
                      df_balance_qrt=df_balance_qrt,
                      df_cashflow_ttm=df_cashflow_ttm,
                      df_cashflow_qrt=df_cashflow_qrt,
                      fill_method='ffill',
                      func=sf.avg_ttm_2y)

df_growth_signals_2y.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Assets Growth,Assets Growth QOQ,Assets Growth YOY,Earnings Growth,Earnings Growth QOQ,Earnings Growth YOY,FCF Growth,FCF Growth QOQ,FCF Growth YOY,Sales Growth,Sales Growth QOQ,Sales Growth YOY
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A,2012-07-31,0.038286,0.024285,0.038286,0.718012,0.301471,0.17306,0.63167,-0.361393,1.180123,0.169349,0.001289,0.120372
A,2012-08-01,0.038286,0.024285,0.038286,0.718012,0.301471,0.17306,0.63167,-0.361393,1.180123,0.169349,0.001289,0.120372
A,2012-08-02,0.038286,0.024285,0.038286,0.718012,0.301471,0.17306,0.63167,-0.361393,1.180123,0.169349,0.001289,0.120372
A,2012-08-03,0.038286,0.024285,0.038286,0.718012,0.301471,0.17306,0.63167,-0.361393,1.180123,0.169349,0.001289,0.120372
A,2012-08-06,0.038286,0.024285,0.038286,0.718012,0.301471,0.17306,0.63167,-0.361393,1.180123,0.169349,0.001289,0.120372


In [22]:
df_sales_per_share = df_income_ttm[REVENUE].div(df_income_ttm[SHARES_DILUTED], axis=0)
df_sales_per_share

Ticker  Report Date
 AVLR   2018-06-30     29.640181
        2018-09-30     10.881626
        2018-12-31      7.032410
        2019-03-31      5.451053
        2019-06-30      4.714127
                         ...    
low     2018-08-31     85.469091
        2018-11-30     86.909313
        2019-02-28     87.818966
        2019-05-31     89.083566
        2019-08-31     90.136849
Length: 55475, dtype: float64

In [23]:
df_sps_daily = sf.reindex(df_src=df_sales_per_share,
                          df_target=df_prices, method='ffill')

df_sps_daily.dropna()

Ticker  Date      
 AVLR   2018-07-02    29.640181
        2018-07-03    29.640181
        2018-07-05    29.640181
        2018-07-06    29.640181
        2018-07-09    29.640181
                        ...    
low     2019-09-05    90.136849
        2019-09-06    90.136849
        2019-09-09    90.136849
        2019-09-10    90.136849
        2019-09-11    90.136849
Length: 3263606, dtype: float64

In [24]:
# Calculate P/Sales ratio.
df_psales = df_prices[CLOSE].div(df_sps_daily, axis=0)

# Rename the data-column.
df_psales.rename(PSALES, inplace=True)

# Show the result.
df_psales.dropna()

Ticker  Date      
 AVLR   2018-07-02    1.643715
        2018-07-03    1.673067
        2018-07-05    1.656535
        2018-07-06    1.598843
        2018-07-09    1.602554
                        ...   
low     2019-09-05    1.267073
        2019-09-06    1.272620
        2019-09-09    1.273508
        2019-09-10    1.278944
        2019-09-11    1.259086
Name: P/Sales, Length: 3259392, dtype: float64

In [25]:
df_earnings_per_share = df_income_ttm[NET_INCOME_COMMON].div(df_income_ttm[SHARES_DILUTED], axis=0)
df_earnings_per_share

Ticker  Report Date
 AVLR   2018-06-30    -8.162266
        2018-09-30    -3.417865
        2018-12-31    -1.952600
        2019-03-31    -1.282041
        2019-06-30    -0.947623
                         ...   
low     2018-08-31     4.768485
        2018-11-30     4.508092
        2019-02-28     2.849754
        2019-05-31     2.947499
        2019-08-31     3.173886
Length: 55475, dtype: float64

In [26]:
df_eps_daily = sf.reindex(df_src=df_earnings_per_share,
                          df_target=df_prices, method='ffill')

df_eps_daily.dropna()

Ticker  Date      
 AVLR   2018-07-02   -8.162266
        2018-07-03   -8.162266
        2018-07-05   -8.162266
        2018-07-06   -8.162266
        2018-07-09   -8.162266
                        ...   
low     2019-09-05    3.173886
        2019-09-06    3.173886
        2019-09-09    3.173886
        2019-09-10    3.173886
        2019-09-11    3.173886
Length: 3313388, dtype: float64

In [27]:
# Calculate the P/E ratio.
df_pe = df_prices[CLOSE] / df_eps_daily

# Rename the data-column.
df_pe.rename(PE, inplace=True)

# Show the result.
df_pe.dropna()

Ticker  Date      
 AVLR   2018-07-02    -5.968930
        2018-07-03    -6.075519
        2018-07-05    -6.015486
        2018-07-06    -5.805986
        2018-07-09    -5.819462
                        ...    
low     2019-09-05    35.984282
        2019-09-06    36.141818
        2019-09-09    36.167023
        2019-09-10    36.321408
        2019-09-11    35.757431
Name: P/E, Length: 3309141, dtype: float64

In [28]:
# Calculate Free Cash Flow.
df_fcf_ttm = df_cashflow_ttm[NET_CASH_OPS] + df_cashflow_ttm[CAPEX]

# Rename the data-column.
df_fcf_ttm.rename(FCF, inplace=True)

Ticker  Report Date
 AVLR   2018-06-30    -2.394700e+07
        2018-09-30    -2.859300e+07
        2018-12-31    -1.854500e+07
        2019-03-31    -1.408000e+07
        2019-06-30    -4.471000e+06
                           ...     
low     2018-08-31     4.653000e+09
        2018-11-30     5.383000e+09
        2019-02-28     5.095000e+09
        2019-05-31     3.841000e+09
        2019-08-31     2.920000e+09
Name: Free Cash Flow, Length: 55475, dtype: float64

In [29]:
df_fcf_per_share = df_fcf_ttm.div(df_income_ttm[SHARES_DILUTED])
df_fcf_per_share

Ticker  Report Date
 AVLR   2018-06-30    -2.975337
        2018-09-30    -1.228776
        2018-12-31    -0.479298
        2019-03-31    -0.259564
        2019-06-30    -0.065231
                         ...   
low     2018-08-31     5.640000
        2018-11-30     6.574656
        2019-02-28     6.274631
        2019-05-31     4.772911
        2019-08-31     3.666039
Length: 55475, dtype: float64

In [30]:
df_fcf_daily = sf.reindex(df_src=df_fcf_per_share,
                          df_target=df_prices, method='ffill')

df_fcf_daily.dropna()

Ticker  Date      
 AVLR   2018-07-02   -2.975337
        2018-07-03   -2.975337
        2018-07-05   -2.975337
        2018-07-06   -2.975337
        2018-07-09   -2.975337
                        ...   
low     2019-09-05    3.666039
        2019-09-06    3.666039
        2019-09-09    3.666039
        2019-09-10    3.666039
        2019-09-11    3.666039
Length: 3245949, dtype: float64

In [31]:
# Calculate the P/FCF ratio.
df_pfcf = df_prices[CLOSE] / df_fcf_daily

# Rename the data-column.
df_pfcf.rename(PFCF, inplace=True)

# Show the result.
df_pfcf.dropna()

Ticker  Date      
 AVLR   2018-07-02   -16.374616
        2018-07-03   -16.667019
        2018-07-05   -16.502332
        2018-07-06   -15.927607
        2018-07-09   -15.964578
                        ...    
low     2019-09-05    31.153515
        2019-09-06    31.289902
        2019-09-09    31.311724
        2019-09-10    31.445384
        2019-09-11    30.957118
Name: P/FCF, Length: 3243487, dtype: float64

In [32]:
avg_ttm_2y = lambda x: 0.5 * (x + x.shift(4))

In [33]:
df_earnings_2y = sf.apply(df=df_income_ttm[NET_INCOME_COMMON],
                          func=avg_ttm_2y)

df_earnings_2y

Ticker  Report Date
 AVLR   2018-06-30              NaN
        2018-09-30              NaN
        2018-12-31              NaN
        2019-03-31              NaN
        2019-06-30    -6.532250e+07
                           ...     
low     2018-08-31     3.498500e+09
        2018-11-30     3.623500e+09
        2019-02-28     2.880500e+09
        2019-05-31     3.102500e+09
        2019-08-31     3.231000e+09
Name: Net Income (Common), Length: 55475, dtype: float64

In [34]:
df_eps_2y = df_earnings_2y.div(df_income_ttm[SHARES_DILUTED])
df_eps_2y.dropna()

Ticker  Report Date
 AVLR   2019-06-30    -0.953043
A       2011-07-31     2.012649
        2011-10-31     2.388732
        2012-01-31     2.606916
        2012-04-30     2.816384
                         ...   
low     2018-08-31     4.240606
        2018-11-30     4.425649
        2019-02-28     3.547414
        2019-05-31     3.855235
        2019-08-31     4.056497
Length: 47083, dtype: float64

In [35]:
df_eps_2y_daily = sf.reindex(df_src=df_eps_2y,
                             df_target=df_prices, method='ffill')

df_eps_2y_daily.dropna()

Ticker  Date      
 AVLR   2019-07-01   -0.953043
        2019-07-02   -0.953043
        2019-07-03   -0.953043
        2019-07-05   -0.953043
        2019-07-08   -0.953043
                        ...   
low     2019-09-05    4.056497
        2019-09-06    4.056497
        2019-09-09    4.056497
        2019-09-10    4.056497
        2019-09-11    4.056497
Length: 2846333, dtype: float64

In [36]:
# Calculate the P/E ratios using 2-year earnings average.
df_pe_2y = df_prices[CLOSE].div(df_eps_2y_daily, axis=0)

# Rename the data-column.
PE_2Y = 'P/E (2Y Avg. Earnings)'
df_pe_2y.rename(PE_2Y, inplace=True)

# Show the result.
df_pe_2y.dropna()

Ticker  Date      
 AVLR   2019-07-01   -79.293404
        2019-07-02   -81.969045
        2019-07-03   -83.375068
        2019-07-05   -83.479995
        2019-07-08   -83.836747
                        ...    
low     2019-09-05    28.154833
        2019-09-06    28.278092
        2019-09-09    28.297813
        2019-09-10    28.418607
        2019-09-11    27.977340
Name: P/E (2Y Avg. Earnings), Length: 2843339, dtype: float64

In [37]:
# Calculate valuation signals for all stocks in the DataFrames.

def val_signals(df_prices, df_income_ttm, df_cashflow_ttm,
                shares_index=SHARES_DILUTED):


    # Create a DataFrame with the financial data we need.
    # Start by copying data from the Income Statements.
    df = df_income_ttm[[REVENUE, NET_INCOME_COMMON]].copy()

    # Calculate FCF and add it as a new column to the DataFrame.
    df[FCF] = df_cashflow_ttm[NET_CASH_OPS] + df_cashflow_ttm[CAPEX]

    # Calculate Per-Share numbers.
    df_per_share = df.div(df_income_ttm[shares_index], axis=0)
    
    # Reindex the per-share financial data to daily data-points.
    df_daily = sf.reindex(df_src=df_per_share,
                          df_target=df_prices,
                          method='ffill')
    
    # Create new DataFrame for the signals.
    # Setting the index improves performance.
    df_signals = pd.DataFrame(index=df_prices.index)
    
    # Use the closing share-price for all these signals.
    df_price = df_prices[CLOSE]
    
    # P/Sales ratio.
    df_signals[PSALES] = df_price / df_daily[REVENUE]
    
    # P/E ratio.
    df_signals[PE] = df_price / df_daily[NET_INCOME_COMMON]

    # P/FCF ratio.
    df_signals[PFCF] = df_price / df_daily[FCF]

    return df_signals

In [38]:
df_val_signals = val_signals(df_prices=df_prices,
                             df_income_ttm=df_income_ttm,
                             df_cashflow_ttm=df_cashflow_ttm,
                             shares_index=SHARES_DILUTED)

df_val_signals.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,P/Sales,P/E,P/FCF
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AVLR,2018-07-02,1.643715,-5.968930,-16.374616
AVLR,2018-07-03,1.673067,-6.075519,-16.667019
AVLR,2018-07-05,1.656535,-6.015486,-16.502332
AVLR,2018-07-06,1.598843,-5.805986,-15.927607
AVLR,2018-07-09,1.602554,-5.819462,-15.964578
...,...,...,...,...
low,2019-09-05,1.267073,35.984282,31.153515
low,2019-09-06,1.272620,36.141818,31.289902
low,2019-09-09,1.273508,36.167023,31.311724
low,2019-09-10,1.278944,36.321408,31.445384


In [39]:
df_val_signals = sf.val_signals(df_prices=df_prices,
                                df_income_ttm=df_income_ttm,
                                df_balance_ttm=df_balance_ttm,
                                df_cashflow_ttm=df_cashflow_ttm)

In [40]:
df_val_signals.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,2012-04-30,0.002344,0.073937,0.073736,14931720000.0,3.160152,3.832577,13.525109,13.561962,11.294796,41.105905,2.200047
A,2012-05-01,0.002301,0.072577,0.07238,15211380000.0,3.21934,3.904358,13.778424,13.815967,11.506339,41.875788,2.241252
A,2012-05-02,0.002297,0.072459,0.072262,15236160000.0,3.224584,3.910719,13.80087,13.838474,11.525083,41.944006,2.244903
A,2012-05-03,0.002342,0.073884,0.073683,14942340000.0,3.1624,3.835303,13.534728,13.571608,11.302829,41.135141,2.201612
A,2012-05-04,0.002411,0.076046,0.075839,14517540000.0,3.072495,3.726268,13.149946,13.185777,10.981498,39.965699,2.139022


In [41]:
df_val_signals_3y = sf.val_signals(df_prices=df_prices,
                                   df_income_ttm=df_income_ttm,
                                   df_balance_ttm=df_balance_ttm,
                                   df_cashflow_ttm=df_cashflow_ttm,
                                   func=sf.avg_ttm_3y)

In [42]:
df_val_signals.describe()

Unnamed: 0,Dividend Yield,Earnings Yield,FCF Yield,Market-Cap,P/Book,P/Cash,P/E,P/FCF,P/NCAV,P/NetNet,P/Sales
count,1640178.0,3309269.0,3309269.0,3309269.0,3309269.0,3303671.0,3309269.0,3309269.0,3309269.0,3309269.0,3259520.0
mean,0.02950193,-0.3264031,0.5014761,13125810000.0,inf,inf,inf,inf,inf,243.2624,inf
std,0.07865838,61.57938,124.3441,44369390000.0,,,,,,70420.41,
min,-0.3971592,-16406.25,-111.9372,15360.0,-83705.29,4.173913e-05,-574989.0,-628337.0,-1396542.0,-1231029.0,-1632.841
25%,0.01072337,-0.004245508,-0.001811581,459287400.0,1.412255,5.493496,-0.7693451,-0.7122073,-4.170238,-4.542388,0.8084129
50%,0.01971994,0.03572722,0.03835183,2464902000.0,2.550293,12.20992,16.73799,14.23881,-0.9664306,-1.549746,1.738317
75%,0.03224883,0.0581498,0.06899359,9263450000.0,4.679294,32.02486,27.0376,25.57536,5.024592,1.948333,3.691995
max,9.346682,44.70199,33138.02,1296584000000.0,inf,inf,inf,inf,inf,24711050.0,inf


In [43]:
# Combine the DataFrames.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

In [44]:
# Remove all rows with only NaN values.
df = df_signals.dropna().reset_index(drop=True)

# For each column, show the fraction of the rows that are NaN.
(df.isnull().sum() / len(df)).sort_values(ascending=False)

P/Sales                              0.0
Net Acquisitions / Total Assets      0.0
Share Buyback / FCF                  0.0
Return on Research Capital           0.0
Return on Equity                     0.0
Return on Assets                     0.0
R&D / Revenue                        0.0
R&D / Gross Profit                   0.0
Quick Ratio                          0.0
Net Profit Margin                    0.0
Log Revenue                          0.0
P/NetNet                             0.0
Inventory Turnover                   0.0
Interest Coverage                    0.0
Gross Profit Margin                  0.0
Dividends / FCF                      0.0
Debt Ratio                           0.0
Current Ratio                        0.0
CapEx / (Depr + Amor)                0.0
Asset Turnover                       0.0
Assets Growth                        0.0
Assets Growth QOQ                    0.0
Assets Growth YOY                    0.0
Earnings Growth                      0.0
P/NCAV          

In [45]:
# List of the columns before removing any.
columns_before = df_signals.columns

# Threshold for the number of rows that must be NaN for each column.
thresh = 0.75 * len(df_signals.dropna(how='all'))

# Remove all columns which don't have sufficient data.
df_signals = df_signals.dropna(axis='columns', thresh=thresh)

# List of the columns after the removal.
columns_after = df_signals.columns

# Show the columns that were removed.
columns_before.difference(columns_after)

Index(['Debt Ratio', 'Dividend Yield', 'Inventory Turnover',
       'Net Acquisitions / Total Assets', 'R&D / Gross Profit',
       'R&D / Revenue', 'Return on Research Capital'],
      dtype='object')

In [46]:
Tickers = []
num = 0

for a_tuple in list(df_prices.index.values):
    num+=1
    if num%50000 == 0:
        print('{}% done'.format(round(100*num/5472812,2)))
    if a_tuple[0] in Tickers:
        pass
    else:
        Tickers.append(a_tuple[0])
print(Tickers)

0.91% done
1.83% done
2.74% done
3.65% done
4.57% done
5.48% done
6.4% done
7.31% done
8.22% done
9.14% done
10.05% done
10.96% done
11.88% done
12.79% done
13.7% done
14.62% done
15.53% done
16.44% done
17.36% done
18.27% done
19.19% done
20.1% done
21.01% done
21.93% done
22.84% done
23.75% done
24.67% done
25.58% done
26.49% done
27.41% done
28.32% done
29.24% done
30.15% done
31.06% done
31.98% done
32.89% done
33.8% done
34.72% done
35.63% done
36.54% done
37.46% done
38.37% done
39.29% done
40.2% done
41.11% done
42.03% done
42.94% done
43.85% done
44.77% done
45.68% done
46.59% done
47.51% done
48.42% done
49.33% done
50.25% done
51.16% done
52.08% done
52.99% done
53.9% done
54.82% done
55.73% done
56.64% done
57.56% done
58.47% done
59.38% done
60.3% done
61.21% done
62.13% done
63.04% done
63.95% done
64.87% done
65.78% done
66.69% done
67.61% done
68.52% done
69.43% done
70.35% done
71.26% done
72.17% done
73.09% done
74.0% done
74.92% done
75.83% done
76.74% done
77.66% don

In [47]:
# Daily Share-Prices.
df_prices = sf.load_shareprices(variant='daily', market=market)
df_prices = df_prices.loc[Tickers, [CLOSE, ADJ_CLOSE]].copy()

Dataset "us-shareprices-daily" on disk (7 days old).
- Loading from disk ... Done!


In [48]:
df_prices

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Adj. Close
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AVLR,2018-06-15,44.94,44.53
AVLR,2018-06-18,44.11,43.71
AVLR,2018-06-19,45.99,45.57
AVLR,2018-06-20,45.01,44.60
AVLR,2018-06-21,51.04,50.57
...,...,...,...
low,2019-09-05,114.21,112.05
low,2019-09-06,114.71,112.54
low,2019-09-09,114.79,112.62
low,2019-09-10,115.28,113.10


In [49]:
sf.rel_change(df=df_prices, freq='bdays',
              weeks=1, months=2, years=3,
              future=False, annualized=True).dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Adj. Close
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2010-03-11,-0.006635,-0.006655
A,2010-03-12,-0.009119,-0.009007
A,2010-03-15,-0.007237,-0.007259
A,2010-03-16,-0.005109,-0.005031
A,2010-03-17,-0.005850,-0.005867
...,...,...,...
low,2019-09-05,0.126980,0.149132
low,2019-09-06,0.129754,0.151964
low,2019-09-09,0.123647,0.145742
low,2019-09-10,0.123239,0.145306


In [50]:
TOTAL_RETURN_1_3Y = 'Mean-Log Total Return 1-3 Years'
SHARE_PRICE_1_3Y = 'Mean-Log Share Price 1-3 Years'

In [51]:
# Dict mapping old to new column-names.
new_names_1_3y = {SHARE_PRICE: SHARE_PRICE_1_3Y,
                  TOTAL_RETURN: TOTAL_RETURN_1_3Y}

In [52]:
# Calculate the future mean-log annualized 1-3 year changes.
df_mean_chg = sf.mean_log_change(df=df_prices, freq='bdays',
                                 future=True, annualized=True,
                                 min_years=1, max_years=3,
                                 new_names=new_names_1_3y)

In [53]:
offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.)
# every 30 days.
refresh_days = 30

# Refresh the dataset with shareprices every 10 days.
refresh_days_shareprices = 10

hub = sf.StockHub(market=market, offset=offset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

# Calculate the mean log-returns for all 1-3 year periods.
df_returns_1_3y = hub.mean_log_returns(name=TOTAL_RETURN_1_3Y,
                         future=True, annualized=True,
                         min_years=1, max_years=3)

Dataset "us-shareprices-daily" on disk (7 days old).
- Loading from disk ... Done!
Cache-file 'mean_log_change-1566ca12.pickle' on disk (6 days old).
- Loading from disk ... Done!


In [54]:
# Combine the signals and stock-returns.
# We are only using the rows which are NetNet discounts.
dfs = [df_signals, df_returns_1_3y]
df_sig_rets = pd.concat(dfs, axis=1)

In [55]:
# Calculate the correlation between all signals and stock-returns.
df_corr = df_sig_rets.corr()

# Show how the signals are correlated with the stock-returns.
# Sorted to show the strongest absolute correlations first.
df_corr_returns = df_corr[TOTAL_RETURN_1_3Y].abs().sort_values(ascending=False)
df_corr_returns

Mean-Log Total Return 1-3 Years      1.000000
Log Revenue                          0.117256
Quick Ratio                          0.078246
Current Ratio                        0.065539
Net Profit Margin                    0.060140
P/Sales                              0.039368
Interest Coverage                    0.032970
Market-Cap                           0.025175
Assets Growth                        0.010158
P/E                                  0.008125
Assets Growth YOY                    0.007493
Sales Growth                         0.006841
P/Cash                               0.005917
FCF Growth QOQ                       0.005269
Sales Growth QOQ                     0.005253
Earnings Growth QOQ                  0.005006
CapEx / (Depr + Amor)                0.004117
Earnings Growth YOY                  0.003748
Sales Growth YOY                     0.003741
Asset Turnover                       0.003135
Share Buyback / FCF                  0.003125
Return on Equity                  

In [56]:
# Only show the large (absolute) correlation values.
df_corr2 = df_corr[df_corr.abs() > 0.7]

# Round correlation values to 2 digits.
df_corr2 = df_corr2.round(2)

# Transform the table to give a better overview.
df_corr2 = df_corr2.stack()

# Remove all values that are 1.0
df_corr2 = df_corr2[df_corr2 != 1.0]

# Show the result. Use a DataFrame for pretty printing.
pd.DataFrame(df_corr2, columns=['Correlation'])

Unnamed: 0,Unnamed: 1,Correlation
(Dividends + Share Buyback) / FCF,Share Buyback / FCF,0.95
Asset Turnover,Return on Equity,0.98
Asset Turnover,FCF Growth YOY,0.94
Current Ratio,Quick Ratio,0.81
Quick Ratio,Current Ratio,0.81
Return on Assets,Return on Equity,0.97
Return on Assets,Earnings Growth YOY,0.99
Return on Assets,FCF Growth YOY,0.94
Return on Equity,Asset Turnover,0.98
Return on Equity,Return on Assets,0.97


In [57]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

def regression_score(df):
    """
    Perform multiple linear-regression on the given data.
    
    df is a DataFrame with signals and returns.
        
    """
    
    # Remove rows where all values are missing.
    df = df.dropna(how='any')
    
    df = df.fillna(0)

    # DataFrame which only contains the signals.
    X = pd.DataFrame(df.drop(columns=[TOTAL_RETURN_1_3Y]))
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
    
    # DataFrame which only contains the stock-returns.
    y = df[TOTAL_RETURN_1_3Y]
    
    # Standardize the signals so they have mean 0 and std 1.
    scale = StandardScaler()
    X = scale.fit_transform(X)

    # Perform the regression on this data.
    model = LinearRegression()
    model.fit(X,y)
    
    return model.score(X,y)

In [58]:
# Only use some of the signals.
columns = [TOTAL_RETURN_1_3Y, GROSS_PROFIT_MARGIN, SALES_GROWTH,
           CURRENT_RATIO, LOG_REVENUE, P_NETNET, P_CASH,
           MARKET_CAP, SALES_GROWTH_YOY, ASSETS_GROWTH_QOQ,
           NET_PROFIT_MARGIN, INTEREST_COVERAGE]
df = df_sig_rets[columns]

In [59]:
# Perform the Linear Regression on the signals and stock-returns.
regression_score(df)

0.020253866965262768

Not a very good score, although it is positive at least ;)

Maybe it isn't so simple...

In [60]:
### I used the SimFin documentation as a guide for this notebook