In [None]:
# Import libraries
import pandas as pd
import numpy as np

import shap

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_absolute_error

from random import choice
from tensorflow import keras

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pick a random seed in [1, 999] and print it: the seed is drawn at random on
# every run, so showing it is the only way to reproduce a given run later
# (by hard-coding the printed value here).
SEED = choice(range(1, 1000, 1))
print(f"Random seed: {SEED}")

In [None]:
# Read the raw CSV into a DataFrame. "[SOURCE]" is a placeholder that must be
# replaced with the location of the dataset before running this cell.
source = "[SOURCE]"
df = pd.read_csv(filepath_or_buffer=source)

# Preview the first rows of the raw data
df.head()

In [None]:
# Build the modelling frame from the raw data in a single chain:
#   1. positional slice - skip the first and last rows and the first two
#      columns (stock labels and dates);
#   2. drop every *column* that still contains at least one NaN;
#   3. replace +/- infinity with 0.
dataset = (
    df.iloc[1:-1, 2:]
    .dropna(axis=1)
    .replace([np.inf, -np.inf], 0)
)

# Preview the cleaned data
dataset.head()

In [None]:
# Value mapping function for predicting the direction of price movements
def mapDirection(value):
    """Map a numeric value to a binary direction label.

    Returns 1 when ``value`` is greater than or equal to zero and 0 otherwise
    (including when the comparison is false because ``value`` is NaN).
    """
    if value >= 0:
        return 1
    return 0

In [None]:
# Separate the prediction target ("Return") from the candidate input factors
X = dataset.drop(columns=["Return"])
y = dataset["Return"]

# Normalise the inputs: fit a MinMaxScaler on every row of X, then rebuild a
# DataFrame around the scaled values so the column names are kept.
# NOTE(review): the scaler is fitted before the train/test split, so the rows
# that later end up in the test set also influence the scaling - confirm this
# is acceptable for the experiment.
# The rebuilt X gets a fresh default index while y keeps the index of
# `dataset`; the two stay aligned by position only.
scaler = MinMaxScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [None]:
# Create a random training and test split (9:1): 10% of the rows are held out
# for testing. `random_state=SEED` ties the shuffle to the current value of
# the SEED variable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

# Output summary statistics (mean and standard deviation) of the held-out
# target values, as a reference scale for the error figures reported later
print(f"Test mean: {np.mean(y_test)}")
print(f"Test stdv: {np.std(y_test)}")

In [None]:
# Conduct standard linear regression on the dataset, using every input factor
model = LinearRegression()
model.fit(X_train, y_train)

# In-sample error of the baseline (predictions on the rows it was fitted on).
# mean_absolute_error expects (y_true, y_pred); it already returns a scalar,
# so no further averaging is needed.
preds = model.predict(X_train)
mae = mean_absolute_error(y_train, preds)

print(f"MAE of standard LR: {mae}")

# Out-of-sample error on the held-out rows. The models trained in the trials
# are all scored on X_test, so this is the figure that is comparable to them;
# the in-sample number above flatters a model fitted on this many factors.
print(f"MAE of standard LR (test): {mean_absolute_error(y_test, model.predict(X_test))}")

In [None]:
def normaliseVector(vector):
    """Min-max scale a collection of numbers into the range [0, 1].

    Parameters:
        vector: any iterable of numbers (list, tuple, generator, 1-D array).

    Returns:
        A new list with ``(x - min) / (max - min)`` for every element, in the
        original order. An empty input yields an empty list. When all elements
        are equal the range is zero, so a list of ``0.0`` is returned instead
        of dividing by zero.
    """
    # Materialise once so one-shot iterables (generators) can be traversed
    # several times below
    values = list(vector)

    # Nothing to scale
    if not values:
        return []

    # Find the minimum value and the spread of the vector
    min_val = min(values)
    span = max(values) - min_val

    # Constant vector: every element sits at the bottom of a zero-width range
    if span == 0:
        return [0.0 for _ in values]

    # Normalise the vector using list comprehension
    return [(x - min_val) / span for x in values]

In [None]:
# Type 1 = random forest, type 2 = GBM, type 3 = DNN
model_type = 1

# Number of features to use in MFMs
num_features = 10

# Results storage
ml_errors = []
mfm_errors = []

ml_accuracies = []
mfm_accuracies = []

distances = []
models = []

# Run 10 trials. Each trial trains one ML model, ranks the input factors by
# their SHAP values on the test set, fits a linear multi-factor model (MFM) on
# the top `num_features` factors, and records both test errors plus the
# distance between the two models' predictions.
for i in range(10):

    # Pick a random seed for this trial's model. A dedicated name is used so
    # that the global SEED, which fixes the train/test split, is not
    # overwritten as a side effect of running the trials.
    trial_seed = choice(range(1, 10000, 1))

    # Create a random forest
    if (model_type == 1):
        model = RandomForestRegressor(n_estimators=2, max_depth=2, random_state=trial_seed)

    # Create a GBM
    elif (model_type == 2):
        model = GradientBoostingRegressor(n_estimators=128, max_depth=7, random_state=trial_seed)

    # Create a DNN
    else:
        model = keras.Sequential([
            keras.layers.Dense(128, activation='relu', input_shape=(len(X.columns),)),
            keras.layers.Dense(256, activation='relu'),
            keras.layers.Dense(512, activation='relu'),
            keras.layers.Dense(1, activation='linear')
        ])

        # Compile the model and specify the optimizer and loss function
        model.compile(optimizer='adam', loss='mean_absolute_error')

    # Train the model
    if (model_type != 3):
        model.fit(X_train, y_train)

    else:
        model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

    # Evaluate the performance of the ML model. The predictions are flattened
    # to 1-D: the DNN's single output unit yields an (n, 1) array, which would
    # otherwise broadcast against the 1-D linear-model predictions into an
    # (n, n) matrix when the model distance is computed below.
    ml_preds = np.ravel(model.predict(X_test))
    ml_error = mean_absolute_error(y_test, ml_preds)
    ml_errors.append(ml_error)

    print(f"MAE of ML model {i}: {ml_error}")

    # Get SHAP values
    if (model_type != 3):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test, check_additivity=False)

    else:
        # Only the first 50 test rows are used as background and as the rows
        # to explain.
        # NOTE(review): the trailing [0] assumes shap returns a list with one
        # array per model output - confirm for the installed shap version.
        explainer = shap.KernelExplainer(model, X_test[0:50])
        shap_values = explainer.shap_values(X_test[0:50], check_additivity=False)[0]

    # Select the top N factors, ranked by the absolute value of their mean
    # (signed) SHAP value. The ranking is computed once, with a stable sort so
    # ties are broken by column order, and reused both for the stored trial
    # and for the linear model. Ranking twice with different tie-breaking
    # would let the recorded factors differ from the ones actually used
    # whenever several factors share the same score (e.g. a SHAP mean of 0).
    # NOTE(review): positive and negative SHAP values cancel in a signed mean;
    # confirm abs(mean) rather than mean(abs) is the intended importance.
    mean_shap = np.asarray(shap_values).mean(axis=0)
    top_idx = np.argsort(-np.abs(mean_shap), kind="stable")[:num_features]
    feature_names = X.columns[top_idx]
    features = dict(zip(feature_names, mean_shap[top_idx]))

    # Store the selected features as one trial
    models.append(list(feature_names))

    # Subsample the dataset with the selected features
    X_train_selected = X_train[feature_names]
    X_test_selected = X_test[feature_names]

    # Train a linear regression with the selected features
    lr_model = LinearRegression()
    lr_model.fit(X_train_selected, y_train)

    # Evaluate the performance of the linear regression model
    mfm_preds = lr_model.predict(X_test_selected)
    mfm_error = mean_absolute_error(y_test, mfm_preds)
    mfm_errors.append(mfm_error)

    print(f"MAE of Multi-Factor model {i}: {mfm_error}")

    # Calculate the model distance (MAE between the two models' predictions)
    distance = np.mean(np.abs(mfm_preds - ml_preds))
    distances.append(distance)

    print(f"Model Distance: {distance}")

In [None]:
# Print mean results from the trials: the average test MAE of the ML models,
# the average test MAE of the linear multi-factor models, and the average
# distance between the two models' predictions
print(f"Mean ML MAE {np.mean(ml_errors)}")
print(f"Mean MFM MAE {np.mean(mfm_errors)}")
print(f"Mean distance {np.mean(distances)}")

In [None]:
# Pool the factors selected in every trial into one flat list
packed = [factor for trial in models for factor in trial]

# Tally how many times each factor was selected across the trials
frequency_dict = {}
for item in packed:
    frequency_dict[item] = frequency_dict.get(item, 0) + 1

# Order the tally from most to least frequently selected (ties keep the order
# in which the factors were first seen) and display it
frequency_dict = dict(sorted(frequency_dict.items(), key=lambda pair: -pair[1]))
print(frequency_dict)

In [None]:
# Store labels for all factors in the dataset, grouped by kind. These lists
# are used to classify (and colour) the factors selected by the models.
# Every label is written as a single-line string literal: a literal broken
# across physical lines is a syntax error and could never match a column name.

# Fundamental factors
fund_labels = [
    'EBITDA', 'EBITDA Ratio', 'EPS', 'Gross Profit Ratio', 'Cost of Goods Sold',
    'Gross Profit', 'Cost and Expenses', 'Interest Expense', 'Property Plant and Equipment_y',
    'Cost and Expenses_x', 'Cost of Goods Sold_x', 'Depreciation and Amortization_x',
    'EBITDA_x', 'EBITDA Ratio_x', 'EPS_x', 'EPS Diluted_x',
    'General and Administrative Expenses_x', 'Gross Profit_x', 'Gross Profit Ratio_x',
    'Income Before Tax_x', 'Income Before Tax Ratio_x', 'Income Tax Expense_x',
    'Interest Expense_x', 'Interest Income_x', 'Net Income_x', 'Net Income Ratio_x',
    'Operating Expenses_x', 'Operating Income_x', 'Operating Income Ratio_x',
    'Other Expenses_x', 'Research and Development Expenses_x', 'Revenue_x',
    'Selling and Marketing Expenses_x', 'Selling, General and Administrative Expenses_x',
    'Total Other Income_x', 'Weighted Average Shares_x', 'Weighted Average Shares Diluted_x',
    'Accounts Payable', 'Accounts Receivable', 'Accumulated Other Comprehensive Income',
    'Capital Lease Obligations', 'Cash and Cash Equivalents', 'Cash and Short Term Investments',
    'Common Stock', 'Deferred Revenue', 'Deferred Revenue Non Current',
    'Deferred Tax Liabilities', 'Fixed Assets', 'Goodwill', 'Intangible Assets',
    'Inventory_x', 'Long Term Debt', 'Long Term Investments', 'Minority Interest',
    'Net Debt', 'Other Assets', 'Other Current Assets', 'Other Current Liabilities',
    'Other Fixed Assets', 'Other Liabilities', 'Other Non Current Liabilities',
    'Other Total Shareholder Equity', 'Preferred Stock', 'Property, Plant and Equipment_x',
    'Retained Earnings', 'Short Term Debt', 'Short Term Investments', 'Tax Assets',
    'Tax Payables', 'Total Assets', 'Total Current Assets', 'Total Current Liabilities',
    'Total Debt', 'Total Equity', 'Total Investments', 'Total Liabilities',
    'Total Liabilities and Equity', 'Total Liabilities and Shareholder Equity',
    'Total Non Current Liabilities', 'Total Shareholder Equity',
    'Accounts Payables', 'Accounts Receivables', 'Acquisitions', 'Capital Expenditure',
    'Cash Beginning of Period', 'Cash End of Period', 'Cash Flow from Financing',
    'Cash Flow from Investing', 'Cash Flow from Operations', 'Change in Working Capital',
    'Common Stock Issued', 'Common Stock Purchased', 'Debt Repayment', 'Deferred Income Tax',
    'Depreciation and Amortization_y', 'Dividends Paid', 'Forex Changes on Cash',
    'Free Cash Flow', 'Inventory_y', 'Net Change in Cash', 'Net Income_y',
    'Operating Cash Flow', 'Other Financing Activities', 'Other Investing Activities',
    'Other Non Cash Items', 'Other Working Capital', 'Property, Plant and Equipment_y',
    'Purchases of Investments', 'Sales of Investments', 'Stock Based Compensation',
    'Cost and Expenses_y', 'Cost of Goods Sold_y', 'Depreciation and Amortization',
    'EBITDA_y', 'EBITDA Ratio_y', 'EPS_y', 'EPS Diluted_y',
    'General and Administrative Expenses_y', 'Gross Profit_y', 'Gross Profit Ratio_y',
    'Income Before Tax_y', 'Income Before Tax Ratio_y', 'Income Tax Expense_y',
    'Interest Expense_y', 'Interest Income_y', 'Net Income', 'Net Income Ratio_y',
    'Operating Expenses_y', 'Operating Income_y', 'Operating Income Ratio_y',
    'Other Expenses_y', 'Research and Development Expenses_y', 'Revenue_y',
    'Selling and Marketing Expenses_y', 'Selling, General and Administrative Expenses_y',
    'Total Other Income_y', 'Weighted Average Shares_y', 'Weighted Average Shares Diluted_y',
    'Accounts Payable Turnover Ratio', 'Asset Turnover Ratio', 'Book Value per Share',
    'CAPEX Coverage Ratio', 'CAPEX per Share', 'Cash Conversion Cycle (CCC)',
    'Cash Flow Coverage Ratio', 'Cash Ratio', 'Current Ratio',
    'Days of Accounts Payable Outstanding (DPO)', 'Days of Inventory Outstanding (DIO)',
    'Days of Sales Outstanding (DSO)', 'Debt Service Coverage Ratio', 'Debt-to-Assets Ratio',
    'Debt-to-Equity Ratio', 'Dividend CAPEX Coverage Ratio', 'Dividend Yield',
    'EBIT to Revenue', 'EBT to EBIT Ratio', 'EV-to-EBIT', 'EV-to-EBITDA',
    'EV-to-Operating-Cash-Flow', 'EV-to-Sales', 'Earnings Yield', 'Earnings per Share (EPS)',
    'Earnings per Share Growth', 'Effective Tax Rate', 'Enterprise Value',
    'Equity Multiplier', 'Fixed Asset Turnover', 'Free Cash Flow Yield',
    'Free Cash Flow to Operating Cash Flow Ratio', 'Gross Margin',
    'Income Before Tax Profit Margin', 'Income Quality Ratio', 'Interest Coverage Ratio',
    'Interest Debt per Share', 'Inventory Turnover Ratio', 'Market Cap',
    'Net Current Asset Value', 'Net Income per EBT', 'Net Profit Margin',
    'Net-Debt to EBITDA Ratio', 'Operating Cash Flow Ratio',
    'Operating Cash Flow to Sales Ratio', 'Operating Cycle (CC)', 'Operating Margin',
    'Operating Ratio', 'Payout Ratio', 'Price-to-Book (PB)', 'Price-to-Cash-Flow (P/CF)',
    'Price-to-Earnings (PE)', 'Price-to-Earnings-Growth (PEG)',
    'Price-to-Free-Cash-Flow (P/FCF)', 'Quick Ratio', 'Receivables Turnover',
    'Return on Assets (ROA)', 'Return on Capital Employed (ROCE)', 'Return on Equity (ROE)',
    'Return on Invested Capital (ROIC)', 'Return on Tangible Assets',
    'Revenue per Share (RPS)', 'SGA-to-Revenue Ratio', 'Short Term Coverage Ratio',
    'Tangible Asset Value', 'Weighted Dividend Yield', 'Working Capital',
]

# Growth rates of the fundamental factors
fund_growth_labels = [
    'Interest Income (Growth)', 'Other Expenses (Growth)',
    'Research and Development Expenses (Growth)', 'Interest Expense (Growth)',
    'Operating Income (Growth)',
    'Cost and Expenses_x (Growth)', 'Cost of Goods Sold_x (Growth)',
    'Depreciation and Amortization_x (Growth)', 'EBITDA_x (Growth)',
    'EBITDA Ratio_x (Growth)', 'EPS_x (Growth)', 'EPS Diluted_x (Growth)',
    'General and Administrative Expenses_x (Growth)', 'Gross Profit_x (Growth)',
    'Gross Profit Ratio_x (Growth)', 'Income Before Tax_x (Growth)',
    'Income Before Tax Ratio_x (Growth)', 'Income Tax Expense_x (Growth)',
    'Interest Expense_x (Growth)', 'Interest Income_x (Growth)', 'Net Income_x (Growth)',
    'Net Income Ratio_x (Growth)', 'Operating Expenses_x (Growth)',
    'Operating Income_x (Growth)', 'Operating Income Ratio_x (Growth)',
    'Other Expenses_x (Growth)', 'Research and Development Expenses_x (Growth)',
    'Revenue_x (Growth)', 'Selling and Marketing Expenses_x (Growth)',
    'Selling, General and Administrative Expenses_x (Growth)',
    'Total Other Income_x (Growth)', 'Weighted Average Shares_x (Growth)',
    'Weighted Average Shares Diluted_x (Growth)',
    'Accounts Payable (Growth)', 'Accounts Receivable (Growth)',
    'Accumulated Other Comprehensive Income (Growth)', 'Capital Lease Obligations (Growth)',
    'Cash and Cash Equivalents (Growth)', 'Cash and Short Term Investments (Growth)',
    'Common Stock (Growth)', 'Deferred Revenue (Growth)',
    'Deferred Revenue Non Current (Growth)', 'Deferred Tax Liabilities (Growth)',
    'Fixed Assets (Growth)', 'Goodwill (Growth)', 'Intangible Assets (Growth)',
    'Inventory_x (Growth)', 'Long Term Debt (Growth)', 'Long Term Investments (Growth)',
    'Minority Interest (Growth)', 'Net Debt (Growth)', 'Other Assets (Growth)',
    'Other Current Assets (Growth)', 'Other Current Liabilities (Growth)',
    'Other Fixed Assets (Growth)', 'Other Liabilities (Growth)',
    'Other Non Current Liabilities (Growth)', 'Other Total Shareholder Equity (Growth)',
    'Preferred Stock (Growth)', 'Property, Plant and Equipment_x (Growth)',
    'Retained Earnings (Growth)', 'Short Term Debt (Growth)',
    'Short Term Investments (Growth)', 'Tax Assets (Growth)', 'Tax Payables (Growth)',
    'Total Assets (Growth)', 'Total Current Assets (Growth)',
    'Total Current Liabilities (Growth)', 'Total Debt (Growth)', 'Total Equity (Growth)',
    'Total Investments (Growth)', 'Total Liabilities (Growth)',
    'Total Liabilities and Equity (Growth)',
    'Total Liabilities and Shareholder Equity (Growth)',
    'Total Non Current Liabilities (Growth)', 'Total Shareholder Equity (Growth)',
    'Accounts Payables (Growth)', 'Accounts Receivables (Growth)', 'Acquisitions (Growth)',
    'Capital Expenditure (Growth)', 'Cash Beginning of Period (Growth)',
    'Cash End of Period (Growth)', 'Cash Flow from Financing (Growth)',
    'Cash Flow from Investing (Growth)', 'Cash Flow from Operations (Growth)',
    'Change in Working Capital (Growth)', 'Common Stock Issued (Growth)',
    'Common Stock Purchased (Growth)', 'Debt Repayment (Growth)',
    'Deferred Income Tax (Growth)', 'Depreciation and Amortization_y (Growth)',
    'Dividends Paid (Growth)', 'Forex Changes on Cash (Growth)', 'Free Cash Flow (Growth)',
    'Inventory_y (Growth)', 'Net Change in Cash (Growth)', 'Net Income_y (Growth)',
    'Operating Cash Flow (Growth)', 'Other Financing Activities (Growth)',
    'Other Investing Activities (Growth)', 'Other Non Cash Items (Growth)',
    'Other Working Capital (Growth)', 'Property, Plant and Equipment_y (Growth)',
    'Purchases of Investments (Growth)', 'Sales of Investments (Growth)',
    'Stock Based Compensation (Growth)',
    'Cost and Expenses_y (Growth)', 'Cost of Goods Sold_y (Growth)',
    'Depreciation and Amortization (Growth)', 'EBITDA_y (Growth)',
    'EBITDA Ratio_y (Growth)', 'EPS_y (Growth)', 'EPS Diluted_y (Growth)',
    'General and Administrative Expenses_y (Growth)', 'Gross Profit_y (Growth)',
    'Gross Profit Ratio_y (Growth)', 'Income Before Tax_y (Growth)',
    'Income Before Tax Ratio_y (Growth)', 'Income Tax Expense_y (Growth)',
    'Interest Expense_y (Growth)', 'Interest Income_y (Growth)', 'Net Income (Growth)',
    'Net Income Ratio_y (Growth)', 'Operating Expenses_y (Growth)',
    'Operating Income_y (Growth)', 'Operating Income Ratio_y (Growth)',
    'Other Expenses_y (Growth)', 'Research and Development Expenses_y (Growth)',
    'Revenue_y (Growth)', 'Selling and Marketing Expenses_y (Growth)',
    'Selling, General and Administrative Expenses_y (Growth)',
    'Total Other Income_y (Growth)', 'Weighted Average Shares_y (Growth)',
    'Weighted Average Shares Diluted_y (Growth)',
    'Accounts Payable Turnover Ratio (Growth)', 'Asset Turnover Ratio (Growth)',
    'Book Value per Share (Growth)', 'CAPEX Coverage Ratio (Growth)',
    'CAPEX per Share (Growth)', 'Cash Conversion Cycle (CCC) (Growth)',
    'Cash Flow Coverage Ratio (Growth)', 'Cash Ratio (Growth)', 'Current Ratio (Growth)',
    'Days of Accounts Payable Outstanding (DPO) (Growth)',
    'Days of Inventory Outstanding (DIO) (Growth)',
    'Days of Sales Outstanding (DSO) (Growth)', 'Debt Service Coverage Ratio (Growth)',
    'Debt-to-Assets Ratio (Growth)', 'Debt-to-Equity Ratio (Growth)',
    'Dividend CAPEX Coverage Ratio (Growth)', 'Dividend Yield (Growth)',
    'EBIT to Revenue (Growth)', 'EBT to EBIT Ratio (Growth)', 'EV-to-EBIT (Growth)',
    'EV-to-EBITDA (Growth)', 'EV-to-Operating-Cash-Flow (Growth)', 'EV-to-Sales (Growth)',
    'Earnings Yield (Growth)', 'Earnings per Share (EPS) (Growth)',
    'Earnings per Share Growth (Growth)', 'Effective Tax Rate (Growth)',
    'Enterprise Value (Growth)', 'Equity Multiplier (Growth)',
    'Fixed Asset Turnover (Growth)', 'Free Cash Flow Yield (Growth)',
    'Free Cash Flow to Operating Cash Flow Ratio (Growth)', 'Gross Margin (Growth)',
    'Income Before Tax Profit Margin (Growth)', 'Income Quality Ratio (Growth)',
    'Interest Coverage Ratio (Growth)', 'Interest Debt per Share (Growth)',
    'Inventory Turnover Ratio (Growth)', 'Market Cap (Growth)',
    'Net Current Asset Value (Growth)', 'Net Income per EBT (Growth)',
    'Net Profit Margin (Growth)', 'Net-Debt to EBITDA Ratio (Growth)',
    'Operating Cash Flow Ratio (Growth)', 'Operating Cash Flow to Sales Ratio (Growth)',
    'Operating Cycle (CC) (Growth)', 'Operating Margin (Growth)',
    'Operating Ratio (Growth)', 'Payout Ratio (Growth)', 'Price-to-Book (PB) (Growth)',
    'Price-to-Cash-Flow (P/CF) (Growth)', 'Price-to-Earnings (PE) (Growth)',
    'Price-to-Earnings-Growth (PEG) (Growth)', 'Price-to-Free-Cash-Flow (P/FCF) (Growth)',
    'Quick Ratio (Growth)', 'Receivables Turnover (Growth)',
    'Return on Assets (ROA) (Growth)', 'Return on Capital Employed (ROCE) (Growth)',
    'Return on Equity (ROE) (Growth)', 'Return on Invested Capital (ROIC) (Growth)',
    'Return on Tangible Assets (Growth)', 'Revenue per Share (RPS) (Growth)',
    'SGA-to-Revenue Ratio (Growth)', 'Short Term Coverage Ratio (Growth)',
    'Tangible Asset Value (Growth)', 'Weighted Dividend Yield (Growth)',
    'Working Capital (Growth)',
]

# Technical (price/volume based) factors
tech_labels = [
    'Close', 'McClellan Oscillator', 'Advancers - Decliners', 'On-Balance Volume',
    'Accumulation/Distribution Line', 'Chaikin Oscillator', 'Money Flow Index',
    'Williams %R', 'Aroon Indicator Up', 'Aroon Indicator Down', 'Commodity Channel Index',
    'Relative Vigor Index', 'Force Index', 'Ultimate Oscillator',
    'Percentage Price Oscillator', 'Detrended Price Oscillator',
    'Average Directional Index', 'Chande Momentum Oscillator', 'Ichimoku Conversion Line',
    'Ichimoku Base Line', 'Ichimoku Leading Span A', 'Ichimoku Leading Span B',
    'Stochastic %K', 'Stochastic %D', 'MACD Line', 'MACD Signal Line',
    'Relative Strength Index', 'Balance of Power', 'Simple Moving Average (SMA)',
    'Exponential Moving Average (EMA)', 'Double Exponential Moving Average (DEMA)', 'TRIX',
    'Triangular Moving Average', 'Bollinger Band Upper', 'Bollinger Band Middle',
    'Bollinger Band Lower', 'True Range', 'Average True Range', 'Keltner Channel Upper',
    'Keltner Channel Middle', 'Keltner Channel Lower',
]

# Plot a column chart of factor selection frequencies
def visualiseFeatures(fd):
    """Draw a bar chart of how often each factor was selected.

    Parameters:
        fd: dictionary mapping a factor name to its selection count; bars are
            drawn in the dictionary's iteration order.

    Each bar is coloured by the kind of factor: red for names found in
    `fund_labels`, yellow for names found in `fund_growth_labels`, and blue
    for every other name. The figure is shown directly; nothing is returned.
    """

    def _bar_colour(factor):
        # The fundamental list is checked first, then the growth-rate list;
        # anything else falls through to the third colour
        if factor in fund_labels:
            return '#E62727'
        if factor in fund_growth_labels:
            return '#FFD500'
        return '#008CFF'

    # Unpack the features and associated colors
    categories = list(fd.keys())
    heights = list(fd.values())
    colors = [_bar_colour(name) for name in categories]

    # Create the bar graph with vertical tick labels so long names stay legible
    plt.figure(figsize=(10, 6))
    sns.barplot(x=categories, y=heights, palette=colors)
    plt.xticks(rotation=90)

    # Label the axes, add the title and render the figure
    plt.xlabel('Factors')
    plt.ylabel('Frequency')
    plt.title('Selected Factors for the multi-factor model')
    plt.show()

visualiseFeatures(frequency_dict)

In [None]:
# Calculate the percentage frequency of each kind of factor in our multi-factor models
def analyseModelCompostion(fd):
    """Print the percentage of selections that fall into each factor category.

    Parameters:
        fd: dictionary mapping a factor name to the number of times it was
            selected across all trials.

    Every name is looked up, in order, in `fund_labels`, `fund_growth_labels`
    and `tech_labels`. A name found in none of them is reported with a
    "Key not found" message; its count still contributes to the total, so the
    three percentages can sum to less than 100 in that case.

    Nothing is returned. When `fd` is empty (or all counts are zero) a short
    message is printed instead of the percentages, since there is no total to
    divide by.
    """

    # Total # of factors
    count = 0

    # Fundamental factors count
    fund = 0

    # Fundamental growth rates count
    fund_gro = 0

    # Technical factors count
    tech = 0

    # Create a tally of each kind of factor across all trials
    for key, val in fd.items():

        if (key in fund_labels):
            fund += val

        elif (key in fund_growth_labels):
            fund_gro += val

        elif (key in tech_labels):
            tech += val

        else:
            print(f"Key not found: {key}")

        count += val

    # Guard against dividing by zero when no factor was recorded
    if count == 0:
        print("No factors to analyse")
        return

    # The labels announce percentages, so scale the fractions by 100
    print(f"% of fundamental factors: {100 * fund / count:.1f}")
    print(f"% of fundamental growth rates: {100 * fund_gro / count:.1f}")
    print(f"% of technical factors: {100 * tech / count:.1f}")

# Report how the selections recorded in frequency_dict are distributed over
# fundamental, fundamental-growth and technical factors
analyseModelCompostion(frequency_dict)