<a href="https://colab.research.google.com/github/QuinnG17/Financial-Programs/blob/main/MODEL_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Long-side return metrics: Spark table -> pandas, with 'Date' parsed to datetime
long_df = spark.read.table("alpha_two.all_date_long_metrics_v2")
long_data = long_df.toPandas().assign(Date=lambda frame: pd.to_datetime(frame['Date']))

# Short-side return metrics, loaded the same way
short_df = spark.read.table("alpha_two.all_date_short_metrics_v2")
short_data = short_df.toPandas().assign(Date=lambda frame: pd.to_datetime(frame['Date']))

# Signal table (joined to the metrics by 'Date' below)
table_df = spark.read.table("kash.signal_shifted")
signals = table_df.toPandas().assign(Date=lambda frame: pd.to_datetime(frame['Date']))

# Stack both sides and attach the signal rows that share each row's date
all_data = pd.concat([long_data, short_data])
data = all_data.merge(signals, on='Date', how='inner')

# Signal code 2 is re-coded as -1
data['Signal'] = data['Signal'].replace({2: -1})

# Chronological order is required so that shift(-1) below means "next date"
data = data.sort_values('Date')

# Target = the symbol's next Net_Profit minus its current Net_Profit
data['NextValue'] = data.groupby('Symbol')['Net_Profit'].shift(-1)
data['Target'] = data['NextValue'].sub(data['Net_Profit'])

print(long_data.columns)
print(data.columns)

# Bring the computed columns back onto the long and short frames separately.
# NOTE(review): `data` holds rows from both tables, so a (Date, Symbol) pair
# present in both would match twice here - confirm the two tables are disjoint.
target_cols = ['Symbol', 'Date', 'NextValue', 'Signal', 'Target']
long_data = long_data.merge(data[target_cols], on=['Date', 'Symbol'], how='inner').sort_values('Date')
short_data = short_data.merge(data[target_cols], on=['Date', 'Symbol'], how='inner').sort_values('Date')

print(long_data)











In [None]:
from datetime import timedelta

# Sanity check: print the calendar date 60 days before each of the first five rows
lookback = timedelta(days=60)
for date in data['Date'].head():
    print(date.date() - lookback)

In [None]:
#Execute to analyze model performance per symbol
import numpy as np
import itertools
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
from scipy.stats import norm
import pandas as pd
from datetime import timedelta
from datetime import datetime, date
from pyspark.sql import SparkSession
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance


def measure_best_features(data, models, weights_df):
    """
    Fit every model on each symbol's rows and record feature weights and scores.

    For each distinct ``Symbol`` in ``data`` the rows containing NaN are
    dropped, the remainder is split 80/20 into train/validation sets, and
    every estimator in ``models`` is fitted. One result row is produced per
    (symbol, model) holding the per-feature weights plus ``Symbol``,
    ``Model``, ``RMSE`` (validation RMSE divided by 100), ``Accuracy``
    (share of validation rows where prediction and actual fall on the same
    side of 0) and ``Direction``.

    Args:
        data: DataFrame with at least ``Date``, ``Symbol``, ``Signal``,
            ``Target`` and ``NextValue`` columns. Every column other than
            ``Date``, ``Symbol``, ``Target`` and ``NextValue`` is used as a
            feature (``Signal`` included).
        models: Iterable of scikit-learn style regressors. Each one is
            refitted (mutated) for every symbol.
        weights_df: Results accumulated so far. It is not modified.

    Returns:
        A DataFrame containing the rows of ``weights_df`` followed by the new
        result rows, or ``weights_df`` itself when nothing was fitted.
    """
    # Direction is derived from the signal on the last row only. An empty
    # frame has no last row, so guard it instead of raising IndexError.
    last_signal = data['Signal'].iloc[-1] if len(data) else None
    if last_signal == 1:
        direction = 'Long'
        print(direction)
    elif last_signal == -1:
        direction = 'Short'
    else:
        direction = None

    unique_symbols = data['Symbol'].unique()
    total = len(unique_symbols)

    # Rows are collected here and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    new_rows = []

    for position, symbol in enumerate(unique_symbols, start=1):
        symbol_data = data[data['Symbol'] == symbol].dropna()

        # Skip symbols with fewer than 100 complete rows
        if len(symbol_data) < 100:
            continue

        # Separate features and target
        X = symbol_data.drop(['Date', 'Symbol', 'Target', 'NextValue'], axis=1)
        y = symbol_data['Target']

        # NOTE(review): train_test_split shuffles rows, so validation rows can
        # pre-date training rows - confirm this is acceptable for dated data.
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        print(f"running models @ {symbol}")
        for model in models:
            print(f"running {model}")
            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)

            rmse = np.sqrt(mean_squared_error(y_val, y_pred))
            accuracy = accuracy_above_threshold(y_val, y_pred, threshold=0)

            if isinstance(model, SVR):
                weights = permutation_importance(model, X, y).importances_mean
            elif isinstance(model, LinearRegression):
                weights = model.coef_
            elif hasattr(model, 'feature_importances_'):
                weights = model.feature_importances_
            else:
                # Estimators without built-in importances get the same
                # permutation-based weights as SVR instead of raising
                # AttributeError.
                weights = permutation_importance(model, X, y).importances_mean

            row = pd.DataFrame([weights], columns=X.columns.tolist())
            row['Symbol'] = symbol
            row['Model'] = model.__class__.__name__
            row['RMSE'] = rmse / 100
            row['Accuracy'] = accuracy
            row['Direction'] = direction
            new_rows.append(row)

        # Fraction of all symbols visited so far (skipped symbols count too)
        print(position / total)

    if not new_rows:
        return weights_df
    return pd.concat([weights_df, *new_rows])

def accuracy_above_threshold(y_true, y_pred, threshold=0):
    """
    Score how often predictions and actuals fall on the same side of ``threshold``.

    Both inputs are turned into booleans with ``> threshold`` and the two
    boolean vectors are compared with ``accuracy_score``.
    """
    return accuracy_score(y_true > threshold, y_pred > threshold)


# Regressors evaluated for every symbol (SVR is currently disabled)
models = [
    RandomForestRegressor(),
    # SVR(),
    LinearRegression(),
    DecisionTreeRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor(),
]

# Accumulate the long-side results first, then the short-side results
weights_df = pd.DataFrame()
for side_data in (long_data, short_data):
    weights_df = measure_best_features(side_data, models, weights_df)

# Get (or create) the Spark session used to publish the results
spark = SparkSession.builder.appName('pandas-to-databricks').getOrCreate()

# Convert the pandas results to a Spark DataFrame and expose them to SQL cells
db_df = spark.createDataFrame(weights_df)
db_df.createOrReplaceTempView('model_weights')
db_df.show()



In [None]:
#Execute to analyze model performance per date

import numpy as np
import itertools
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
from scipy.stats import norm
import pandas as pd
from datetime import timedelta
from datetime import datetime, date
from pyspark.sql import SparkSession
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance


def measure_best_features(data, models, weights_df):
    """
    Fit every model on each date's rows and record feature weights and scores.

    For each distinct ``Date`` in ``data`` the rows containing NaN are
    dropped, the remainder is split 80/20 into train/validation sets, and
    every estimator in ``models`` is fitted. One result row is produced per
    (date, model) holding the per-feature weights plus ``Date``, ``Model``,
    ``RMSE`` (validation RMSE divided by 100), ``Accuracy`` (share of
    validation rows where prediction and actual fall on the same side of 0)
    and ``Direction``.

    Args:
        data: DataFrame with at least ``Date``, ``Symbol``, ``Signal``,
            ``Target`` and ``NextValue`` columns. Every column other than
            ``Date``, ``Symbol``, ``Target`` and ``NextValue`` is used as a
            feature (``Signal`` included).
        models: Iterable of scikit-learn style regressors. Each one is
            refitted (mutated) for every date.
        weights_df: Results accumulated so far. It is not modified.

    Returns:
        A DataFrame containing the rows of ``weights_df`` followed by the new
        result rows, or ``weights_df`` itself when nothing was fitted.
    """
    # Direction is derived from the signal on the last row only. An empty
    # frame has no last row, so guard it instead of raising IndexError.
    last_signal = data['Signal'].iloc[-1] if len(data) else None
    if last_signal == 1:
        direction = 'Long'
        print(direction)
    elif last_signal == -1:
        direction = 'Short'
    else:
        direction = None

    unique_date = data['Date'].unique()
    total = len(unique_date)

    # Rows are collected here and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    new_rows = []

    # The loop variable is not called `date` so it cannot shadow datetime.date
    for position, current_date in enumerate(unique_date, start=1):
        date_data = data[data['Date'] == current_date].dropna()

        # Skip dates with fewer than 100 complete rows
        if len(date_data) < 100:
            continue

        # Separate features and target
        X = date_data.drop(['Date', 'Symbol', 'Target', 'NextValue'], axis=1)
        y = date_data['Target']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        print(f"running models @ {current_date}")
        for model in models:
            print(f"running {model}")
            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)

            rmse = np.sqrt(mean_squared_error(y_val, y_pred))
            accuracy = accuracy_above_threshold(y_val, y_pred, threshold=0)

            if isinstance(model, SVR):
                weights = permutation_importance(model, X, y).importances_mean
            elif isinstance(model, LinearRegression):
                weights = model.coef_
            elif hasattr(model, 'feature_importances_'):
                weights = model.feature_importances_
            else:
                # Estimators without built-in importances get the same
                # permutation-based weights as SVR instead of raising
                # AttributeError.
                weights = permutation_importance(model, X, y).importances_mean

            row = pd.DataFrame([weights], columns=X.columns.tolist())
            row['Date'] = current_date
            row['Model'] = model.__class__.__name__
            row['RMSE'] = rmse / 100
            row['Accuracy'] = accuracy
            row['Direction'] = direction
            new_rows.append(row)

        # Fraction of all dates visited so far (skipped dates count too)
        print(position / total)

    if not new_rows:
        return weights_df
    return pd.concat([weights_df, *new_rows])

def accuracy_above_threshold(y_true, y_pred, threshold=0):
    """
    Score how often predictions and actuals fall on the same side of ``threshold``.

    Both inputs are turned into booleans with ``> threshold`` and the two
    boolean vectors are compared with ``accuracy_score``.
    """
    actual_is_above = y_true > threshold
    predicted_is_above = y_pred > threshold
    return accuracy_score(actual_is_above, predicted_is_above)


# Regressors evaluated for every date (SVR is currently disabled)
models = [
    RandomForestRegressor(),
    # SVR(),
    LinearRegression(),
    DecisionTreeRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor(),
]

# Accumulate the long-side results first, then the short-side results
weights_df = pd.DataFrame()
for side_data in (long_data, short_data):
    weights_df = measure_best_features(side_data, models, weights_df)

# Get (or create) the Spark session used to publish the results
spark = SparkSession.builder.appName('pandas-to-databricks').getOrCreate()

# Convert the pandas results to a Spark DataFrame and expose them to SQL cells
db_df = spark.createDataFrame(weights_df)
db_df.createOrReplaceTempView('model_weights')
db_df.show()



In [None]:
%sql CREATE OR REPLACE TABLE alpha_two.DAILY_MODEL_ANALYSIS AS SELECT * FROM model_weights

In [None]:
%sql SELECT * FROM alpha_two.DAILY_MODEL_ANALYSIS WHERE Direction = 'Long'

In [None]:
def _top_decile_mean_accuracy(group):
    """Return the mean of the Accuracy values at or above the group's 90th percentile."""
    cutoff = group['Accuracy'].quantile(0.9)
    return group.loc[group['Accuracy'] >= cutoff, 'Accuracy'].mean()


# Load the stored meta-analysis results into pandas
data = spark.read.table("alpha_two.Regression_Model_Meta_Analysis_Combined_Next_Day").toPandas()

# Leave out rows whose accuracy is exactly 1
data = data[data['Accuracy'] != 1]

# For each 'Model', average the accuracies that sit in that model's top decile
result = data.groupby('Model').apply(_top_decile_mean_accuracy)

# Display the top-decile average accuracy per model
print(result)

In [None]:
%sql SELECT * FROM alpha_two.DAILY_MODEL_ANALYSIS