In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import datetime

In [2]:
def drop_outliers(df, threshold=5):
    # Calculate Z-scores
    zscores = df.apply(zscore)
    # Find absolute Z-scores greater than the threshold
    outliers = (zscores.abs() > threshold).any(axis=1)
    df_clean = df[~outliers]
    return df_clean

In [3]:
spring = pd.read_excel('factor_spring_3m.xlsx')
summer = pd.read_excel('factor_summer_3m.xlsx')
autumn = pd.read_excel('factor_autumn_3m.xlsx')
winter = pd.read_excel('factor_winter_3m.xlsx')
df_all = pd.read_excel('factor_all_n1.xlsx')

In [4]:
X = drop_outliers(df_all[summer.columns[4:]].drop_duplicates())
y = df_all.loc[X.index,spring.columns[2]]
X_std = X.apply(zscore)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [6]:
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [179]:
cov_matrix = X_std.cov()

In [180]:
eigen_vals, eigen_vecs = np.linalg.eig(cov_matrix)
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]
eigen_pairs.sort(key=lambda k: k[0], reverse=True)
matrix_w = np.column_stack((eigen_pairs[0][1], eigen_pairs[1][1], eigen_pairs[2][1]))
X_pca = X_std.dot(matrix_w)

In [224]:
X_train, X_test, y_train, y_test = train_test_split(pca.transform(X_std), y, test_size=0.20)
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.11340800495057679

In [241]:
def find_loadings(dataset):
    X = drop_outliers(dataset[dataset.columns[4:]].drop_duplicates())
    X_std = X.apply(zscore)
    pca = PCA(n_components = 0.9)
    pca.fit(X_std)

    variables_names = X_std.columns.tolist()

    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    loadings_squared = loadings ** 2
    loadings_squared_sum = np.sum(loadings_squared, axis=1)

    # Create a DataFrame with factor names and their corresponding loadings
    factor_loadings = pd.DataFrame({
        'FactorName': variables_names,
        'LoadingsSquaredSum': loadings_squared_sum
    })
    factor_loadings = factor_loadings.sort_values(by='LoadingsSquaredSum', ascending=False).reset_index(drop=True)
    return factor_loadings


In [246]:
data = find_loadings(spring)
summer1 = find_loadings(summer)
autumn1 = find_loadings(autumn)
winter1 = find_loadings(winter)

In [248]:
data['cum'] = (summer1['LoadingsSquaredSum'] / sum(summer1['LoadingsSquaredSum'])) + (data['LoadingsSquaredSum'] / sum(data['LoadingsSquaredSum'])) + (autumn1['LoadingsSquaredSum'] / sum(autumn1['LoadingsSquaredSum'])) + (winter1['LoadingsSquaredSum'] / sum(winter1['LoadingsSquaredSum']))

In [249]:
data

Unnamed: 0,FactorName,LoadingsSquaredSum,cum
0,m3_timing_ratio,0.99989,0.182459
1,m3_dev_downside_rf_a,0.987047,0.180103
2,m3_dev_downside_avg_a,0.985689,0.179706
3,m3_beta,0.983585,0.17909
4,m3_stdev_a,0.974867,0.177897
5,m3_kurtosis,0.97078,0.176355
6,m3_sortino_a,0.969781,0.175619
7,m3_beta_upside,0.959998,0.175
8,m3_sharpe_a,0.957211,0.174388
9,m3_excess_mdd,0.942702,0.17297


In [263]:
reg1 = RandomForestRegressor(max_depth=20, max_leaf_nodes=40)
reg1.fit(X_train, y_train)
reg1.score(X_test, y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [13]:
str(datetime.datetime.now())[:10]

'2023-07-25'