In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.metrics import MeanSquaredError, Accuracy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb
import re
import pickle
import lightgbm as lgb
pd.set_option('display.max_columns', None)

In [2]:

def load_and_preprocess_data(filename, date_col, start_date, seperator, fill, lim):
    df_read = pd.read_csv(filename, sep=seperator)
    
    # Convert the date column to datetime and set it as the index
    df_read[date_col] = pd.to_datetime(df_read[date_col])
    df_read.set_index(date_col, inplace=True)
    
    # Data cleaning: replace commas with periods and convert to float
    for column in df_read.columns:
        if df_read[column].dtype == 'object':
            df_read[column] = df_read[column].str.replace(',', '.').astype(float)
    
    # Start dataset from start date
    df_filtered = df_read[start_date:]

    # Fill missing values
    df_filtered.fillna(fill, limit=lim)
    
    return df_filtered


In [3]:


def add_features(df, windows):

    feature_cols = [col for col in df.columns if 'macro' not in col.lower()]
    macro_cols = [col for col in df.columns if 'MACRO' in col.upper()]
    
    # Initialize a dictionary to hold all the new feature data
    features_dict = {}


    # Perform rolling calculations for each window size
    for w in windows:
        for col in feature_cols:
            # Create unique feature names for each statistic and window size
            features_dict[f'{col}_VaR_{w}']         = df[col].rolling(window=w, min_periods=int(w//2)).quantile(0.05)
            features_dict[f'{col}_momentum_{w}']    = df[col].rolling(window=w, min_periods=int(w//2)).sum()
            features_dict[f'{col}_avgreturn_{w}']   = df[col].rolling(window=w, min_periods=int(w//2)).mean()
            features_dict[f'{col}_skew_{w}']        = df[col].rolling(window=w, min_periods=int(w//2)).skew()
            features_dict[f'{col}_volatility_{w}']  = df[col].rolling(window=w, min_periods=int(w//2)).std()

    # Convert the dictionary of Series to a DataFrame
    features_df = pd.DataFrame(features_dict, index=df.index)

    # Concatenate 'MACRO' columns to the features DataFrame
    if macro_cols:
        macro_df = df[macro_cols]
        features_df = pd.concat([features_df, macro_df], axis=1)

    return features_df



In [4]:

def transform_and_pivot_df(df, date_col):
    # Reset the index to make the date a regular column
    df_reset = df.reset_index()
    
    # Melt the DataFrame to long format
    long_df = df_reset.melt(id_vars=date_col, var_name='metric', value_name='value')
    
    # Split the 'metric' column to extract components
    split_metrics = long_df['metric'].str.split('_', expand=True)
    
    # Identify 'MACRO' rows
    macro_mask = split_metrics[0] == 'MACRO'
    
    # For non-'MACRO' metrics, define 'asset' and 'metric_type'
    long_df['asset'] = split_metrics[0] + '_' + split_metrics[1]
    long_df['metric_type'] = split_metrics[2] + '_' + split_metrics[3]

    # Reset index on the left-hand side DataFrame slice to ensure alignment
    lhs = long_df.loc[macro_mask, 'metric_type'].reset_index(drop=True)

    # Reset index on the right-hand side Series to ensure alignment
    rhs = (split_metrics.loc[macro_mask, 0] + '_' + split_metrics.loc[macro_mask, 1]).reset_index(drop=True)

    # Assign the values after ensuring both sides have the same length
    lhs = rhs

    # Assign the modified Series back to the original DataFrame (if needed)
    long_df.loc[macro_mask, 'metric_type'] = lhs.values
    
    # For 'MACRO' metrics, adjust 'metric_type' and 'asset'
    long_df.loc[macro_mask, 'metric_type'] = split_metrics.loc[macro_mask, 0] + '_' + split_metrics.loc[macro_mask, 1]
    long_df.loc[macro_mask, 'asset'] = 'MACRO'
    
    # Remove 'MACRO' placeholder rows
    long_df = long_df[long_df['asset'] != 'MACRO']
    
    # Pivot the DataFrame back to wide format
    final_df = long_df.pivot_table(index=[date_col, 'asset'], columns='metric_type', values='value').reset_index()
    
    # Handle 'MACRO' metrics separately
    macro_df = df.filter(regex='^MACRO').copy()
    macro_df[date_col] = df_reset[date_col]
    
    # Merge 'MACRO' metrics back into the final DataFrame
    if date_col in macro_df.columns:
        macro_df = macro_df.drop(columns=date_col)
    
    macro_df = macro_df.reset_index()
    final_df = pd.merge(final_df, macro_df, on=date_col, how='left')
    
    return final_df


In [49]:
def add_y_col(df, date_col, target_days, return_col, volatility_col):
    # Shift the specified return and volatility columns by the target number of days
    df[f'{return_col}_shifted'] = df.groupby('asset')[return_col].shift(-target_days)
    df[f'{volatility_col}_shifted'] = df.groupby('asset')[volatility_col].shift(-target_days)
    print(df.tail(20))

    # Drop rows with NaN values that result from the shift operation
    df = df.dropna()
    # Calculate the Sharpe ratio by dividing the shifted return by the shifted volatility
    df['sharpe_ratio'] = df[f'{return_col}_shifted'] / df[f'{volatility_col}_shifted']
    
    # Calculate the mean Sharpe ratio for each date and merge it back into the DataFrame
    sharpe_ratio_mean = df.groupby(date_col)['sharpe_ratio'].mean().rename('sharpe_ratio_mean')
    df = df.merge(sharpe_ratio_mean, on=date_col)
    
    # Create a new binary column 'Y', indicating whether the Sharpe ratio is above the mean for its date
    df['Y'] = np.where(df['sharpe_ratio'] > df['sharpe_ratio_mean'], 1, 0)

    df = df.drop(columns=['sharpe_ratio', 'sharpe_ratio_mean', f'{return_col}_shifted', f'{volatility_col}_shifted', return_col, volatility_col])

    
    return df

In [40]:

def combined_process(df_read, df_all, date_col, target_days):
    # Step 1: 
    
    df = add_feature_for_sharpe(df_read, [target_days])
    # Step 2: Transformation and Pivoting (Assuming 'transform_and_pivot_df' function is defined)
    
    
    transformed_df = transform_and_pivot_df(df, date_col)

    # Step 3: Adding 'Y' Column
    # Select a representative return and volatility column generated from Step 1 to use in 'add_y_col' function
    # Assuming the return and volatility columns follow a naming pattern similar to the original columns
    return_col = f'avgreturn_{target_days}'  # Adjust 'some_feature' based on actual column names
    volatility_col = f'volatility_{target_days}'  # Adjust 'some_feature' based on actual column names
    
    df_combined = pd.concat([transformed_df, df_all])

    
    
    final_df = add_y_col(df_combined, date_col, target_days, return_col, volatility_col)

    return final_df



In [24]:

def add_feature_for_sharpe(df, windows):

    feature_cols = [col for col in df.columns if 'macro' not in col.lower()]
    
    # Initialize a dictionary to hold all the new feature data
    features_dict = {}


    # Perform rolling calculations for each window size
    for w in windows:
        for col in feature_cols:
            # Create unique feature names for each statistic and window size
            features_dict[f'{col}_avgreturn_{w}']   = df[col].rolling(window=w, min_periods=int(w//2)).mean()
            features_dict[f'{col}_volatility_{w}']  = df[col].rolling(window=w, min_periods=int(w//2)).std()

    # Convert the dictionary of Series to a DataFrame
    features_df = pd.DataFrame(features_dict, index=df.index)

    return features_df



In [8]:


def prepare_training_dataset(df, date_col, shuffle=False, train_split=0.25, eval_split=0.25):

    
    # Separate features and target variable
    X = df.drop(columns=['Y', 'asset'])

    y = df['Y']

    # Convert date column to datetime if not already done
    X[date_col] = pd.to_datetime(X[date_col])

    if shuffle:
        # Split the data randomly
        train_size = 1 - train_split
        X_temp, X_train, y_temp, y_train = train_test_split(X, y, train_size=train_size)
        X_eval, X_test, y_eval, y_test = train_test_split(X_temp, y_temp, test_size=(2/3))
    else:
        # Split the data sequentially
        train_end_idx = int(len(X) * train_split)
        eval_end_idx = train_end_idx + int(len(X) * eval_split)

        X_train = X.iloc[:train_end_idx]
        y_train = y.iloc[:train_end_idx]
        X_eval = X.iloc[train_end_idx:eval_end_idx]
        y_eval = y.iloc[train_end_idx:eval_end_idx]
        X_test = X.iloc[eval_end_idx:]
        y_test = y.iloc[eval_end_idx:]

    # Drop the date column
    X_train = X_train.drop(date_col, axis=1)
    X_eval = X_eval.drop(date_col, axis=1)
    X_test = X_test.drop(date_col, axis=1)

    # Combine training and evaluation sets for the final model training
    X_train_eval = pd.concat([X_train, X_eval])
    y_train_eval = pd.concat([y_train, y_eval])


    return X_train, X_eval, X_test, y_train, y_eval, y_test, X_train_eval, y_train_eval




In [9]:


def optimize_and_train_ridge(X_train, y_train, X_train_eval, y_train_eval, param_grid, scoring='accuracy', cv=5):

    model = RidgeClassifier()

    # Initialize GridSearchCV with the provided model and parameter grid
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, cv=cv)
    
    # Fit GridSearchCV on the training set
    grid_search.fit(X_train, y_train)
    
    # Print the best parameters and the accuracy on the evaluation set
    print("Best parameters:", grid_search.best_params_)
    print("Best accuracy on evaluation set:", grid_search.best_score_)
    
    # Retrain the model with the best parameters on the combined training and evaluation sets
    model_best = model.__class__(**grid_search.best_params_)
    model_best.fit(X_train_eval, y_train_eval)

    return model_best, grid_search





In [10]:


def evaluate_model_performance(y_true, y_pred):
 

    conf_matrix = confusion_matrix(y_true, y_pred)


    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)


    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Print the performance metrics
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    return conf_matrix, precision, recall, f1, mse, rmse



In [11]:


def optimize_and_train_xgb(X_train, y_train, X_eval, y_eval, param_grid, scoring='accuracy', cv=5, n_jobs=-1, early_stopping_rounds=10):

    # Initialize the XGBoost model
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Perform grid search
    grid_search = GridSearchCV(xgb_model, param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], early_stopping_rounds=early_stopping_rounds, verbose=False)

    # Extract best hyperparameters
    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # Retrain the model with the best parameters on the combined training and evaluation set
    xgb_best = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
    xgb_best.fit(pd.concat([X_train, X_eval]), pd.concat([y_train, y_eval]))

    return xgb_best, best_params

# Define param_grid


In [12]:


def optimize_and_train_lgb(X_train, y_train, X_eval, y_eval, param_grid, scoring='accuracy', cv=5, n_jobs=-1):

    # Initialize the LightGBM model
    lgb_model = lgb.LGBMClassifier()

    # Perform grid search
    grid_search = GridSearchCV(lgb_model, param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    # Extract best hyperparameters
    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # Retrain the model with the best parameters on the combined training and evaluation set
    lgb_best = lgb.LGBMClassifier(**best_params)
    lgb_best.fit(pd.concat([X_train, X_eval]), pd.concat([y_train, y_eval]), eval_set=[(X_eval, y_eval)])

    return lgb_best, best_params




In [13]:


def train_and_evaluate_NN(X_train_eval, y_train_eval, X_eval, y_eval, X_test, y_test, epochs=50, batch_size=32):

    # Initialize the scaler and scale the data
    scaler = StandardScaler()
    # undersök data leakage här
    X_train_eval_scaled = scaler.fit_transform(X_train_eval)
    X_eval_scaled = scaler.transform(X_eval)
    X_test_scaled = scaler.transform(X_test)

    # Define the model architecture
    #undersök relu
    model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_eval_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'), 
    Dense(8, activation='relu'),   
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

    # Train the model
    history = model.fit(
        X_train_eval_scaled, y_train_eval,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_eval_scaled, y_eval)
    )
    return model, history, X_test_scaled




In [27]:
filename = 'all_data_anonymized.csv'
dateCol = 'todate'
start_date = '1980-01-01'
seperator = ';'
fill = 0
lim = 5
df_read = load_and_preprocess_data(filename, dateCol, start_date, seperator, fill, lim)


  df_read = pd.read_csv(filename, sep=seperator)


In [28]:
df_read

Unnamed: 0_level_0,EQ_0,EQ_1,EQ_2,EQ_3,EQ_4,EQ_5,EQ_6,EQ_7,EQ_8,EQ_9,EQ_10,EQ_11,EQ_12,EQ_13,EQ_14,EQ_15,EQ_16,EQS_0,EQS_1,EQS_2,EQS_3,EQS_4,EQS_5,EQS_6,EQS_7,EQS_8,EQS_9,EQS_10,FI_0,FI_1,FI_2,FI_3,FI_4,FI_5,FI_6,FI_7,FI_8,FI_9,FI_10,FI_11,FI_12,FI_13,FI_14,FXD_0,FXD_1,FXD_2,FXD_3,FXD_4,FXD_5,FXD_6,FXD_7,FXD_8,FXD_9,FXE_0,FXE_1,FXE_2,FXE_3,FXE_4,MACRO_8,MACRO_0,MACRO_1,MACRO_2,MACRO_3,MACRO_4,MACRO_5,MACRO_6,MACRO_7
todate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1
1980-01-02,,-0.018988,,,,,,,-0.018316,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.046210,,0.010604,,-0.005429,-0.041021,-0.021920,-0.019289
1980-01-03,,-0.005294,,,,,,,-0.015509,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.004372,,,,,,,,,,,,,,,,,,,,0.044173,,0.001396,,0.002396,-0.017317,-0.034213,0.001675
1980-01-04,,0.012279,,,,,,,0.017255,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.004391,,,,,,,,,,,,,,,,,,,,0.033839,,-0.014684,,0.006773,0.028012,0.027159,-0.001672
1980-01-07,,0.002119,,,,,,,0.005529,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001100,,,,,,,,,,,,,,,,,,,,0.037644,,-0.021268,,0.009445,-0.026609,0.013745,0.000000
1980-01-08,,0.019638,,,,,,,0.015610,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.006572,,,,,,,,,,,,,,,,,,,,-0.039435,,-0.015227,,0.012703,0.005257,0.011143,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-18,0.010765,0.008349,0.008070,0.007903,7.950891e-03,0.014141,0.003351,0.005968,0.005801,0.005045,0.008415,0.000800,0.011241,-0.005181,0.010972,0.002485,-0.004948,0.001622,0.000071,0.003310,0.013581,-0.002851,-0.001227,-0.011574,0.019471,0.005234,-0.006590,0.011766,-0.003808,0.000094,-0.000869,0.003857,0.000512,-0.003335,-0.001711,-0.000510,-0.002088,0.000000,-0.001965,-0.007271,-0.003113,-0.010143,-0.004219,0.002514,0.000631,0.001080,0.001027,-0.001285,-0.000220,-0.001788,-0.000328,-0.005337,-0.004785,0.000494,0.002955,0.005254,-0.008201,-0.000499,-0.060382,0.007525,0.020994,0.009968,-0.007432,0.002248,0.030872,0.004185,0.005150
2024-01-19,-0.004251,0.012034,-0.000480,-0.004499,9.825071e-03,0.019017,0.007539,-0.004210,0.009561,-0.002519,-0.002303,0.003192,-0.001118,0.010471,0.011035,0.000335,0.000000,0.015971,-0.000212,0.000570,0.005627,0.003088,-0.004101,-0.001618,0.022354,0.008812,0.008952,0.012772,-0.001494,-0.000708,-0.001160,0.001114,0.003064,-0.000136,0.001322,-0.000766,0.000373,-0.001066,-0.000703,0.000781,-0.000271,0.002731,-0.002300,0.005463,0.000866,0.005114,-0.002355,0.002798,0.000073,0.005145,0.001801,0.003561,0.000346,0.001481,0.005366,-0.004297,0.014532,0.001663,-0.066112,0.003808,-0.009064,-0.002574,0.001408,0.028039,0.023003,0.028897,0.013237
2024-01-22,0.005597,0.002359,0.007711,-0.021466,-1.027010e-07,0.001146,-0.000079,0.014528,0.021008,0.009058,0.000132,0.015222,0.007803,0.007010,-0.006643,0.004149,,0.004456,0.004083,0.002957,0.007599,0.003315,-0.004805,-0.005033,0.004323,-0.004283,0.004446,0.001902,0.004558,0.000472,0.002029,0.003838,0.002292,0.001499,0.003182,0.001702,0.003501,0.000571,0.003369,0.005972,0.003924,0.006426,0.004984,-0.003639,0.000551,-0.003025,0.000475,-0.000825,0.000954,-0.002097,-0.005002,0.000940,0.000086,-0.012163,-0.004672,-0.008650,0.002252,-0.001330,-0.039604,-0.003496,0.020614,-0.003443,-0.003834,-0.003928,-0.002970,0.038347,0.005478
2024-01-23,-0.003436,0.002864,-0.002922,0.021989,8.589888e-03,0.004158,0.004497,0.001388,-0.003412,-0.010410,-0.004125,-0.001964,-0.003337,0.005493,-0.000457,0.000067,-0.021204,0.001375,-0.000281,0.003061,-0.000783,0.002833,0.011630,0.001951,0.004402,-0.002318,-0.005233,0.010082,-0.003645,-0.000331,-0.001304,-0.008199,-0.005526,0.000612,-0.004426,-0.001446,-0.004099,-0.000190,-0.002526,-0.007536,-0.003653,-0.010402,0.003070,-0.000228,-0.001969,0.000471,0.002750,-0.003628,-0.002571,-0.001050,0.000000,-0.002405,-0.002508,0.007465,-0.007835,0.008171,-0.007677,0.000000,0.012887,0.001778,-0.005217,0.005032,0.034542,0.015115,0.010638,0.003641,0.000000


In [33]:
windows = [5, 10, 20, 40, 60, 100, 180, 240, 360, 480]
window_m = [10, 30, 60, 100, 180]
windows_test = [9]
assets = df_read.columns
##df_2 = add_features(df_read, windows)
df = add_features(df_read, windows)

In [34]:
df = transform_and_pivot_df(df, dateCol)
df

Unnamed: 0,todate,asset,VaR_10,VaR_100,VaR_180,VaR_20,VaR_240,VaR_360,VaR_40,VaR_480,VaR_5,VaR_60,avgreturn_10,avgreturn_100,avgreturn_180,avgreturn_20,avgreturn_240,avgreturn_360,avgreturn_40,avgreturn_480,avgreturn_5,avgreturn_60,momentum_10,momentum_100,momentum_180,momentum_20,momentum_240,momentum_360,momentum_40,momentum_480,momentum_5,momentum_60,skew_10,skew_100,skew_180,skew_20,skew_240,skew_360,skew_40,skew_480,skew_5,skew_60,volatility_10,volatility_100,volatility_180,volatility_20,volatility_240,volatility_360,volatility_40,volatility_480,volatility_5,volatility_60,MACRO_8,MACRO_0,MACRO_1,MACRO_2,MACRO_3,MACRO_4,MACRO_5,MACRO_6,MACRO_7
0,1980-01-03,EQ_1,,,,,,,,,-0.018303,,,,,,,,,,-0.012141,,,,,,,,,,-0.024282,,,,,,,,,,,,,,,,,,,,0.009683,,,0.044173,,0.001396,,0.002396,-0.017317,-0.034213,0.001675
1,1980-01-03,EQ_8,,,,,,,,,-0.018175,,,,,,,,,,-0.016913,,,,,,,,,,-0.033825,,,,,,,,,,,,,,,,,,,,0.001984,,,0.044173,,0.001396,,0.002396,-0.017317,-0.034213,0.001675
2,1980-01-04,EQ_1,,,,,,,,,-0.017618,,,,,,,,,,-0.004001,,,,,,,,,,-0.012002,,,,,,,,,,0.368768,,,,,,,,,,0.015674,,,0.033839,,-0.014684,,0.006773,0.028012,0.027159,-0.001672
3,1980-01-04,EQ_8,,,,,,,,,-0.018035,,,,,,,,,,-0.005523,,,,,,,,,,-0.016570,,,,,,,,,,1.692898,,,,,,,,,,0.019777,,,0.033839,,-0.014684,,0.006773,0.028012,0.027159,-0.001672
4,1980-01-04,FI_11,,,,,,,,,-0.004390,,,,,,,,,,-0.004381,,,,,,,,,,-0.008762,,,,,,,,,,,,,,,,,,,,0.000014,,,0.033839,,-0.014684,,0.006773,0.028012,0.027159,-0.001672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516555,2024-01-24,FXE_0,-0.013816,-0.013255,-0.013552,-0.015094,-0.013094,-0.015543,-0.014970,-0.016625,-0.009632,-0.012926,-0.001469,0.000110,0.000037,-0.001695,0.000311,0.000342,-0.000639,0.000218,0.000494,0.000241,-0.011753,0.010443,0.006360,-0.028819,0.071481,0.117869,-0.022992,0.100473,0.002469,0.013259,-0.954557,-0.252313,-0.277540,-0.906392,-0.340682,-0.278488,-0.412092,-0.233785,-1.496006,-0.266288,0.007897,0.006779,0.007056,0.007060,0.007292,0.008924,0.007224,0.009727,0.007613,0.007058,0.077863,-0.004837,0.009681,0.004011,0.00048,0.032585,0.029895,-0.018139,0.023889
516556,2024-01-24,FXE_1,-0.015158,-0.013868,-0.013179,-0.010652,-0.013179,-0.012409,-0.012797,-0.012163,-0.007202,-0.012319,-0.002559,0.000263,0.000290,-0.001083,0.000514,0.000672,-0.000165,0.000585,0.000452,0.000865,-0.020470,0.024990,0.049641,-0.018411,0.118811,0.232426,-0.005935,0.269746,0.002260,0.047560,-1.097757,-0.294224,-0.402596,-1.135153,-0.600061,-0.604300,-0.932147,-0.615080,-0.576031,-0.532774,0.008292,0.007708,0.007083,0.006661,0.007240,0.006832,0.006888,0.006912,0.006350,0.007175,0.077863,-0.004837,0.009681,0.004011,0.00048,0.032585,0.029895,-0.018139,0.023889
516557,2024-01-24,FXE_2,-0.013177,-0.014512,-0.014599,-0.014628,-0.015116,-0.015720,-0.014690,-0.015907,-0.007779,-0.014512,-0.001711,0.000200,0.000152,-0.001679,-0.000052,-0.000167,-0.000538,-0.000417,0.001622,-0.000268,-0.013687,0.019007,0.025926,-0.028543,-0.012041,-0.057881,-0.019384,-0.192237,0.008109,-0.014715,-0.306785,0.168480,-0.094806,0.036432,-0.049045,0.051639,0.284130,-0.030822,-0.699723,0.398629,0.008352,0.008911,0.009385,0.008178,0.009218,0.009537,0.009507,0.009683,0.007628,0.009432,0.077863,-0.004837,0.009681,0.004011,0.00048,0.032585,0.029895,-0.018139,0.023889
516558,2024-01-24,FXE_3,-0.008017,-0.014883,-0.017286,-0.012194,-0.017286,-0.020238,-0.014293,-0.029202,-0.008096,-0.015367,0.000754,0.001602,-0.000022,0.001760,-0.000157,-0.000536,0.000629,0.000992,0.000363,0.001471,0.006030,0.152231,-0.003820,0.029924,-0.036201,-0.185452,0.022654,0.457477,0.001813,0.080896,0.557180,0.469147,0.431342,0.751401,0.660836,0.168036,0.443771,-0.291931,0.895933,0.547749,0.007864,0.013486,0.012715,0.011333,0.017328,0.017716,0.009923,0.022318,0.009255,0.015388,0.077863,-0.004837,0.009681,0.004011,0.00048,0.032585,0.029895,-0.018139,0.023889


In [50]:
target_days = 9
#lägg till så man kan räkna sharpe för t.ex 7 dagar genom att kalkylera i add_y_col funktioen istället
return_column_shift = 'avgreturn'
volatility_column_shift = 'volatility'
#df = add_y_col(df, dateCol, target_days, return_column_shift, volatility_column_shift)
df_combined = combined_process(df_read, df, dateCol, target_days=9)


           todate  asset  avgreturn_9  volatility_9    VaR_10   VaR_100  \
516540 2024-01-24   FI_5          NaN           NaN -0.004731 -0.003365   
516541 2024-01-24   FI_6          NaN           NaN -0.005349 -0.006664   
516542 2024-01-24   FI_7          NaN           NaN -0.003560 -0.003589   
516543 2024-01-24   FI_8          NaN           NaN -0.004894 -0.006479   
516544 2024-01-24   FI_9          NaN           NaN -0.002238 -0.001820   
516545 2024-01-24  FXD_0          NaN           NaN -0.012331 -0.009373   
516546 2024-01-24  FXD_1          NaN           NaN -0.006629 -0.007090   
516547 2024-01-24  FXD_2          NaN           NaN -0.005483 -0.005425   
516548 2024-01-24  FXD_3          NaN           NaN -0.003052 -0.006340   
516549 2024-01-24  FXD_4          NaN           NaN -0.006259 -0.008098   
516550 2024-01-24  FXD_5          NaN           NaN -0.013693 -0.009011   
516551 2024-01-24  FXD_6          NaN           NaN -0.012686 -0.013006   
516552 2024-01-24  FXD_7 

In [46]:
print(df_combined)

Empty DataFrame
Columns: [todate, asset, VaR_10, VaR_100, VaR_180, VaR_20, VaR_240, VaR_360, VaR_40, VaR_480, VaR_5, VaR_60, avgreturn_10, avgreturn_100, avgreturn_180, avgreturn_20, avgreturn_240, avgreturn_360, avgreturn_40, avgreturn_480, avgreturn_5, avgreturn_60, momentum_10, momentum_100, momentum_180, momentum_20, momentum_240, momentum_360, momentum_40, momentum_480, momentum_5, momentum_60, skew_10, skew_100, skew_180, skew_20, skew_240, skew_360, skew_40, skew_480, skew_5, skew_60, volatility_10, volatility_100, volatility_180, volatility_20, volatility_240, volatility_360, volatility_40, volatility_480, volatility_5, volatility_60, MACRO_8, MACRO_0, MACRO_1, MACRO_2, MACRO_3, MACRO_4, MACRO_5, MACRO_6, MACRO_7, Y]
Index: []


In [None]:

X_train, X_eval, X_test, y_train, y_eval, y_test, X_train_eval, y_train_eval = prepare_training_dataset(df, dateCol, target_col='Y', shuffle=False, train_split=0.25, eval_split=0.25)

In [None]:
param_grid_alpha = {'alpha': [0.1, 1.0, 10.0]}
ridge_best, grid_search = optimize_and_train_ridge(X_train, y_train, X_train_eval, y_train_eval, param_grid_alpha)

In [None]:
param_grid_xgb = {
    'max_depth': [3, 6, 10],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_best, best_params = optimize_and_train_xgb(X_train, y_train, X_eval, y_eval, param_grid_xgb)


In [None]:

param_grid_lgb = {
    'max_depth': [3, 6, 10],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 63, 127, 255]
}

# Call the function with your datasets and hyperparameter grid
lgb_best, best_params = optimize_and_train_lgb(X_train, y_train, X_eval, y_eval, param_grid_lgb)

In [None]:
NN_model, history, X_test_scaled = train_and_evaluate_NN(X_train_eval, y_train_eval, X_eval, y_eval, X_test, y_test)


In [None]:

# Store predictions in a dictionary
predictions = {
    "Ridge Classifier": ridge_best.predict(X_test),
    "XGBoost Classifier": xgb_best.predict(X_test),
    "LightGBM Classifier": lgb_best.predict(X_test),
}   

# Iterate through the dictionary and evaluate each model
for model_name, y_pred in predictions.items():
    print(model_name + ":")
    evaluate_model_performance(y_test, y_pred)
    print("\n")


In [None]:
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, confusion_matrix

y_pred_nn = NN_model.predict(X_test_scaled)
test_loss, test_accuracy = NN_model.evaluate(X_test_scaled, y_test)

y_pred_labels = (y_pred_nn > 0.5).astype(int)

precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred_labels, average='weighted'  # Use 'weighted' for multiclass classification
)
conf_matrix = confusion_matrix(y_test, y_pred_labels)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
mse = mean_squared_error(y_test, y_pred_labels)

# Calculate RMSE
rmse = np.sqrt(mse)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

In [None]:
y_pred = NN_model.predict(X_test_scaled)


In [None]:
labels = np.argmax(y_pred, axis=1)
# print amount of 1s and 0s
print(y_pred)

In [None]:
y_pred_lgb = lgb_best.predict(X_test)
y_pred_lgb

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_labels)
