In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.metrics import MeanSquaredError, Accuracy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb
import re
import pickle
import lightgbm as lgb
pd.set_option('display.max_columns', None)

In [2]:

def load_and_preprocess_data(filename, date_col, start_date, seperator, fill, lim):
    df_read = pd.read_csv(filename, sep=seperator)
    
    # Convert the date column to datetime and set it as the index
    df_read[date_col] = pd.to_datetime(df_read[date_col])
    df_read.set_index(date_col, inplace=True)
    
    # Data cleaning: replace commas with periods and convert to float
    for column in df_read.columns:
        if df_read[column].dtype == 'object':
            df_read[column] = df_read[column].str.replace(',', '.').astype(float)
    
    # Start dataset from start date
    df_filtered = df_read[start_date:]

    # Fill missing values
    df_filtered.fillna(fill, limit=lim)
    
    return df_filtered


In [3]:


def add_features(df, windows):

    feature_cols = [col for col in df.columns if 'macro' not in col.lower()]
    macro_cols = [col for col in df.columns if 'MACRO' in col.upper()]
    # Initialize a dictionary to hold all the new feature data
    features_dict = {}


    # Perform rolling calculations for each window size
    for w in windows:
        for col in feature_cols:
            # Create unique feature names for each statistic and window size
            features_dict[f'{col}_VaR_{w}'] = df[col].rolling(window=w, min_periods=int(w//2)).quantile(0.05)
            features_dict[f'{col}_momentum_{w}'] = df[col].rolling(window=w, min_periods=int(w//2)).sum()
            features_dict[f'{col}_avgreturn_{w}'] = df[col].rolling(window=w, min_periods=int(w//2)).mean()
            features_dict[f'{col}_skew_{w}'] = df[col].rolling(window=w, min_periods=int(w//2)).skew()
            features_dict[f'{col}_volatility_{w}'] = df[col].rolling(window=w, min_periods=int(w//2)).std()

    # Convert the dictionary of Series to a DataFrame
    features_df = pd.DataFrame(features_dict, index=df.index)

    # Concatenate 'MACRO' columns to the features DataFrame
    if macro_cols:
        macro_df = df[macro_cols]
        features_df = pd.concat([features_df, macro_df], axis=1)

    return features_df



In [4]:

def transform_and_pivot_df(df, date_col):
    # Reset the index to make the date a regular column
    df_reset = df.reset_index()
    
    # Melt the DataFrame to long format
    long_df = df_reset.melt(id_vars=date_col, var_name='metric', value_name='value')
    
    # Split the 'metric' column to extract components
    split_metrics = long_df['metric'].str.split('_', expand=True)
    
    # Identify 'MACRO' rows
    macro_mask = split_metrics[0] == 'MACRO'
    
    # For non-'MACRO' metrics, define 'asset' and 'metric_type'
    long_df['asset'] = split_metrics[0] + '_' + split_metrics[1]
    long_df['metric_type'] = split_metrics[2] + '_' + split_metrics[3]

    # Reset index on the left-hand side DataFrame slice to ensure alignment
    lhs = long_df.loc[macro_mask, 'metric_type'].reset_index(drop=True)

    # Reset index on the right-hand side Series to ensure alignment
    rhs = (split_metrics.loc[macro_mask, 0] + '_' + split_metrics.loc[macro_mask, 1]).reset_index(drop=True)

    # Assign the values after ensuring both sides have the same length
    lhs = rhs

    # Assign the modified Series back to the original DataFrame (if needed)
    long_df.loc[macro_mask, 'metric_type'] = lhs.values
    
    # For 'MACRO' metrics, adjust 'metric_type' and 'asset'
    long_df.loc[macro_mask, 'metric_type'] = split_metrics.loc[macro_mask, 0] + '_' + split_metrics.loc[macro_mask, 1]
    long_df.loc[macro_mask, 'asset'] = 'MACRO'
    
    # Remove 'MACRO' placeholder rows
    long_df = long_df[long_df['asset'] != 'MACRO']
    
    # Pivot the DataFrame back to wide format
    final_df = long_df.pivot_table(index=[date_col, 'asset'], columns='metric_type', values='value').reset_index()
    
    # Handle 'MACRO' metrics separately
    macro_df = df.filter(regex='^MACRO').copy()
    macro_df[date_col] = df_reset[date_col]
    
    # Merge 'MACRO' metrics back into the final DataFrame
    if date_col in macro_df.columns:
        macro_df = macro_df.drop(columns=date_col)
    
    macro_df = macro_df.reset_index()
    final_df = pd.merge(final_df, macro_df, on=date_col, how='left')
    
    return final_df


In [5]:
def add_y_col(df, date_col, target_days, return_col, volatility_col):
    # Shift the specified return and volatility columns by the target number of days
    df[f'{return_col}_shifted'] = df.groupby('asset')[return_col].shift(-target_days)
    df[f'{volatility_col}_shifted'] = df.groupby('asset')[volatility_col].shift(-target_days)
    
    # Drop rows with NaN values that result from the shift operation
    df = df.dropna()
    
    # Calculate the Sharpe ratio by dividing the shifted return by the shifted volatility
    df['sharpe_ratio'] = df[f'{return_col}_shifted'] / df[f'{volatility_col}_shifted']
    
    # Calculate the mean Sharpe ratio for each date and merge it back into the DataFrame
    sharpe_ratio_mean = df.groupby(date_col)['sharpe_ratio'].mean().rename('sharpe_ratio_mean')
    df = df.merge(sharpe_ratio_mean, on=date_col)
    
    # Create a new binary column 'Y', indicating whether the Sharpe ratio is above the mean for its date
    df['Y'] = np.where(df['sharpe_ratio'] > df['sharpe_ratio_mean'], 1, 0)

    df = df.drop(columns=['sharpe_ratio', 'sharpe_ratio_mean', f'{return_col}_shifted', f'{volatility_col}_shifted'])

    
    return df

In [18]:


def prepare_training_dataset(df, date_col, target_col='Y', shuffle=False, train_split=0.25, eval_split=0.25):
    # Encode categorical variables if necessary (assuming 'asset' is the only categorical variable)
    df_encoded = pd.get_dummies(df, columns=['asset'], drop_first=True)
    
    # Separate features and target variable
    X = df.drop(columns=['Y', 'asset'])

    y = df['Y']

    # Convert date column to datetime if not already done
    X[date_col] = pd.to_datetime(X[date_col])

    if shuffle:
        # Split the data randomly
        train_size = 1 - train_split
        X_temp, X_train, y_temp, y_train = train_test_split(X, y, train_size=train_size)
        X_eval, X_test, y_eval, y_test = train_test_split(X_temp, y_temp, test_size=(2/3))
    else:
        # Split the data sequentially
        train_end_idx = int(len(X) * train_split)
        eval_end_idx = train_end_idx + int(len(X) * eval_split)

        X_train = X.iloc[:train_end_idx]
        y_train = y.iloc[:train_end_idx]
        X_eval = X.iloc[train_end_idx:eval_end_idx]
        y_eval = y.iloc[train_end_idx:eval_end_idx]
        X_test = X.iloc[eval_end_idx:]
        y_test = y.iloc[eval_end_idx:]

    # Drop the date column
    X_train = X_train.drop(date_col, axis=1)
    X_eval = X_eval.drop(date_col, axis=1)
    X_test = X_test.drop(date_col, axis=1)

    # Combine training and evaluation sets for the final model training
    X_train_eval = pd.concat([X_train, X_eval])
    y_train_eval = pd.concat([y_train, y_eval])


    return X_train, X_eval, X_test, y_train, y_eval, y_test, X_train_eval, y_train_eval




In [7]:


def optimize_and_train_ridge(X_train, y_train, X_train_eval, y_train_eval, param_grid, scoring='accuracy', cv=5):

    model = RidgeClassifier()

    # Initialize GridSearchCV with the provided model and parameter grid
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, cv=cv)
    
    # Fit GridSearchCV on the training set
    grid_search.fit(X_train, y_train)
    
    # Print the best parameters and the accuracy on the evaluation set
    print("Best parameters:", grid_search.best_params_)
    print("Best accuracy on evaluation set:", grid_search.best_score_)
    
    # Retrain the model with the best parameters on the combined training and evaluation sets
    model_best = model.__class__(**grid_search.best_params_)
    model_best.fit(X_train_eval, y_train_eval)

    return model_best, grid_search





In [8]:


def evaluate_model_performance(y_true, y_pred):
 

    conf_matrix = confusion_matrix(y_true, y_pred)


    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)


    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Print the performance metrics
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    return conf_matrix, precision, recall, f1, mse, rmse



In [9]:


def optimize_and_train_xgb(X_train, y_train, X_eval, y_eval, param_grid, scoring='accuracy', cv=5, n_jobs=-1, early_stopping_rounds=10):

    # Initialize the XGBoost model
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Perform grid search
    grid_search = GridSearchCV(xgb_model, param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], early_stopping_rounds=early_stopping_rounds, verbose=False)

    # Extract best hyperparameters
    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # Retrain the model with the best parameters on the combined training and evaluation set
    xgb_best = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
    xgb_best.fit(pd.concat([X_train, X_eval]), pd.concat([y_train, y_eval]))

    return xgb_best, best_params

# Define param_grid


In [23]:


def optimize_and_train_lgb(X_train, y_train, X_eval, y_eval, param_grid, scoring='accuracy', cv=5, n_jobs=-1):

    # Initialize the LightGBM model
    lgb_model = lgb.LGBMClassifier()

    # Perform grid search
    grid_search = GridSearchCV(lgb_model, param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    # Extract best hyperparameters
    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # Retrain the model with the best parameters on the combined training and evaluation set
    lgb_best = lgb.LGBMClassifier(**best_params)
    lgb_best.fit(pd.concat([X_train, X_eval]), pd.concat([y_train, y_eval]), eval_set=[(X_eval, y_eval)])

    return lgb_best, best_params




In [11]:


def train_and_evaluate_NN(X_train_eval, y_train_eval, X_eval, y_eval, X_test, y_test, epochs=50, batch_size=32):

    # Initialize the scaler and scale the data
    scaler = StandardScaler()
    X_train_eval_scaled = scaler.fit_transform(X_train_eval)
    X_eval_scaled = scaler.transform(X_eval)
    X_test_scaled = scaler.transform(X_test)

    # Define the model architecture
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_eval_scaled.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

    # Train the model
    history = model.fit(
        X_train_eval_scaled, y_train_eval,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_eval_scaled, y_eval)
    )
    return model, history, X_test_scaled




In [12]:
filename = 'all_data_anonymized.csv'
dateCol = 'todate'
start_date = '1980-01-01'
seperator = ';'
fill = 0
lim = 5
df = load_and_preprocess_data(filename, dateCol, start_date, seperator, fill, lim)


  df_read = pd.read_csv(filename, sep=seperator)


In [13]:
windows = [5, 10, 20, 40, 60, 100, 180, 240, 360, 480]
window_m = [10, 30, 60, 100, 180]
assets = df.columns

df = add_features(df, window_m)

In [14]:
df = transform_and_pivot_df(df, dateCol)


In [15]:
target_days = 9
return_column_shift = 'avgreturn_10'
volatility_column_shift = 'volatility_10'
df = add_y_col(df, dateCol, target_days, return_column_shift, volatility_column_shift)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sharpe_ratio'] = df[f'{return_col}_shifted'] / df[f'{volatility_col}_shifted']


In [19]:

 X_train, X_eval, X_test, y_train, y_eval, y_test, X_train_eval, y_train_eval = prepare_training_dataset(df, dateCol, target_col='Y', shuffle=False, train_split=0.25, eval_split=0.25)

In [20]:
param_grid_alpha = {'alpha': [0.1, 1.0, 10.0]}
ridge_best, grid_search = optimize_and_train_ridge(X_train, y_train, X_train_eval, y_train_eval, param_grid_alpha)

Best parameters: {'alpha': 10.0}
Best accuracy on evaluation set: 0.5245591820194883


In [21]:
param_grid_xgb = {
    'max_depth': [3, 6, 10],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_best, best_params = optimize_and_train_xgb(X_train, y_train, X_eval, y_eval, param_grid_xgb)




Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [24]:

param_grid_lgb = {
    'max_depth': [3, 6, 10],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [7, 15, 31, 63]
}

# Call the function with your datasets and hyperparameter grid
lgb_best, best_params = optimize_and_train_lgb(X_train, y_train, X_eval, y_eval, param_grid_lgb)

[LightGBM] [Info] Number of positive: 47567, number of negative: 49810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8670
[LightGBM] [Info] Number of data points in the train set: 97377, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.488483 -> initscore=-0.046077
[LightGBM] [Info] Start training from score -0.046077
Best hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'num_leaves': 7}
[LightGBM] [Info] Number of positive: 95466, number of negative: 99288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8670
[LightGBM] [Info] Number of data points in the train set: 194754, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pa

In [25]:
NN_model, history, X_test_scaled = train_and_evaluate_NN(X_train_eval, y_train_eval, X_eval, y_eval, X_test, y_test)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 672us/step - accuracy: 0.5403 - loss: 0.2719 - val_accuracy: 0.5575 - val_loss: 0.2444
Epoch 2/50
[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 635us/step - accuracy: 0.5596 - loss: 0.2449 - val_accuracy: 0.5698 - val_loss: 0.2416
Epoch 3/50
[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 629us/step - accuracy: 0.5710 - loss: 0.2421 - val_accuracy: 0.5755 - val_loss: 0.2404
Epoch 4/50
[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 644us/step - accuracy: 0.5784 - loss: 0.2403 - val_accuracy: 0.5741 - val_loss: 0.2398
Epoch 5/50
[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 629us/step - accuracy: 0.5859 - loss: 0.2383 - val_accuracy: 0.5897 - val_loss: 0.2369
Epoch 6/50
[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 641us/step - accuracy: 0.5919 - loss: 0.2368 - val_accuracy: 0.5971 - val_loss: 0.2344
Epoc

In [39]:

# Store predictions in a dictionary
predictions = {
    "Ridge Classifier": ridge_best.predict(X_test),
    "XGBoost Classifier": xgb_best.predict(X_test),
    "LightGBM Classifier": lgb_best.predict(X_test),
}   

# Iterate through the dictionary and evaluate each model
for model_name, y_pred in predictions.items():
    if model_name == "Neural Network":
        y_pred = y_pred.flatten()
    print(model_name + ":")
    evaluate_model_performance(y_test, y_pred)
    print("\n")


[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 399us/step
Ridge Classifier:
Confusion Matrix:
[[58663 40583]
 [51979 43532]]
Precision: 0.5175295726089283
Recall: 0.4557799625174064
F1 Score: 0.4846959794239141
MSE: 0.4752691815955267
RMSE: 0.6893976947999803


XGBoost Classifier:
Confusion Matrix:
[[56632 42614]
 [49115 46396]]
Precision: 0.5212448039546118
Recall: 0.48576603741977364
F1 Score: 0.5028804309536584
MSE: 0.4709920567681778
RMSE: 0.6862886104024879


LightGBM Classifier:
Confusion Matrix:
[[56965 42281]
 [49269 46242]]
Precision: 0.5223727166950962
Recall: 0.4841536576938782
F1 Score: 0.5025375745786105
MSE: 0.47007296271764304
RMSE: 0.6856186715059932


Neural Network:
Confusion Matrix:
[[99246     0]
 [95511     0]]
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
MSE: 0.4904111277129962
RMSE: 0.7002936010795731




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
 y_pred_labels = np.argmax(NN_model.predict(X_test_scaled),  axis=1) 


[1m6087/6087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 394us/step


In [38]:
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support

precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred_labels, average='weighted'  # Use 'weighted' for multiclass classification
)

# Print the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Precision: 0.2596808187587403
Recall: 0.5095888722870038
F1-Score: 0.34404177657368107


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
