<a href="https://colab.research.google.com/github/Owenliu50377/UBS-Contest/blob/main/ARIMA%2BLasso.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Load the full dataset; the path is relative to the notebook's working directory
dataset_path = '../data/Final_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Convert Tenor Bucket and Expiry Bucket labels to numerical values
# ('18M' becomes 1.5, 'Ny' becomes N)
bucket_mapping = {'10y': 10, '15y': 15, '18M': 1.5,
                  '1y': 1, '2y': 2, '3y': 3,
                  '4y': 4, '5y': 5, '8y': 8}

# Series.map silently turns every label that is missing from the mapping into
# NaN. Validate both columns BEFORE mutating df so that an unknown label (or an
# accidental second run of this cell, when the columns already hold numbers)
# fails loudly here instead of surfacing later as NaN-related model errors.
mapped_buckets = {}
for bucket_column in ('Tenor Bucket', 'Expiry Bucket'):
    mapped = df[bucket_column].map(bucket_mapping)
    # Values that were present in the data but have no entry in the mapping;
    # values that were already missing in the data stay NaN, as before.
    unmapped_mask = mapped.isna() & df[bucket_column].notna()
    if unmapped_mask.any():
        unmapped_labels = df.loc[unmapped_mask, bucket_column].unique().tolist()
        raise ValueError(
            f"Unmapped values in '{bucket_column}': {unmapped_labels}. "
            "Extend bucket_mapping, or reload df if this cell was already run."
        )
    mapped_buckets[bucket_column] = mapped

for bucket_column, mapped in mapped_buckets.items():
    df[bucket_column] = mapped

# Convert Expiry Date to Expiry Days (whole days between Value Date and Expiry Date)
df['Expiry Date'] = pd.to_datetime(df['Expiry Date'])
df['Value Date'] = pd.to_datetime(df['Value Date'])
df['Expiry Days'] = (df['Expiry Date'] - df['Value Date']).dt.days

# Multiply lower_bound and upper_bound by 100.
# This in-place scaling is not idempotent; the bucket validation above stops a
# repeated run of this cell before it can scale the bounds a second time.
df['lower_bound'] *= 100
df['upper_bound'] *= 100

In [None]:
# Target column and the candidate regressors offered to the Lasso
Y_column = 'Vega'
X_columns = [
    'Zero Rate Shock', 'TV', 'Expiry Bucket', 'Expiry Days',
    'Tenor Bucket', 'Vols', 'pay_frequency', 'maturity',
    'lower_bound', 'upper_bound', 'Daily_Diff_STD', 'Weekly_Diff_STD',
    'Monthly_Diff_STD', 'Mean_CMS', 'STD_CMS',
    'Max_CMS', 'Min_CMS', 'Proportion_Within_Range',
]

# Random hold-out split: 20% of the rows go to the test set, with a fixed seed
feature_frame = df[X_columns]
target_series = df[Y_column]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target_series, test_size=0.2, random_state=42
)

In [None]:
# Initialize Lasso Model
lasso_model = Lasso(alpha=0.1, max_iter=10000)

# Train Lasso Model
lasso_model.fit(X_train, y_train)

# Keep the features whose coefficient is non-zero, each paired with ITS OWN
# coefficient. lasso_model.coef_ has one entry per column of X_columns, so the
# filtering has to happen on the (feature, coef) pairs; zipping the already
# filtered names against the full coef_ array would shift the coefficients onto
# the wrong features as soon as one coefficient is zero.
lasso_coefs = {feature: coef
               for feature, coef in zip(X_columns, lasso_model.coef_)
               if coef != 0}
selected_features = list(lasso_coefs)  # dicts preserve X_columns order

# Print Selected Features and Corresponding Coefficients
print("Lasso selected features and corresponding coefficients:")
print(lasso_coefs)

# Rebuild Data Set with Selected Features plus the identifier/target columns.
# A selected feature may also appear in the fixed list below; dict.fromkeys
# drops the repeats (keeping first-seen order) so that X_selected does not end
# up with duplicate column labels.
selected_columns = list(dict.fromkeys(
    selected_features + ['Zero Rate Shock', 'Expiry Bucket',
                         'Tenor Bucket', 'Value Date',
                         'Trade Name', 'Vega']))
X_selected = df[selected_columns].set_index('Value Date')

In [None]:
# Name of the column that starts every group key
group_by_column = 'Trade Name'

# Build the group key piece by piece:
# <Trade Name>_<Zero Rate Shock>_<Expiry Bucket>_<Tenor Bucket>
group_key = df[group_by_column]
for key_part in ('Zero Rate Shock', 'Expiry Bucket', 'Tenor Bucket'):
    group_key = group_key + '_' + df[key_part].astype(str)
df['Group'] = group_key

In [None]:
# Group-wise Training of ARIMAX Models and Compute MSE
# Per-group results: fitted parameters, test-set MSE, and the names of groups
# that could not be split into a non-empty train and test part.
model_coeffs = []
mse_list = []
skipped_groups = []

# Group-wise Processing
for name, group in df.groupby('Group'):
    # The split below is positional, so the rows must be in chronological order
    # for "first 80% / last 20%" to be a real time-series split. The sort is
    # stable (mergesort), so data that is already ordered is left untouched.
    # Resetting the index gives each series a contiguous 0..n-1 index rather
    # than the scattered row labels inherited from df.
    group = group.sort_values('Value Date', kind='mergesort').reset_index(drop=True)

    # Split into Training and Testing Sets (first 80% of the rows for training)
    train_size = int(len(group) * 0.8)
    train, test = group.iloc[:train_size], group.iloc[train_size:]

    # A one-row group yields train_size == 0; there is nothing to fit on, so
    # record the group and move on instead of crashing the whole run.
    if train.empty or test.empty:
        skipped_groups.append(name)
        continue

    endog_train = train['Vega']
    exog_train = train[selected_features]
    endog_test = test['Vega']
    exog_test = test[selected_features]

    # NOTE(review): 'Zero Rate Shock', 'Expiry Bucket' and 'Tenor Bucket' are
    # part of the group key, so they are constant within a group; if the Lasso
    # selected any of them, their ARIMAX coefficients are presumably poorly
    # identified here - confirm whether they should be dropped from exog.

    # Fit ARIMAX Model: ARIMA(1, 1, 1) with the selected features as regressors
    model = SARIMAX(endog_train, exog=exog_train, order=(1, 1, 1))
    model_fit = model.fit(maxiter=10000, disp=False)

    # Store Model Coefficients
    model_coeffs.append(model_fit.params)

    # Forecast Test Set (one step per held-out row, using its regressors)
    forecast = model_fit.forecast(steps=len(test), exog=exog_test)

    # Compute MSE
    mse = mean_squared_error(endog_test, forecast)
    mse_list.append(mse)

# Only reported when something was actually skipped, so a normal run prints nothing
if skipped_groups:
    print(f"Skipped {len(skipped_groups)} group(s) too small to split: {skipped_groups}")

In [None]:
# Stack the per-group parameter Series into one table (one row per group)
# and average every parameter column across groups
coeff_table = pd.DataFrame(model_coeffs)
average_coeffs = coeff_table.mean(axis=0)
print("Average coefficients for ARIMAX model:")
print(average_coeffs)

# Mean of the per-group test-set MSE values
average_mse = np.mean(mse_list)
print(f"Average MSE for ARIMAX model: {average_mse}")