In [1]:
#--------------------------------------------------------------------------------
# Module 1: Check coverage of remaining columns after cleanse
#--------------------------------------------------------------------------------
import pandas as pd

# Load the cleaned dataset
data = pd.read_csv("Cleaned Data/RT.IRS_Clean_v2.csv")

# Per-column share of null cells, expressed as a percentage of the row count
row_count = len(data)
missing_percentage = data.isnull().sum() / row_count * 100

# Rank columns from most to least incomplete (last expression -> cell output)
missing_percentage.sort_values(ascending=False)

Leg 2 Floating Index          7.512991
Leg1 Reset Frequency          7.505672
Leg 2 Reset Frequency         7.480056
Collateralization             3.220376
Leg 2 Day Count Convention    3.004465
Leg 2 Payment Frequency       3.004465
Leg 2 Type                    3.004465
Leg 2 Notional Currency       3.004465
Leg 1 Payment Frequency       2.876381
Leg 1 Type                    2.876381
Execution Venue               2.287199
Future Value Notional         1.496743
Leg 2 Notional                0.076850
Leg 1 Spread                  0.047574
Leg 2 Spread                  0.047574
Leg 1 Fixed Rate              0.025617
Leg 2 Fixed Rate              0.021957
Maturity Date                 0.000000
Product                       0.000000
Event                         0.000000
Cleared                       0.000000
Contract Type                 0.000000
Rpt ID                        0.000000
Asset Class                   0.000000
Option Premium                0.000000
Option Strike Price      

In [2]:
#--------------------------------------------------------------------------------
# Module 2: Fill missing values
#   1. Fill numerical columns with their median (since median is less sensitive to outliers than mean).
#   2. Fill categorical columns with their mode (most frequent value).
#--------------------------------------------------------------------------------

# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Fill missing values in numerical columns with their median.
# Assign the result back instead of calling fillna(..., inplace=True) on
# data[column]: that form is chained assignment, which warns on recent pandas
# and does not modify `data` at all once Copy-on-Write is enabled.
for column in numerical_columns:
    median_value = data[column].median()
    data[column] = data[column].fillna(median_value)

# Fill missing values in categorical columns with their mode (most frequent value)
for column in categorical_columns:
    modes = data[column].mode()
    if modes.empty:
        # Entirely-null column: there is no mode to impute with. Leave it as-is
        # so it shows up in the remaining-missing count below instead of raising.
        continue
    # mode() may return several values on ties; take the first, as before
    data[column] = data[column].fillna(modes.iloc[0])

# Check if there are any remaining missing values (0 means imputation was complete)
remaining_missing = data.isnull().sum().sum()

remaining_missing


0

In [3]:
#--------------------------------------------------------------------------------
# Module 3: Date/timestamp management
#   1. Convert "Maturity Date" and "Execution Timestamp" columns to datetime format.
#   2. Extract relevant features from these datetime columns, such as year, month, day, and so on, which can be used as input to the model.
#--------------------------------------------------------------------------------

# Datetime source column -> (prefix of the derived features, components to extract).
# Insertion order fixes the order of the new columns: Maturity_* first, then Execution_*.
datetime_features = {
    "Maturity Date": ("Maturity", ["year", "month", "day"]),
    "Execution Timestamp": ("Execution", ["year", "month", "day", "hour", "minute"]),
}

for source_column, (prefix, components) in datetime_features.items():
    # errors='coerce' turns any value that cannot be parsed into NaT rather than raising
    parsed = pd.to_datetime(data[source_column], errors='coerce')

    for component in components:
        feature = getattr(parsed.dt, component)
        # NaT rows produce NaN here, *after* the missing-value imputation step has
        # already run. Apply the same policy used for numerical columns (median)
        # so no NaN leaks into scaling or the saved dataset.
        if feature.isna().any():
            feature = feature.fillna(feature.median())
        data[f"{prefix}_{component.capitalize()}"] = feature

# Drop the original datetime columns now that their components are extracted
data = data.drop(columns=list(datetime_features))

# Display the first few rows with the new features
data.head()


Unnamed: 0,Event,Rpt ID,Asset Class,Option Premium,Option Strike Price,Leg 2 Designated Maturity,Leg 1 Notional,Leg 1 Day Count Convention,Leg 1 Designated Maturity,Settlement Currency,...,Leg1 Reset Frequency,Leg 2 Floating Index,Maturity_Year,Maturity_Month,Maturity_Day,Execution_Year,Execution_Month,Execution_Day,Execution_Hour,Execution_Minute
0,Allocation,IRS48266001,Interest Rate,0.0,0.0,10.1444Y,990000.0,ACT/360,10.1444Y,USD,...,3,USD.USD-LIBOR-BBA.3M.USD-LIBOR-BBA,2023.0,6.0,19.0,2022,10,11,19,43
1,Allocation,IRS48265989,Interest Rate,0.0,0.0,10.1444Y,990000.0,30/360,10.1444Y,USD,...,MTH,USD-LIBOR-BBA,2023.0,6.0,19.0,2022,10,11,19,43
2,Allocation,IRS48225527,Interest Rate,0.0,0.0,10.1444Y,990000.0,30/360,10.1444Y,USD,...,MTH,USD-LIBOR-BBA,2023.0,6.0,19.0,2022,10,3,21,8
3,Allocation,IRS48225530,Interest Rate,0.0,0.0,10.1444Y,990000.0,ACT/360,10.1444Y,USD,...,3,USD.USD-LIBOR-BBA.3M.USD-LIBOR-BBA,2023.0,6.0,19.0,2022,10,3,21,8
4,Amendment,IRS13003270,Interest Rate,100000.0,0.0,0.0Y,10000000.0,ACT/365,0.0Y,USD,...,6D,USD.USD-LIBOR-BBA.3M.USD-LIBOR-BBA,2016.0,4.0,4.0,2014,7,8,15,25


In [7]:
#--------------------------------------------------------------------------------
# Module 4: Normalization for machine learning suitability
#   1. Normalize numerical features to ensure they have a similar scale.
#   2. One-hot encode categorical features to convert them into a format suitable for the machine learning model.
#--------------------------------------------------------------------------------

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Normalize numerical features to the [0, 1] range.
# Select every numeric dtype rather than only float64/int64: on pandas >= 2.0 the
# .dt accessors (year, month, ...) return int32, and a float64/int64-only filter
# would silently leave those date features unscaled.
numerical_columns = data.select_dtypes(include='number').columns
scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# One-hot encode categorical features (drop_first removes one level per column).
# NOTE(review): the recorded output shape has ~34k columns for ~27k rows, which
# suggests near-unique string columns (e.g. "Rpt ID") are being expanded into
# one dummy per value. Confirm whether such identifier-like columns should be
# dropped or encoded differently before this step.
data_encoded = pd.get_dummies(data, drop_first=True)

# Display the shape and first few rows of the transformed dataset
data_encoded_shape = data_encoded.shape
data_encoded_head = data_encoded.head()

data_encoded_shape, data_encoded_head


((27326, 34096),
    Option Premium  Option Strike Price  Leg 1 Notional  Upfront Payment  \
 0             0.0                  0.0    9.979034e-07         0.496424   
 1             0.0                  0.0    9.979034e-07         0.496424   
 2             0.0                  0.0    9.979034e-07         0.496424   
 3             0.0                  0.0    9.979034e-07         0.496424   
 4             1.0                  0.0    1.007988e-05         0.496424   
 
    Leg 2 Fixed Rate  Leg 1 Fixed Rate  Leg 2 Spread  Leg 1 Spread  \
 0          0.013324          0.002164           0.0      0.000000   
 1          0.003358          0.008587           0.0      0.000000   
 2          0.003358          0.008587           0.0      0.000000   
 3          0.013324          0.002164           0.0      0.000000   
 4          0.003358          0.644479           0.0      0.753012   
 
    Leg 2 Notional  Future Value Notional  ...  \
 0    9.979084e-07                    0.0  ...   
 1 

In [13]:
#--------------------------------------------------------------------------------
# Module 5: Pickle the encoded dataset for later use in the GAN model
#--------------------------------------------------------------------------------

import os
import pickle

# Ensure the directory "Processed data" exists. exist_ok=True makes this a single
# call with no window between checking for the directory and creating it.
os.makedirs("Processed data", exist_ok=True)

# Save the encoded DataFrame as a pickled file
with open("Processed data/data_encoded.pkl", "wb") as file:
    pickle.dump(data_encoded, file)
