In [14]:
#--------------------------------------------------------------------------------
# Module 1: Imports RT.IRS_data.csv and cleans it by removing all rows which does  not have 
# data in the "Event"-column. Further it removes all headers and corresponding 
# columns which is empty (no data).
#--------------------------------------------------------------------------------

import pandas as pd
import os

# Define file paths using relative paths
input_file_path = "Categorized Data/RT.IRS_Data.csv"
output_file_path = "Cleaned Data/RT.IRS_Clean_v1.csv"

# Get the current working directory
current_directory = os.getcwd()

# Construct absolute file paths
input_file_path = os.path.join(current_directory, input_file_path)
output_file_path = os.path.join(current_directory, output_file_path)

# Load the CSV file into a pandas DataFrame
data = pd.read_csv(input_file_path)

# Remove rows where the "Event" column is empty
data = data.dropna(subset=['Event'])
print(len(data))

# Remove columns where all values are NaN after removing empty rows
data = data.dropna(axis=1, how='all')

# Remove rows where "Event" column starts with two numbers
data = data[~data['Event'].astype(str).str.match(r'^\d{2}')]

# Save the cleaned data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Data cleaned and saved to", output_file_path)


  data = pd.read_csv(input_file_path)


27365
Data cleaned and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v1.csv


In [15]:
#--------------------------------------------------------------------------------
# Module 2: Sort the rows based on the "Event" column 
#--------------------------------------------------------------------------------

# Sort the rows based on the "Event" column
data = data.sort_values(by='Event')

# Save the cleaned and sorted data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Data cleaned, sorted, and saved to", output_file_path)

Data cleaned, sorted, and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v1.csv


In [16]:
#--------------------------------------------------------------------------------
# Module 3: Define parameters
#--------------------------------------------------------------------------------

# Define the variables from the second sheet
variables_info = {
    "Event": "string",
    "Execution Timestamp": "datetime",
    "Dissemination Time": "datetime",
    "Cleared": "string",
    "Collateralization": "string",
    "End-User Exception": "string",
    "Bespoke": "character",
    "Block/Off facility": "character",
    "Execution Venue": "string",
    "UPI": "string",
    "Fixed Float Swap": "string",
    "Contract Type": "string",
    "Effective Date": "date",
    "Maturity Date": "date",
    "Upfront Payment": "int",
    "Upfront Payment Currency": "string",
    "Upfront Payment Date": "date",
    "Settlement Currency": "string",
    "Leg 1 Type": "string",
    "Leg 1 Fixed Rate": "int",
    "Leg 1 Floating Index": "string",
    "Leg 1 Designated Maturity": "string",
    "Leg 1 Spread": "float",
    "Leg 1 Day Count Convention": "string",
    "Leg 1 Notional": "int",
    "Leg 1 Notional Currency": "string",
    "Leg 1 Payment Frequency": "string",
    "Leg1 Reset Frequency": "string",
    "Leg 2 Type": "string",
    "Leg 2 Fixed Rate": "float",
    "Leg 2 Floating Index": "string",
    "Leg 2 Designated Maturity": "string",
    "Leg 2 Spread": "float",
    "Leg 2 Day Count Convention": "string",
    "Leg 2 Notional": "float",
    "Leg 2 Notional Currency": "string",
    "Leg 2 Payment Frequency": "string",
    "Leg 2 Reset Frequency": "string",
    "Embedded Option": "character",
    "Option Strike Price": "float",
    "Option Type": "string",
    "Option Family": "string",
    "Option Currency": "string",
    "Option Premium": "float",
    "Option Lockout Period": "date",
    "Option Expiration Date": "date",
    "Asset Class": "string",
    "Rpt ID": "string",
    "Prev Rpt ID": "string",
    "Future Value Notional": "float",
    "Contract Subtype": "string"
}

print("Variables defined.")



Variables defined.


In [17]:
#--------------------------------------------------------------------------------
# Module 4: Adjust columns according to variables_info and save to CSV
#--------------------------------------------------------------------------------

for column, data_type in variables_info.items():
    if column in data.columns:
        if data_type == "string":
            data[column] = data[column].astype(str)
        elif data_type == "int":
            # Convert to numeric with NaN for non-convertible values
            data[column] = pd.to_numeric(data[column], errors='coerce')
            # Optional: Fill NaN with a placeholder like 0 or -1
            # data[column] = data[column].fillna(0).astype('Int64')
        elif data_type == "float":
            data[column] = pd.to_numeric(data[column], errors='coerce')
        elif data_type == "datetime":
            data[column] = pd.to_datetime(data[column], errors='coerce')
        elif data_type == "date":
            data[column] = pd.to_datetime(data[column], errors='coerce').dt.date
        # Add more conditions as needed for other data types


# Define the output file path for the cleaned data
output_file_path = os.path.join(current_directory, "Cleaned Data/RT.IRS_Clean_v2.csv")

# Save the adjusted data to a CSV file
data.to_csv(output_file_path, index=False)

print("Data adjusted and saved to", output_file_path)



Data adjusted and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v2.csv


In [18]:
#--------------------------------------------------------------------------------
# Module 5: Display DataFrame rows separated by data types
#--------------------------------------------------------------------------------

# Group columns by their data types
columns_by_type = {}
for col, dtype in variables_info.items():
    if col in data.columns:
        columns_by_type.setdefault(dtype, []).append(col)

# Print the top 5 rows for each data type
for dtype, cols in columns_by_type.items():
    print(f"Data Type: {dtype}")
    print(data[cols].head(5))
    print("\n")


Data Type: string
            Event    Cleared       Collateralization End-User Exception  \
18570  Allocation    Cleared  One-Way Collateralized                nan   
18569  Allocation    Cleared  One-Way Collateralized                nan   
23513  Allocation    Cleared  One-Way Collateralized                nan   
23515  Allocation    Cleared  One-Way Collateralized                nan   
33310   Amendment  Uncleared        Uncollateralized           End-user   

         Execution Venue                UPI     Contract Type  \
18570  Off Facility Swap                nan  InterestRateSwap   
18569  Off Facility Swap                nan  InterestRateSwap   
23513  Off Facility Swap                nan  InterestRateSwap   
23515  Off Facility Swap                nan  InterestRateSwap   
33310  Off Facility Swap  InterestRate prod          CapFloor   

      Upfront Payment Currency Settlement Currency Leg 1 Type  ...  \
18570                      nan                 USD      Float  ...   


In [19]:
#--------------------------------------------------------------------------------
# Module 6: Conditionally remove selected columns
#   1. Specify the columns to be removed with a True/False flag in a dictionary.
#   2. Use the drop method to remove columns marked as True.
#--------------------------------------------------------------------------------

# Dictionary of columns with True/False flags for removal
columns_to_remove = {
    "Event": False,
    "Execution Timestamp": True,
    "Dissemination Time": True,
    "Cleared": False,
    "Collateralization": False,
    "End-User Exception": False,
    "Bespoke": True,
    "Block/Off facility": False,
    "Execution Venue": False,
    "UPI": True,
    "Fixed Float Swap": False,
    "Contract Type": False,
    "Effective Date": True,
    "Maturity Date": True,
    "Upfront Payment": False,
    "Upfront Payment Currency": False,
    "Upfront Payment Date": True,
    "Settlement Currency": False,
    "Leg 1 Type": False,
    "Leg 1 Fixed Rate": False,
    "Leg 1 Floating Index": False,
    "Leg 1 Designated Maturity": False,
    "Leg 1 Spread": False,
    "Leg 1 Day Count Convention": False,
    "Leg 1 Notional": False,
    "Leg 1 Notional Currency": False,
    "Leg 1 Payment Frequency": False,
    "Leg1 Reset Frequency": False,
    "Leg 2 Type": False,
    "Leg 2 Fixed Rate": False,
    "Leg 2 Floating Index": False,
    "Leg 2 Designated Maturity": True,
    "Leg 2 Spread": True,
    "Leg 2 Day Count Convention": False,
    "Leg 2 Notional": True,
    "Leg 2 Notional Currency": True,
    "Leg 2 Payment Frequency": False,
    "Leg 2 Reset Frequency": False,
    "Embedded Option": False,
    "Option Strike Price": True,
    "Option Type": True,
    "Option Family": True,
    "Option Currency": True,
    "Option Premium": True,
    "Option Lockout Period": True,
    "Option Expiration Date": True,
    "Asset Class": True,
    "Rpt ID": True,
    "Prev Rpt ID": True,
    "Future Value Notional": True,
    "Contract Subtype": False
}

# Remove columns marked as True
removed_columns = [col for col, remove in columns_to_remove.items() if remove and col in data.columns]
data.drop(removed_columns, axis=1, inplace=True)

data.to_csv("Cleaned Data/RT.IRS_Clean_v3.csv", index=False)

# Display the list of removed columns
print("Removed columns:", removed_columns)


Removed columns: ['Execution Timestamp', 'Dissemination Time', 'Bespoke', 'UPI', 'Effective Date', 'Maturity Date', 'Upfront Payment Date', 'Leg 2 Designated Maturity', 'Leg 2 Spread', 'Leg 2 Notional', 'Leg 2 Notional Currency', 'Option Strike Price', 'Option Type', 'Option Family', 'Option Currency', 'Option Premium', 'Option Lockout Period', 'Option Expiration Date', 'Asset Class', 'Rpt ID', 'Prev Rpt ID', 'Future Value Notional']


In [20]:
#--------------------------------------------------------------------------------
# Module 7: Categorize Columns Based on Data Types
#--------------------------------------------------------------------------------

# Categorize columns based on the variables_info dictionary
numerical_columns = [col for col, dtype in variables_info.items() if dtype in ["int", "float"] and col in data.columns]
categorical_columns = [col for col, dtype in variables_info.items() if dtype in ["string", "character"] and col in data.columns]
datetime_columns = [col for col, dtype in variables_info.items() if dtype in ["date", "datetime"] and col in data.columns]

# Display categorized columns
print("Numerical columns: ", numerical_columns)
print("Categorical columns: ", categorical_columns)
print("Datetime columns: ", datetime_columns)


Numerical columns:  ['Upfront Payment', 'Leg 1 Fixed Rate', 'Leg 1 Spread', 'Leg 1 Notional', 'Leg 2 Fixed Rate']
Categorical columns:  ['Event', 'Cleared', 'Collateralization', 'End-User Exception', 'Block/Off facility', 'Execution Venue', 'Contract Type', 'Upfront Payment Currency', 'Settlement Currency', 'Leg 1 Type', 'Leg 1 Floating Index', 'Leg 1 Designated Maturity', 'Leg 1 Day Count Convention', 'Leg 1 Notional Currency', 'Leg 1 Payment Frequency', 'Leg1 Reset Frequency', 'Leg 2 Type', 'Leg 2 Floating Index', 'Leg 2 Day Count Convention', 'Leg 2 Payment Frequency', 'Leg 2 Reset Frequency', 'Embedded Option', 'Contract Subtype']
Datetime columns:  []


In [21]:
#--------------------------------------------------------------------------------
# Module 8: List unique values count for each categorical column 
#   1. Used to understand how extensive the one hot encoding will be
#   2. Reveals if any columns shuld be removed (too many unique values)
#--------------------------------------------------------------------------------

unique_counts = {}

for column in data:
    unique_counts[column] = data[column].nunique()

# Display the counts
for column, count in unique_counts.items():
    print(f"{column}: {count} unique values")


Event: 27 unique values
Cleared: 4 unique values
Collateralization: 4 unique values
End-User Exception: 3 unique values
Block/Off facility: 2 unique values
Execution Venue: 4 unique values
Product: 885 unique values
Contract Type: 4 unique values
Upfront Payment: 645 unique values
Upfront Payment Currency: 2 unique values
Settlement Currency: 27 unique values
Leg 1 Type: 5 unique values
Leg 1 Fixed Rate: 7895 unique values
Leg 1 Floating Index: 46 unique values
Leg 1 Designated Maturity: 453 unique values
Leg 1 Spread: 7 unique values
Leg 1 Day Count Convention: 7 unique values
Leg 1 Notional: 817 unique values
Leg 1 Notional Currency: 27 unique values
Leg 1 Payment Frequency: 10 unique values
Leg1 Reset Frequency: 8 unique values
Leg 2 Type: 5 unique values
Leg 2 Fixed Rate: 91 unique values
Leg 2 Floating Index: 230 unique values
Leg 2 Day Count Convention: 7 unique values
Leg 2 Payment Frequency: 10 unique values
Leg 2 Reset Frequency: 8 unique values
Embedded Option: 2 unique value

In [22]:
#--------------------------------------------------------------------------------
# Module 9: Split Date/Timestamp Columns into Year, Month, Day
#--------------------------------------------------------------------------------

# Function to split date or datetime columns into year, month, and day
def split_date_columns(df, date_or_datetime_cols, removed_cols, numerical_cols):
    for column in date_or_datetime_cols:
        if column not in removed_cols:
            # Determine if the column is date or datetime type
            col_type = variables_info[column]

            # Convert to datetime
            df[column] = pd.to_datetime(df[column], errors='coerce')

            # Add year, month, and day columns
            year_col = f"{column}_year"
            month_col = f"{column}_month"
            day_col = f"{column}_day"
            df[year_col] = df[column].dt.year
            df[month_col] = df[column].dt.month
            df[day_col] = df[column].dt.day
            numerical_cols.extend([year_col, month_col, day_col])

            # For datetime columns, also add hour and minute
            if col_type == "datetime":
                hour_col = f"{column}_hour"
                minute_col = f"{column}_minute"
                df[hour_col] = df[column].dt.hour
                df[minute_col] = df[column].dt.minute
                numerical_cols.extend([hour_col, minute_col])

            # Drop the original column
            df.drop(column, axis=1, inplace=True)

# Apply the function to the dataframe
split_date_columns(data, datetime_columns, removed_columns, numerical_columns)

# Display the updated dataframe
print(data.head())


            Event    Cleared       Collateralization End-User Exception  \
18570  Allocation    Cleared  One-Way Collateralized                nan   
18569  Allocation    Cleared  One-Way Collateralized                nan   
23513  Allocation    Cleared  One-Way Collateralized                nan   
23515  Allocation    Cleared  One-Way Collateralized                nan   
33310   Amendment  Uncleared        Uncollateralized           End-user   

      Block/Off facility    Execution Venue                    Product  \
18570                  N  Off Facility Swap  USD3L-20130619-20230619-2   
18569                  N  Off Facility Swap  USD3L-20130619-20230619-2   
23513                  N  Off Facility Swap  USD3L-20130619-20230619-2   
23515                  N  Off Facility Swap  USD3L-20130619-20230619-2   
33310                  N  Off Facility Swap      InterestRate:CapFloor   

          Contract Type  Upfront Payment Upfront Payment Currency  ...  \
18570  InterestRateSwap       

In [23]:
#--------------------------------------------------------------------------------
# Module 10: Normalization for machine learning suitability
#   1. Normalize numerical features to ensure they have a similar scale.
#   2. One-hot encode categorical features to convert them into a format suitable for the machine learning model.
#--------------------------------------------------------------------------------

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Normalize numerical features
scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# One-hot encode categorical features
data_encoded = pd.get_dummies(data, drop_first=True)

# Display the shape and first few rows of the transformed dataset
data_encoded_shape = data_encoded.shape
data_encoded_head = data_encoded.head()

data_encoded_shape, data_encoded_head


((27326, 1765),
        Upfront Payment  Leg 1 Fixed Rate  Leg 1 Spread  Leg 1 Notional  \
 18570         0.496424          0.002164      0.000000    9.979034e-07   
 18569         0.496424          0.008587      0.000000    9.979034e-07   
 23513         0.496424          0.008587      0.000000    9.979034e-07   
 23515         0.496424          0.002164      0.000000    9.979034e-07   
 33310         0.496424          0.644479      0.753012    1.007988e-05   
 
        Leg 2 Fixed Rate  Event_Amendment  Event_BLENDING_REMNANT  \
 18570          0.013324            False                   False   
 18569          0.003358            False                   False   
 23513          0.003358            False                   False   
 23515          0.013324            False                   False   
 33310          0.003358             True                   False   
 
        Event_Cancellation  Event_Clearing Novation  Event_Compression  ...  \
 18570               False           

In [24]:
#--------------------------------------------------------------------------------
# Module 10.5: Display rows of normalized columns
#   1. Select only normalized (numerical) columns from the dataset.
#   2. Display the first 10 rows of these columns.
#--------------------------------------------------------------------------------

# Selecting only normalized (numerical) columns
normalized_columns = data_encoded[numerical_columns]

# Display the first 10 rows of the normalized columns
normalized_data_head = normalized_columns.head(100)

normalized_data_head


Unnamed: 0,Upfront Payment,Leg 1 Fixed Rate,Leg 1 Spread,Leg 1 Notional,Leg 2 Fixed Rate
18570,0.496424,0.002164,0.000000,9.979034e-07,0.013324
18569,0.496424,0.008587,0.000000,9.979034e-07,0.003358
23513,0.496424,0.008587,0.000000,9.979034e-07,0.003358
23515,0.496424,0.002164,0.000000,9.979034e-07,0.013324
33310,0.496424,0.644479,0.753012,1.007988e-05,0.003358
...,...,...,...,...,...
1065,0.496424,0.008940,0.000000,1.915177e-05,0.003358
1063,0.496424,0.009262,0.000000,2.419171e-05,0.003358
1062,0.496424,0.011285,0.000000,4.233550e-05,0.003358
1069,0.496424,0.011028,0.000000,1.007988e-05,0.003358


In [25]:
#--------------------------------------------------------------------------------
# Module 11: Save Processed Data for Later Use in GAN Model
#--------------------------------------------------------------------------------

import os
import pickle

# Ensure the directory "Processed data" exists
if not os.path.exists("Processed data"):
    os.makedirs("Processed data")

# Save the DataFrame as a pickled file
with open("Processed data/data_encoded.pkl", "wb") as file:
    pickle.dump(data_encoded, file)

# Also save the DataFrame as a CSV file for easy access
data_encoded.to_csv("Processed data/data_encoded.csv", index=False)

print("Data saved in both pickle and CSV formats.")


Data saved in both pickle and CSV formats.
