In [3]:
#--------------------------------------------------------------------------------
# Module 1: Imports RT.IRS_data.csv and cleans it by removing all rows which does  not have 
# data in the "Event"-column. Further it removes all headers and corresponding 
# columns which is empty (no data).
#--------------------------------------------------------------------------------

import pandas as pd
import os

# Define file paths using relative paths
input_file_path = "Categorized Data/RT.IRS_Data.csv"
output_file_path = "Cleaned Data/RT.IRS_Clean_v1.csv"

# Get the current working directory
current_directory = os.getcwd()

# Construct absolute file paths
input_file_path = os.path.join(current_directory, input_file_path)
output_file_path = os.path.join(current_directory, output_file_path)

# Load the CSV file into a pandas DataFrame
data = pd.read_csv(input_file_path)

# Remove rows where the "Event" column is empty
data = data.dropna(subset=['Event'])

# Remove columns where all values are NaN after removing empty rows
data = data.dropna(axis=1, how='all')

# Remove rows where "Event" column starts with two numbers
data = data[~data['Event'].astype(str).str.match(r'^\d{2}')]

# Save the cleaned data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Data cleaned and saved to", output_file_path)


  data = pd.read_csv(input_file_path)


Data cleaned and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v1.csv


In [4]:
#--------------------------------------------------------------------------------
# Module 2: Sort the rows based on the "Event" column 
#--------------------------------------------------------------------------------

# Sort the rows based on the "Event" column
data = data.sort_values(by='Event')

# Save the cleaned and sorted data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Data cleaned, sorted, and saved to", output_file_path)

Data cleaned, sorted, and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v1.csv


In [6]:
#--------------------------------------------------------------------------------
# Module 3: Define parameters
#--------------------------------------------------------------------------------

# Define the variables from the second sheet
variables_info = {
    "Event": "string",
    "Execution Timestamp": "datetime",
    "Dissemination Time": "datetime",
    "Cleared": "string",
    "Collateralization": "string",
    "End-User Exception": "string",
    "Bespoke": "character",
    "Block/Off facility": "character",
    "Execution Venue": "string",
    "UPI": "string",
    "Fixed Float Swap": "string",
    "Contract Type": "string",
    "Effective Date": "date",
    "Maturity Date": "date",
    "Upfront Payment": "int",
    "Upfront Payment Currency": "string",
    "Upfront Payment Date": "date",
    "Settlement Currency": "string",
    "Leg 1 Type": "string",
    "Leg 1 Fixed Rate": "int",
    "Leg 1 Floating Index": "string",
    "Leg 1 Designated Maturity": "string",
    "Leg 1 Spread": "float",
    "Leg 1 Day Count Convention": "string",
    "Leg 1 Notional": "int",
    "Leg 1 Notional Currency": "string",
    "Leg 1 Payment Frequency": "string",
    "Leg1 Reset Frequency": "string",
    "Leg 2 Type": "string",
    "Leg 2 Fixed Rate": "float",
    "Leg 2 Floating Index": "string",
    "Leg 2 Designated Maturity": "string",
    "Leg 2 Spread": "float",
    "Leg 2 Day Count Convention": "string",
    "Leg 2 Notional": "float",
    "Leg 2 Notional Currency": "string",
    "Leg 2 Payment Frequency": "string",
    "Leg 2 Reset Frequency": "string",
    "Embedded Option": "character",
    "Option Strike Price": "float",
    "Option Type": "string",
    "Option Family": "string",
    "Option Currency": "string",
    "Option Premium": "float",
    "Option Lockout Period": "date",
    "Option Expiration Date": "date",
    "Asset Class": "string",
    "Rpt ID": "string",
    "Prev Rpt ID": "string",
    "Future Value Notional": "float",
    "Contract Subtype": "string"
}

print("Variables defined.")


Variables defined.


In [7]:
#--------------------------------------------------------------------------------
# Module 4: Adjust columns according to variables_info and save to CSV
#--------------------------------------------------------------------------------

for column, data_type in variables_info.items():
    if column in data.columns:
        if data_type == "string":
            data[column] = data[column].astype(str)
        elif data_type == "int":
            # Convert to numeric with NaN for non-convertible values
            data[column] = pd.to_numeric(data[column], errors='coerce')
            # Optional: Fill NaN with a placeholder like 0 or -1
            # data[column] = data[column].fillna(0).astype('Int64')
        elif data_type == "float":
            data[column] = pd.to_numeric(data[column], errors='coerce')
        elif data_type == "datetime":
            data[column] = pd.to_datetime(data[column], errors='coerce')
        elif data_type == "date":
            data[column] = pd.to_datetime(data[column], errors='coerce').dt.date
        # Add more conditions as needed for other data types


# Define the output file path for the cleaned data
output_file_path = os.path.join(current_directory, "Cleaned Data/RT.IRS_Clean_v2.csv")

# Save the adjusted data to a CSV file
data.to_csv(output_file_path, index=False)

print("Data adjusted and saved to", output_file_path)


Data adjusted and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_vX.csv


In [8]:
#--------------------------------------------------------------------------------
# Module 5: Display DataFrame rows separated by data types
#--------------------------------------------------------------------------------

# Group columns by their data types
columns_by_type = {}
for col, dtype in variables_info.items():
    if col in data.columns:
        columns_by_type.setdefault(dtype, []).append(col)

# Print the top 5 rows for each data type
for dtype, cols in columns_by_type.items():
    print(f"Data Type: {dtype}")
    print(data[cols].head(5))
    print("\n")



Data Type: string
            Event    Cleared       Collateralization End-User Exception  \
18570  Allocation    Cleared  One-Way Collateralized                nan   
18569  Allocation    Cleared  One-Way Collateralized                nan   
23513  Allocation    Cleared  One-Way Collateralized                nan   
23515  Allocation    Cleared  One-Way Collateralized                nan   
33310   Amendment  Uncleared        Uncollateralized           End-user   

         Execution Venue                UPI     Contract Type  \
18570  Off Facility Swap                nan  InterestRateSwap   
18569  Off Facility Swap                nan  InterestRateSwap   
23513  Off Facility Swap                nan  InterestRateSwap   
23515  Off Facility Swap                nan  InterestRateSwap   
33310  Off Facility Swap  InterestRate prod          CapFloor   

      Upfront Payment Currency Settlement Currency Leg 1 Type  ...  \
18570                      nan                 USD      Float  ...   


In [9]:
#--------------------------------------------------------------------------------
# Module 6: Divide columns to cat & num
#--------------------------------------------------------------------------------


# Categorize columns based on the variables_info dictionary
numerical_columns = [col for col, dtype in variables_info.items() if dtype in ["int", "float"] and col in data.columns]
categorical_columns = [col for col, dtype in variables_info.items() if dtype in ["string", "character"] and col in data.columns]

# Fill missing values in numerical columns with their median
for column in numerical_columns:
    median_value = data[column].median()
    data[column].fillna(median_value, inplace=True)

# Fill missing values in categorical columns with their mode
for column in categorical_columns:
    mode_value = data[column].mode()[0]
    data[column].fillna(mode_value, inplace=True)

# Identify columns with remaining missing values
columns_with_missing_values = data.columns[data.isnull().any()]

print("Columns with remaining missing values: ", columns_with_missing_values.tolist())
print("Numerical columns: ", numerical_columns)
print("Categorical columns: ", categorical_columns)



Columns with remaining missing values:  ['Effective Date', 'Maturity Date', 'Upfront Payment Date', 'Option Lockout Period', 'Option Expiration Date']
Numerical columns:  ['Upfront Payment', 'Leg 1 Fixed Rate', 'Leg 1 Spread', 'Leg 1 Notional', 'Leg 2 Fixed Rate', 'Leg 2 Spread', 'Leg 2 Notional', 'Option Strike Price', 'Option Premium', 'Future Value Notional']
Categorical columns:  ['Event', 'Cleared', 'Collateralization', 'End-User Exception', 'Bespoke', 'Block/Off facility', 'Execution Venue', 'UPI', 'Contract Type', 'Upfront Payment Currency', 'Settlement Currency', 'Leg 1 Type', 'Leg 1 Floating Index', 'Leg 1 Designated Maturity', 'Leg 1 Day Count Convention', 'Leg 1 Notional Currency', 'Leg 1 Payment Frequency', 'Leg1 Reset Frequency', 'Leg 2 Type', 'Leg 2 Floating Index', 'Leg 2 Designated Maturity', 'Leg 2 Day Count Convention', 'Leg 2 Notional Currency', 'Leg 2 Payment Frequency', 'Leg 2 Reset Frequency', 'Embedded Option', 'Option Type', 'Option Family', 'Option Currency', '

In [10]:
#--------------------------------------------------------------------------------
# Module 7: List unique values count for each categorical column 
#   1. Used to understand how extensive the one hot encoding will be
#   2. Reveals if any columns shuld be removed (too many unique values)
#--------------------------------------------------------------------------------

unique_counts = {}

for column in categorical_columns:
    unique_counts[column] = data[column].nunique()

# Display the counts
for column, count in unique_counts.items():
    print(f"{column}: {count} unique values")


Event: 27 unique values
Cleared: 4 unique values
Collateralization: 4 unique values
End-User Exception: 3 unique values
Bespoke: 2 unique values
Block/Off facility: 2 unique values
Execution Venue: 4 unique values
UPI: 4 unique values
Contract Type: 4 unique values
Upfront Payment Currency: 2 unique values
Settlement Currency: 27 unique values
Leg 1 Type: 5 unique values
Leg 1 Floating Index: 46 unique values
Leg 1 Designated Maturity: 453 unique values
Leg 1 Day Count Convention: 7 unique values
Leg 1 Notional Currency: 27 unique values
Leg 1 Payment Frequency: 10 unique values
Leg1 Reset Frequency: 8 unique values
Leg 2 Type: 5 unique values
Leg 2 Floating Index: 230 unique values
Leg 2 Designated Maturity: 451 unique values
Leg 2 Day Count Convention: 7 unique values
Leg 2 Notional Currency: 28 unique values
Leg 2 Payment Frequency: 10 unique values
Leg 2 Reset Frequency: 8 unique values
Embedded Option: 2 unique values
Option Type: 2 unique values
Option Family: 3 unique values
Opt

In [11]:
#--------------------------------------------------------------------------------
# Module 8: Conditionally remove selected columns
#   1. Specify the columns to be removed with a True/False flag in a dictionary.
#   2. Use the drop method to remove columns marked as True.
#--------------------------------------------------------------------------------

# Dictionary of columns with True/False flags for removal
columns_to_remove = {
    "Event": False,
    "Execution Timestamp": False,
    "Dissemination Time": False,
    "Cleared": False,
    "Collateralization": False,
    "End-User Exception": False,
    "Bespoke": False,
    "Block/Off facility": False,
    "Execution Venue": False,
    "UPI": True,
    "Fixed Float Swap": False,
    "Contract Type": False,
    "Effective Date": False,
    "Maturity Date": False,
    "Upfront Payment": False,
    "Upfront Payment Currency": False,
    "Upfront Payment Date": False,
    "Settlement Currency": False,
    "Leg 1 Type": False,
    "Leg 1 Fixed Rate": False,
    "Leg 1 Floating Index": False,
    "Leg 1 Designated Maturity": False,
    "Leg 1 Spread": False,
    "Leg 1 Day Count Convention": False,
    "Leg 1 Notional": False,
    "Leg 1 Notional Currency": False,
    "Leg 1 Payment Frequency": False,
    "Leg1 Reset Frequency": False,
    "Leg 2 Type": False,
    "Leg 2 Fixed Rate": False,
    "Leg 2 Floating Index": False,
    "Leg 2 Designated Maturity": False,
    "Leg 2 Spread": False,
    "Leg 2 Day Count Convention": False,
    "Leg 2 Notional": False,
    "Leg 2 Notional Currency": False,
    "Leg 2 Payment Frequency": False,
    "Leg 2 Reset Frequency": False,
    "Embedded Option": False,
    "Option Strike Price": False,
    "Option Type": False,
    "Option Family": False,
    "Option Currency": False,
    "Option Premium": False,
    "Option Lockout Period": False,
    "Option Expiration Date": False,
    "Asset Class": False,
    "Rpt ID": True,
    "Prev Rpt ID": False,
    "Future Value Notional": False,
    "Contract Subtype": False
}

# Remove columns marked as True
removed_columns = [col for col, remove in columns_to_remove.items() if remove and col in data.columns]
data.drop(removed_columns, axis=1, inplace=True)

# Display the list of removed columns
print("Removed columns:", removed_columns)


Removed columns: ['UPI', 'Rpt ID']


In [9]:
#--------------------------------------------------------------------------------
# Module 9: Date/timestamp management
#   1. Convert "Maturity Date" and "Execution Timestamp" columns to datetime format.
#   2. Extract relevant features from these datetime columns.
#   3. Convert derived date features to categorical values.
#--------------------------------------------------------------------------------

#import pandas as pd

# Convert "Maturity Date" and "Execution Timestamp" columns to datetime format
#data["Maturity Date"] = pd.to_datetime(data["Maturity Date"], errors='coerce')
#data["Execution Timestamp"] = pd.to_datetime(data["Execution Timestamp"], errors='coerce')

# Extract features from "Maturity Date"
#data["Maturity_Day"] = data["Maturity Date"].dt.day
#data["Maturity_Month"] = data["Maturity Date"].dt.month
#data["Maturity_Year"] = data["Maturity Date"].dt.year

# Extract features from "Execution Timestamp"
#data["Execution_Day"] = data["Execution Timestamp"].dt.day
#data["Execution_Month"] = data["Execution Timestamp"].dt.month
#data["Execution_Year"] = data["Execution Timestamp"].dt.year

# Convert the derived features to strings, making them categorical
#categorical_columns = ["Maturity_Day", "Maturity_Month", "Maturity_Year", 
#                       "Execution_Day", "Execution_Month", "Execution_Year"]

#for col in categorical_columns:
#    data[col] = data[col].astype(str)

# Drop the original datetime columns
#data.drop(["Maturity Date", "Execution Timestamp"], axis=1, inplace=True)

# Display the first few rows with the new features
#data.head()


In [12]:
#--------------------------------------------------------------------------------
# Module 10: Normalization for machine learning suitability
#   1. Normalize numerical features to ensure they have a similar scale.
#   2. One-hot encode categorical features to convert them into a format suitable for the machine learning model.
#--------------------------------------------------------------------------------

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Normalize numerical features
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# One-hot encode categorical features
data_encoded = pd.get_dummies(data, drop_first=True)

# Display the shape and first few rows of the transformed dataset
data_encoded_shape = data_encoded.shape
data_encoded_head = data_encoded.head()

data_encoded_shape, data_encoded_head


((27326, 8822),
       Execution Timestamp  Dissemination Time  Upfront Payment  \
 18570 2022-10-11 19:43:52 2022-10-11 19:43:52         0.496424   
 18569 2022-10-11 19:43:52 2022-10-11 19:43:52         0.496424   
 23513 2022-10-03 21:08:06 2022-10-03 21:08:06         0.496424   
 23515 2022-10-03 21:08:06 2022-10-03 21:08:06         0.496424   
 33310 2014-07-08 15:25:00 2014-07-08 15:25:00         0.496424   
 
        Leg 1 Fixed Rate  Leg 1 Spread  Leg 1 Notional  Leg 2 Fixed Rate  \
 18570          0.002164      0.000000    9.979034e-07          0.013324   
 18569          0.008587      0.000000    9.979034e-07          0.003358   
 23513          0.008587      0.000000    9.979034e-07          0.003358   
 23515          0.002164      0.000000    9.979034e-07          0.013324   
 33310          0.644479      0.753012    1.007988e-05          0.003358   
 
        Leg 2 Spread  Leg 2 Notional  Option Strike Price  ...  \
 18570           0.0    9.979084e-07                  0.

In [13]:
#--------------------------------------------------------------------------------
# Module 11: Pickle that big boi data for later use in GAN model
#--------------------------------------------------------------------------------

import os
import pickle

# Ensure the directory "Processed data" exists
if not os.path.exists("Processed data"):
    os.makedirs("Processed data")

# Save the DataFrame as a pickled file
with open("Processed data/data_encoded.pkl", "wb") as file:
    pickle.dump(data_encoded, file)
