In [8]:
#--------------------------------------------------------------------------------
# Imports RT.IRS_data.csv and cleans it by removing all rows which does  not have 
# data in the "Event"-column. Further it removes all headers and corresponding 
# columns which is empty (no data).
#--------------------------------------------------------------------------------

import pandas as pd
import os

# Define file paths using relative paths
input_file_path = "Categorized Data/RT.IRS_Data.csv"
output_file_path = "Cleaned Data/RT.IRS_Clean_v1.csv"

# Get the current working directory
current_directory = os.getcwd()

# Construct absolute file paths
input_file_path = os.path.join(current_directory, input_file_path)
output_file_path = os.path.join(current_directory, output_file_path)

# Load the CSV file into a pandas DataFrame
data = pd.read_csv(input_file_path)

# Remove rows where the "Event" column is empty
data = data.dropna(subset=['Event'])

# Remove columns where all values are NaN after removing empty rows
data = data.dropna(axis=1, how='all')

# Remove rows where "Event" column starts with two numbers
data = data[~data['Event'].astype(str).str.match(r'^\d{2}')]

# Save the cleaned data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Data cleaned and saved to", output_file_path)



  data = pd.read_csv(input_file_path)


Data cleaned and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v1.csv


In [9]:
#--------------------------------------------------------------------------------
# Sort the rows based on the "Event" column 
#--------------------------------------------------------------------------------

# Sort the rows based on the "Event" column
data = data.sort_values(by='Event')

# Save the cleaned and sorted data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Data cleaned, sorted, and saved to", output_file_path)


Data cleaned, sorted, and saved to /Users/ollepyk/Documents/GitHub/ME2313-T2/Cleaned Data/RT.IRS_Clean_v1.csv


In [15]:
#--------------------------------------------------------------------------------
# Check the coverage for all the columns in the CSV file and remove columns with coverage less than 20%
#--------------------------------------------------------------------------------

# Calculate the percentage of non-null values in each column
coverage = (data.count() / len(data)) * 100

# Create a DataFrame to store the coverage information
coverage_df = pd.DataFrame({'Column': coverage.index, 'Coverage': coverage.values})

# Sort the coverage DataFrame by coverage percentage in descending order
coverage_df = coverage_df.sort_values(by='Coverage', ascending=False)

# Remove columns with coverage less than 20%
columns_to_keep = coverage_df[coverage_df['Coverage'] >= 20]['Column']
removed_columns_df = coverage_df[coverage_df['Coverage'] < 20]

# Store removed columns information in a separate DataFrame
removed_columns_df = removed_columns_df.rename(columns={'Column': 'Removed Column', 'Coverage': 'Coverage'})
removed_columns_df.index = range(1, len(removed_columns_df) + 1)

filtered_data = data[columns_to_keep]

# Save the cleaned and filtered data to a new CSV file in the Cleaned Data folder
filtered_output_file_path = "Cleaned Data/RT.IRS_Clean_v2.csv"
filtered_data.to_csv(filtered_output_file_path, index=False)

# Print the coverage information
print("Coverage for each column (columns with coverage less than 20% removed):")
print(coverage_df)

# Print the list of removed columns in a formatted way
if not removed_columns_df.empty:
    print("\nRemoved columns:")
    print(removed_columns_df)
else:
    print("\nNo columns were removed.")

print("Filtered data saved to", filtered_output_file_path)


Coverage for each column (columns with coverage less than 20% removed):
                        Column    Coverage
0                        Event  100.000000
13               Maturity Date  100.000000
47                      Rpt ID  100.000000
46                 Asset Class  100.000000
43              Option Premium  100.000000
39         Option Strike Price  100.000000
31   Leg 2 Designated Maturity  100.000000
1          Execution Timestamp  100.000000
24              Leg 1 Notional  100.000000
23  Leg 1 Day Count Convention  100.000000
21   Leg 1 Designated Maturity  100.000000
17         Settlement Currency  100.000000
14             Upfront Payment  100.000000
25     Leg 1 Notional Currency  100.000000
12              Effective Date  100.000000
7           Block/Off facility  100.000000
2           Dissemination Time  100.000000
11               Contract Type  100.000000
3                      Cleared  100.000000
6                      Bespoke  100.000000
10                     Pr