## Step 1: Generate Mock Dataset in Python

In [88]:
import pandas as pd
import numpy as np 
from datetime import datetime, timedelta
import random
# Seed both random number generators (stdlib and NumPy) so every run
# of the notebook produces the same mock dataset
random.seed(42)
np.random.seed(42)

# Total number of mock trade records to generate
n = 10000

# Unique, sequential trade IDs: 1, 2, ..., n
trade_ids = [position + 1 for position in range(n)]

In [128]:
# Generate n random trade dates.
# In other words: for each record, start at Jan 1, 2024 and jump forward a
# random whole number of days, anywhere from 0 to 365 (both ends included).
start_date = datetime(2024, 1, 1)
day_offsets = [random.randint(0, 365) for _ in range(n)]
trade_dates = [start_date + timedelta(days=offset) for offset in day_offsets]

In [92]:
# Generate one settlement date and one status per trade date.
# Baseline is trade date + 2 days; with probability 0.15 a trade is marked
# 'Delayed' and settles 1-5 extra days later, otherwise it is 'Settled'.

settlement_dates = []
statuses = []
for td in trade_dates:  # one pass over every trade date
    is_delayed = random.random() < 0.15
    # The extra-delay draw only happens for delayed trades
    extra_days = random.randint(1, 5) if is_delayed else 0
    settlement_dates.append(td + timedelta(days=2 + extra_days))
    statuses.append('Delayed' if is_delayed else 'Settled')

**What the Code Does:**
It goes through each trade_date and calculates a settlement_date. It also assigns a status:

Settled (85% chance): Settlement happens 2 days after the trade (T+2 is a common standard).

Delayed (15% chance): Settlement is late, happening 3 to 7 days after the trade (2 days + a random delay of 1 to 5 extra days).

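`if random.random() < 0.15:` — This line acts as a random filter. `random.random()` returns a number between 0 and 1, so the condition is true about 15% of the time. On average, for every 100 trades, about 15 are marked Delayed and the remaining 85 or so are marked Settled.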



In [94]:
# Trade amounts: n uniform random draws between 1,000 and 1,000,000,
# rounded to 2 decimal places
raw_amounts = np.random.uniform(low=1_000, high=1_000_000, size=n)
amounts = np.round(raw_amounts, 2)

In [96]:
# Add the remaining necessary columns

# Counterparty labels: 'Counterparty_<k>' with k drawn from 1..50 per record
counterparty_numbers = [random.randint(1, 50) for _ in range(n)]
counterparties = [f'Counterparty_{number}' for number in counterparty_numbers]

# Categorical columns are sampled with np.random.choice (not random.choices)
instrument_options = ['Stock', 'Bond', 'FX', 'Derivative']
currency_options = ['USD', 'EUR', 'GBP', 'NGN']
instrument_types = np.random.choice(instrument_options, size=n)
currencies = np.random.choice(currency_options, size=n)

In [104]:
# Assemble the generated columns into a single DataFrame.
# Dict insertion order determines the column order of the frame.
trade_columns = {
    'trade_id': trade_ids,
    'trade_date': trade_dates,
    'settlement_date': settlement_dates,
    'status': statuses,
    'amount': amounts,
    'counterparty': counterparties,
    'instrument_type': instrument_types,
    'currency': currencies,
}
trades_df = pd.DataFrame(trade_columns)


In [102]:
# Report the length of every generated column; they should all be equal
columns_to_check = {
    "trade_ids": trade_ids,
    "trade_dates": trade_dates,
    "settlement_dates": settlement_dates,
    "statuses": statuses,
    "amounts": amounts,
    "counterparties": counterparties,
    "instrument_types": instrument_types,
    "currencies": currencies,
}
for column_name, column_values in columns_to_check.items():
    print(f"Length of {column_name}:", len(column_values))

Length of trade_ids: 10000
Length of trade_dates: 10000
Length of settlement_dates: 10000
Length of statuses: 10000
Length of amounts: 10000
Length of counterparties: 10000
Length of instrument_types: 10000
Length of currencies: 10000


In [106]:
# Quick look at the result: the first five rows, then (rows, columns)
first_rows = trades_df.head(5)
frame_shape = trades_df.shape
print(first_rows)
print("\nDataFrame shape:", frame_shape)

   trade_id trade_date settlement_date   status     amount     counterparty  \
0         1 2024-11-23      2024-11-25  Settled  375165.58  Counterparty_38   
1         2 2024-02-27      2024-02-29  Settled  950763.59  Counterparty_40   
2         3 2024-01-13      2024-01-15  Settled  732261.95  Counterparty_40   
3         4 2024-05-20      2024-05-22  Settled  599059.83  Counterparty_17   
4         5 2024-05-05      2024-05-07  Settled  156862.62  Counterparty_45   

  instrument_type currency  
0              FX      NGN  
1      Derivative      EUR  
2      Derivative      USD  
3            Bond      NGN  
4           Stock      USD  

DataFrame shape: (10000, 8)


In [116]:
# Order the trades chronologically by trade_date so the data looks realistic;
# ignore_index=True renumbers the rows 0..n-1 after sorting

trades_df = trades_df.sort_values(by='trade_date', ignore_index=True)

In [124]:
# Write the dataset to a CSV file, leaving out the DataFrame's row index

csv_path = 'mock_trade_data.csv'
trades_df.to_csv(csv_path, index=False)

In [126]:
# Show the first 10 rows of the sorted dataset
trades_df.head(n=10)

Unnamed: 0,trade_id,trade_date,settlement_date,status,amount,counterparty,instrument_type,currency
0,6254,2024-01-01,2024-01-03,Settled,19063.01,Counterparty_36,FX,GBP
1,180,2024-01-01,2024-01-03,Settled,138383.42,Counterparty_13,Derivative,EUR
2,8293,2024-01-01,2024-01-03,Settled,71845.28,Counterparty_23,FX,EUR
3,3963,2024-01-01,2024-01-03,Settled,29623.78,Counterparty_19,Derivative,NGN
4,6609,2024-01-01,2024-01-03,Settled,936633.65,Counterparty_38,FX,USD
5,7571,2024-01-01,2024-01-03,Settled,367802.46,Counterparty_38,Stock,NGN
6,7912,2024-01-01,2024-01-03,Settled,413498.28,Counterparty_23,Bond,USD
7,3687,2024-01-01,2024-01-03,Settled,929215.57,Counterparty_3,FX,NGN
8,6471,2024-01-01,2024-01-03,Settled,658291.96,Counterparty_21,Derivative,NGN
9,4288,2024-01-01,2024-01-03,Settled,577404.63,Counterparty_48,Stock,NGN
