# Mock Data Creator#

**Setup Helpers**

In [75]:
import numpy as np
import datetime as dt
import pandas
import random

def random_date_range(start_tuple:tuple, end_tuple:tuple, nr:int, format_string:str):
    start_date = dt.date(*start_tuple)
    end_date = dt.date(*end_tuple)
    delta_in_days = (end_date - start_date).days
    if delta_in_days != 0:
        dates = list(map(lambda x: start_date + dt.timedelta(days=np.random.randint(delta_in_days)), range(nr)))  
    else:
        dates = list(map(lambda x: start_date + dt.timedelta(days=np.random.randint(delta_in_days)), range(nr)))    
    formatted_dates = list(map(lambda x: x.strftime(format_string), dates)) 
    return formatted_dates

def corrupt_some_with(x:list, percent:float, wrong:list):
    '''
    Corrupts x with p percent of records selected from wrong
    '''
    new_x = np.array(x)
    n = int( percent / 100 * len(x))
    index = np.random.choice(len(x), n, replace=False)
    target = np.random.choice(wrong, n)
    new_x[index] = target
    return new_x

def generate_from_pattern(pattern:str):
    gen=""
    numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S"\
              "T", "U", "V", "W", "X", "Y", "Z"]
    for p in pattern:
        if p == "#":
            chr = numbers[random.randint(0, len(numbers)-1)]
        elif p == "$":
            chr = letters[random.randint(0, len(letters)-1)]
        else: chr = p
        gen = gen + chr
    return gen

def choose_once_from(choice):
    return choice[random.randint(0, len(choice)-1)]

def generate_float_to_precision(mx:float, p:int):
#    f = round(random.lognormvariate(0, 1)*mx /3, 2)
    f = round(random.gammavariate(1, 0.5)*mx, p)
    return f

def gen_and_flip(x):
    n = generate_float_to_precision(1e6, 2)
    if len(x) > 0:
        if x[-1] == "L": n = -n
    return n
    
def ccy_convert_with_error(x:tuple, ccy_rates_for_aud, p):
    delta = random.normalvariate(0, 0.1)
    if x[1] in ccy_rates_for_aud:
        amount = float(x[0]) * (ccy_rates_for_aud[x[1]] + delta)
    else: amount = -1 * float(x[0])
    return round(amount, p)

**Go**

In [76]:
number_of_records = 10000
nr = number_of_records

record_type = corrupt_some_with(np.random.choice(["B"], nr), 5, [""])

nde_types = ["COL", "MAR", "NONMAR", "ISDA", "CSA"]
nde_type_codes = corrupt_some_with(np.random.choice(nde_types, nr), 5, [""])

saracen_codes  = ["6001", "6280", "9685CU", "2026", "xxxx"]
saracen_code_probabilities = [0.25, 0.2, 0.2, 0.3, 0.05]
legal_entity = corrupt_some_with(np.random.choice(saracen_codes, nr, p=saracen_code_probabilities), 5, [""])
entity_saracen_code = corrupt_some_with(np.random.choice(saracen_codes, nr, p=saracen_code_probabilities), 5, [""])

dates = random_date_range((2000, 1, 1), (2010, 1, 2), nr, "%d/%m/%Y")
dates = corrupt_some_with(dates, 5, ["xx-yy-zzzz", "", "30-02-1900"])

data = np.column_stack((record_type, nde_type_codes, legal_entity, entity_saracen_code, dates))

**Generate csv**

In [77]:
panda = pandas.DataFrame(data)
#np.savetxt("sdi.log", array_data, delimiter="|")
panda.to_csv("sdi.log", header=None, sep="|", index=False)

**First few records**

In [78]:
panda.head()

Unnamed: 0,0,1,2,3,4
0,B,CSA,6280,xxxx,11/02/2002
1,B,ISDA,6001,6280,25/07/2000
2,B,,6280,6280,19/01/2000
3,B,COL,6001,9685CU,30/06/2003
4,B,CSA,2026,,26/05/2001


**Last few records**

In [79]:
panda.tail()

Unnamed: 0,0,1,2,3,4
9995,B,COL,,xxxx,23/11/2005
9996,B,COL,2026,6280,21/09/2008
9997,B,COL,2026,9685CU,02/10/2000
9998,B,CSA,9685CU,6280,18/12/2004
9999,B,ISDA,2026,6001,21/10/2006


In [80]:
%%bash 
cat sdi.log

B|CSA|6280|xxxx|11/02/2002
B|ISDA|6001|6280|25/07/2000
B||6280|6280|19/01/2000
B|COL|6001|9685CU|30/06/2003
B|CSA|2026||26/05/2001
B|NONMAR|9685CU|6001|12/11/2009
B|CSA|6001|2026|18/07/2009
B|COL||2026|05/01/2006
B|NONMAR|6001|6280|07/07/2006
B|NONMAR|2026|2026|30/11/2006
B|COL|6280|2026|21/04/2007
B||2026|2026|01/04/2008
B|MAR|9685CU|6001|03/05/2003
B|NONMAR|6001|6280|24/12/2000
B|ISDA|9685CU|6001|18/11/2001
B|NONMAR|6001|2026|12/06/2007
B|COL|6280||15/07/2008
|MAR|6001||26/05/2003
B|CSA|2026|9685CU|xx-yy-zzzz
B|CSA|2026|6280|30/07/2007
B|NONMAR|2026|2026|31/08/2007
B|COL|9685CU|2026|10/08/2001
B|CSA|2026|6001|01/03/2004
B|ISDA|xxxx|9685CU|25/11/2006
B|COL|2026|9685CU|20/04/2007
B|NONMAR|2026|6001|24/03/2006
B|CSA|6001|6280|16/02/2007
B|COL|6001|2026|21/01/2005
B|CSA|6280|9685CU|12/10/2006
B|ISDA|6001|2026|16/11/2000
B|COL|6280|2026|22/08/2009
B|NONMAR|6001|xxxx|27/11/2004
B|ISDA|6280|xxxx|12/12/2005
B||9685CU|9685CU|29/08/2003
B|MAR|6001|2026|11/10/2004
B|NONMAR|6001|9685CU|07/09/200

**Now for a real example**

![](IMG_3674.JPG)

**Set number of records**

In [81]:
record_count = 100

**Record type**

In [82]:
record_type = np.random.choice(["B"], record_count)
record_type = corrupt_some_with(record_type, 4, ["","X"])
record_type

array(['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', '', 'B', 'B', 'B', 'B', 'X', 'B', 'B', 'B', '', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'X', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'],
      dtype='<U1')

**Legal Entity**

In [83]:
legal_entity = np.random.choice(["HUBAU"], record_count)
legal_entity = corrupt_some_with(legal_entity, 5, ["", "HBEU", "xxx"])
legal_entity

array(['HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'xxx', 'HUBAU', '',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'xxx', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', '', 'HUBAU', 'HUBAU',
       'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU',
       'HBEU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU', 'HUBAU'],
      dtype='<U5

**Construct a table**

In [84]:
pandas.DataFrame(np.column_stack((record_type, legal_entity))).head()

Unnamed: 0,0,1
0,B,HUBAU
1,B,HUBAU
2,B,HUBAU
3,B,HUBAU
4,B,HUBAU


**Moving faster**

In [85]:
col3 = np.random.choice(["4435"], record_count)
col3 = corrupt_some_with(col3, 5, [""])

col4 = np.random.choice([""], record_count)

col5 = np.random.choice(["AUHBAP"], record_count)
col5 = corrupt_some_with (col5, 5, ["","AHUBAP"])

col6 = list(map(lambda x: "0"+generate_from_pattern("####")+"-"+choose_once_from(["A", "L"]), record_type))
col6 = corrupt_some_with(col6, 5, ["", "x9999-L", "03400-x"])

col7 = np.random.choice(["AUD", "SGD", "CNY"], record_count, p=[0.8, 0.1, 0.1])
col7 = corrupt_some_with(col7, 5, ["", "xxx"])

col8 = list(map(lambda x: gen_and_flip(x), col6))
col8 = corrupt_some_with(col8, 5, ["-999.990"])

col9 = list(map(lambda x: "B", record_type))
col9 = corrupt_some_with(col9, 5, ["", "x"])

col10 = list(map(lambda x: generate_from_pattern("$$#####"), record_type))
col10 = corrupt_some_with(col10, 5, ["", "AA00000"])

ccy_rates_for_aud = {
    "AUD": 1,
    "CNY": 5.23,
    "SGD": 1.08
}
col11 = list(map(lambda x: ccy_convert_with_error(x, ccy_rates_for_aud, 3), list(zip(col8, col7))))

col12 = np.random.choice(["AUD"], record_count)
col12 = corrupt_some_with(col12, 5, [""])

res = pandas.DataFrame(np.column_stack((record_type, legal_entity, col3, col4, col5, col6, col7, col8, col9, \
                                        col10, col11, col12)))

In [86]:
res.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,B,HUBAU,4435,,AUHBAP,01730-L,CNY,-380470.39,B,ZR53800,-2000888.187,AUD
1,B,HUBAU,4435,,AUHBAP,05828-L,AUD,-2033638.97,B,IU49855,-2381096.435,AUD
2,B,HUBAU,4435,,AUHBAP,03519-A,AUD,4268427.32,B,JH07560,4382574.12,AUD
3,B,HUBAU,4435,,AUHBAP,08810-L,AUD,-304970.31,B,YG84702,-301587.56,AUD
4,B,HUBAU,4435,,AUHBAP,02942-A,AUD,234117.4,B,XD39374,240615.972,AUD


![](IMG_3674.JPG)

**Create the file**

In [87]:
res.to_csv("ledger.sdi.log", header=None, sep="|", index=False)

## Wait there's more!...##

**How about formatting it as AVRO**?

**Add technical headings**

In [88]:
res.columns = [
    "RecordType",
    "SystemId",
    "SaracenCode",
    "Unknown1",
    "ReportingEntity",
    "Account",
    "OriginalCCY",
    "AmountInOriginalCCY",
    "BalanceType",
    "Unknown2",
    "AmountInReportingCCY",
    "ReportingCCY"
]
res.head()

Unnamed: 0,RecordType,SystemId,SaracenCode,Unknown1,ReportingEntity,Account,OriginalCCY,AmountInOriginalCCY,BalanceType,Unknown2,AmountInReportingCCY,ReportingCCY
0,B,HUBAU,4435,,AUHBAP,01730-L,CNY,-380470.39,B,ZR53800,-2000888.187,AUD
1,B,HUBAU,4435,,AUHBAP,05828-L,AUD,-2033638.97,B,IU49855,-2381096.435,AUD
2,B,HUBAU,4435,,AUHBAP,03519-A,AUD,4268427.32,B,JH07560,4382574.12,AUD
3,B,HUBAU,4435,,AUHBAP,08810-L,AUD,-304970.31,B,YG84702,-301587.56,AUD
4,B,HUBAU,4435,,AUHBAP,02942-A,AUD,234117.4,B,XD39374,240615.972,AUD


In [102]:
res.describe()

Unnamed: 0,RecordType,SystemId,SaracenCode,Unknown1,ReportingEntity,Account,OriginalCCY,AmountInOriginalCCY,BalanceType,Unknown2,AmountInReportingCCY,ReportingCCY
count,100,100,100,100.0,100,100.0,100,100.0,100,100,100.0,100
unique,3,4,2,1.0,3,97.0,5,96.0,3,97,100.0,2
top,B,HUBAU,4435,,AUHBAP,,AUD,-999.99,B,AA00000,-54651.54,AUD
freq,96,95,95,100.0,95,3.0,78,5.0,95,4,1.0,95


In [107]:
res.AmountInOriginalCCY.astype('float').describe()

count    1.000000e+02
mean     4.261001e+04
std      7.516918e+05
min     -2.033639e+06
25%     -3.210621e+05
50%     -9.999900e+02
75%      3.107056e+05
max      4.268427e+06
Name: AmountInOriginalCCY, dtype: float64

**Save as sdi with heading**

In [89]:
res.to_csv("enhanced.ledger.sdi.log", sep="|", index=False)

**Initial Avro schema**

In [93]:
initial_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "ledger",
    "fields": [
        {"name": "RecordType", "type": ["string", "null"]},
        {"name": "SystemId", "type": ["string", "null"]},
        {"name": "SaracenCode", "type": ["string", "null"]},
        {"name": "Unknown1", "type": ["string", "null"]},
        {"name": "ReportingEntity", "type": ["string", "null"]},
        {"name": "Account", "type": ["string", "null"]},
        {"name": "OriginalCCY", "type": ["string", "null"]},
        {"name": "AmountInOriginalCCY", "type": ["string", "null"]},
        {"name": "BalanceType", "type": ["string", "null"]},
        {"name": "Unknown2", "type": ["string", "null"]},
        {"name": "AmountInReportingCCY", "type": ["string", "null"]},
        {"name": "ReportingCCY", "type": ["string", "null"]}
    ]
}

**Write Avro with initial schema**

In [91]:
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

In [101]:
schema = avro.schema.Parse(json.dumps(initial_schema))
writer = DataFileWriter(open("ledger.sdi.avro", "wb"), DatumWriter(), schema)
for _, row in res.iterrows():
    print(type(row))
    for r in row:
        print(r, type(r))
    print(row.to_dict())
    writer.append(row.to_dict())
writer.close()

<class 'pandas.core.series.Series'>
B <class 'str'>
HUBAU <class 'str'>
4435 <class 'str'>
 <class 'str'>
AUHBAP <class 'str'>
01730-L <class 'str'>
CNY <class 'str'>
-380470.39 <class 'str'>
B <class 'str'>
ZR53800 <class 'str'>
-2000888.187 <class 'str'>
AUD <class 'str'>
{'RecordType': 'B', 'SystemId': 'HUBAU', 'SaracenCode': '4435', 'Unknown1': '', 'ReportingEntity': 'AUHBAP', 'Account': '01730-L', 'OriginalCCY': 'CNY', 'AmountInOriginalCCY': '-380470.39', 'BalanceType': 'B', 'Unknown2': 'ZR53800', 'AmountInReportingCCY': '-2000888.187', 'ReportingCCY': 'AUD'}
<class 'pandas.core.series.Series'>
B <class 'str'>
HUBAU <class 'str'>
4435 <class 'str'>
 <class 'str'>
AUHBAP <class 'str'>
05828-L <class 'str'>
AUD <class 'str'>
-2033638.97 <class 'str'>
B <class 'str'>
IU49855 <class 'str'>
-2381096.435 <class 'str'>
AUD <class 'str'>
{'RecordType': 'B', 'SystemId': 'HUBAU', 'SaracenCode': '4435', 'Unknown1': '', 'ReportingEntity': 'AUHBAP', 'Account': '05828-L', 'OriginalCCY': 'AUD', 

AA00000 <class 'str'>
-595174.744 <class 'str'>
AUD <class 'str'>
{'RecordType': 'B', 'SystemId': 'HUBAU', 'SaracenCode': '4435', 'Unknown1': '', 'ReportingEntity': 'AUHBAP', 'Account': '07891-L', 'OriginalCCY': 'AUD', 'AmountInOriginalCCY': '-527878.62', 'BalanceType': 'B', 'Unknown2': 'AA00000', 'AmountInReportingCCY': '-595174.744', 'ReportingCCY': 'AUD'}
<class 'pandas.core.series.Series'>
B <class 'str'>
HUBAU <class 'str'>
4435 <class 'str'>
 <class 'str'>
AUHBAP <class 'str'>
07904-A <class 'str'>
SGD <class 'str'>
50152.54 <class 'str'>
B <class 'str'>
STU52374 <class 'str'>
58891.517 <class 'str'>
AUD <class 'str'>
{'RecordType': 'B', 'SystemId': 'HUBAU', 'SaracenCode': '4435', 'Unknown1': '', 'ReportingEntity': 'AUHBAP', 'Account': '07904-A', 'OriginalCCY': 'SGD', 'AmountInOriginalCCY': '50152.54', 'BalanceType': 'B', 'Unknown2': 'STU52374', 'AmountInReportingCCY': '58891.517', 'ReportingCCY': 'AUD'}
<class 'pandas.core.series.Series'>
B <class 'str'>
HUBAU <class 'str'>
4435

2441836.63 <class 'str'>
B <class 'str'>
ZJ73762 <class 'str'>
1752793.456 <class 'str'>
AUD <class 'str'>
{'RecordType': 'B', 'SystemId': 'HUBAU', 'SaracenCode': '', 'Unknown1': '', 'ReportingEntity': 'AUHBAP', 'Account': '05285-A', 'OriginalCCY': 'AUD', 'AmountInOriginalCCY': '2441836.63', 'BalanceType': 'B', 'Unknown2': 'ZJ73762', 'AmountInReportingCCY': '1752793.456', 'ReportingCCY': 'AUD'}
<class 'pandas.core.series.Series'>
B <class 'str'>
HUBAU <class 'str'>
 <class 'str'>
 <class 'str'>
AUHBAP <class 'str'>
06917-A <class 'str'>
AUD <class 'str'>
204813.59 <class 'str'>
B <class 'str'>
JY99499 <class 'str'>
227365.3 <class 'str'>
AUD <class 'str'>
{'RecordType': 'B', 'SystemId': 'HUBAU', 'SaracenCode': '', 'Unknown1': '', 'ReportingEntity': 'AUHBAP', 'Account': '06917-A', 'OriginalCCY': 'AUD', 'AmountInOriginalCCY': '204813.59', 'BalanceType': 'B', 'Unknown2': 'JY99499', 'AmountInReportingCCY': '227365.3', 'ReportingCCY': 'AUD'}
<class 'pandas.core.series.Series'>
B <class 'str