# Mock Data Creator#

**Setup Helpers**

In [188]:
import numpy as np
import datetime as dt
import pandas
import random

def random_date_range(start_tuple:tuple, end_tuple:tuple, nr:int, format_string:str):
    start_date = dt.date(*start_tuple)
    end_date = dt.date(*end_tuple)
    delta_in_days = (end_date - start_date).days
    if delta_in_days != 0:
        dates = list(map(lambda x: start_date + dt.timedelta(days=np.random.randint(delta_in_days)), range(nr)))  
    else:
        dates = list(map(lambda x: start_date + dt.timedelta(days=np.random.randint(delta_in_days)), range(nr)))    
    formatted_dates = list(map(lambda x: x.strftime(format_string), dates)) 
    return formatted_dates

def corrupt_some_with(x:list, percent:float, wrong:list):
    '''
    Corrupts x with p percent of records selected from wrong
    '''
    new_x = np.array(x)
    n = int( percent / 100 * len(x))
    index = np.random.choice(len(x), n, replace=False)
    target = np.random.choice(wrong, n)
    new_x[index] = target
    return new_x

def generate_from_pattern(pattern:str):
    gen=""
    numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S"\
              "T", "U", "V", "W", "X", "Y", "Z"]
    for p in pattern:
        if p == "#":
            chr = numbers[random.randint(0, len(numbers)-1)]
        elif p == "$":
            chr = letters[random.randint(0, len(letters)-1)]
        else: chr = p
        gen = gen + chr
    return gen

def choose_once_from(choice):
    return choice[random.randint(0, len(choice)-1)]

def generate_float_to_precision(mx:float, p:int):
#    f = round(random.lognormvariate(0, 1)*mx /3, 2)
    f = round(random.gammavariate(1, 0.5)*mx, p)
    return f

def gen_and_flip(x):
    n = generate_float_to_precision(1e6, 2)
    if len(x) > 0:
        if x[-1] == "L": n = -n
    return n
    
def ccy_convert_with_error(x:tuple, ccy_rates_for_aud, p):
    delta = random.normalvariate(0, 0.1)
    if x[1] in ccy_rates_for_aud:
        amount = float(x[0]) * (ccy_rates_for_aud[x[1]] + delta)
    else: amount = -1 * float(x[0])
    return round(amount, p)

**Go**

In [189]:
number_of_records = 10000
nr = number_of_records

record_type = corrupt_some_with(np.random.choice(["B"], nr), 5, [""])

nde_types = ["COL", "MAR", "NONMAR", "ISDA", "CSA"]
nde_type_codes = corrupt_some_with(np.random.choice(nde_types, nr), 5, [""])

saracen_codes  = ["6001", "6280", "9685CU", "2026", "xxxx"]
saracen_code_probabilities = [0.25, 0.2, 0.2, 0.3, 0.05]
legal_entity = corrupt_some_with(np.random.choice(saracen_codes, nr, p=saracen_code_probabilities), 5, [""])
entity_saracen_code = corrupt_some_with(np.random.choice(saracen_codes, nr, p=saracen_code_probabilities), 5, [""])

dates = random_date_range((2000, 1, 1), (2010, 1, 2), nr, "%d/%m/%Y")
dates = corrupt_some_with(dates, 5, ["xx-yy-zzzz", "", "30-02-1900"])

data = np.column_stack((record_type, nde_type_codes, legal_entity, entity_saracen_code, dates))

**Generate csv**

In [190]:
panda = pandas.DataFrame(data)
#np.savetxt("sdi.log", array_data, delimiter="|")
panda.to_csv("sdi.log", header=None, sep="|", index=False)

**First few records**

In [191]:
panda.head()

Unnamed: 0,0,1,2,3,4
0,B,ISDA,6001,9685CU,02/07/2003
1,B,,9685CU,xxxx,13/11/2009
2,B,COL,9685CU,6280,12/11/2009
3,B,ISDA,6280,9685CU,11/05/2009
4,B,NONMAR,6001,2026,09/06/2002


**Last few records**

In [192]:
panda.tail()

Unnamed: 0,0,1,2,3,4
9995,B,COL,2026,6001,06/11/2000
9996,B,NONMAR,2026,6001,11/06/2001
9997,B,NONMAR,2026,9685CU,05/02/2005
9998,B,COL,2026,6280,10/08/2009
9999,B,MAR,9685CU,6280,16/07/2002


In [193]:
%%bash 
cat sdi.log

B|ISDA|6001|9685CU|02/07/2003
B||9685CU|xxxx|13/11/2009
B|COL|9685CU|6280|12/11/2009
B|ISDA|6280|9685CU|11/05/2009
B|NONMAR|6001|2026|09/06/2002
B|ISDA|6001|6280|20/04/2000
B|MAR|2026|2026|07/04/2007
B|COL|2026|6280|18/05/2000
B|ISDA|2026|6001|22/05/2000
B|NONMAR|6001|9685CU|11/10/2003
B||6280|xxxx|15/05/2001
B|CSA|6280|2026|19/07/2006
B|CSA|2026|9685CU|22/03/2007
B|MAR|9685CU|6280|17/01/2009
B||2026||13/02/2003
B|ISDA|2026|6001|10/08/2001
B|COL|2026|9685CU|22/02/2005
B||2026|2026|29/12/2003
B|MAR|6280|2026|14/07/2000
B|NONMAR|6280|2026|19/01/2007
B|NONMAR|6280|6001|22/04/2003
B|CSA|9685CU||07/02/2008
B|MAR|9685CU|9685CU|15/11/2005
B||6001|2026|24/06/2001
B|CSA|9685CU|9685CU|17/05/2008
B|NONMAR|6280|9685CU|21/05/2002
B|NONMAR|2026|9685CU|20/05/2000
B|ISDA|2026|9685CU|30/05/2001
B|MAR|6001||
B|ISDA|6280|6280|22/12/2007
B|CSA|6001|xxxx|11/11/2003
B|COL|9685CU|2026|29/01/2009
B|CSA|6001|6001|14/01/2003
B|ISDA|2026|2026|05/01/2008
B|COL|2026|2026|24/05/2006
B|COL|2026|6280|23/12/2009
B|CSA

**Now for a real example**

![](IMG_3674.JPG)

**Set number of records**

In [194]:
record_count = 100
df = pandas.DataFrame()

**Record type**

In [195]:
record_type = np.random.choice(["B"], record_count)
record_type = corrupt_some_with(record_type, 4, ["","X"])
df["RecordType"] = record_type.astype(str)
df["RecordType"]

0     B
1     B
2     B
3     B
4     B
5     B
6     B
7     B
8     B
9     B
10    B
11    X
12    B
13    B
14    B
15    B
16    B
17    B
18    B
19     
20    B
21    B
22    B
23    B
24    B
25    B
26    B
27    B
28    B
29    B
     ..
70    B
71    B
72    B
73    B
74     
75    B
76    B
77    B
78    B
79    B
80    B
81    B
82    B
83    B
84    B
85    B
86    B
87    B
88    B
89    B
90    B
91    B
92    B
93    B
94    B
95    B
96    B
97    B
98    B
99    B
Name: RecordType, Length: 100, dtype: object

**Legal Entity**

In [196]:
legal_entity = np.random.choice(["HUBAU"], record_count)
legal_entity = corrupt_some_with(legal_entity, 5, ["", "HBEU", "xxx"])
df["LegalEntity"] = legal_entity.astype(str)
df.head()

Unnamed: 0,RecordType,LegalEntity
0,B,HUBAU
1,B,HUBAU
2,B,HUBAU
3,B,
4,B,HUBAU


In [197]:
df.describe()

Unnamed: 0,RecordType,LegalEntity
count,100,100
unique,3,3
top,B,HUBAU
freq,96,95


**Moving faster**

In [198]:
SaracenCode = np.random.choice(["4435"], record_count)
SaracenCode = corrupt_some_with(SaracenCode, 5, [""])
df["SaracenCode"] = SaracenCode.astype(str)

Unknown1 = np.random.choice([""], record_count)
df["Unknown1"] = Unknown1.astype(str) 

ReportingEntity = np.random.choice(["AUHBAP"], record_count)
ReportingEntity = corrupt_some_with (ReportingEntity, 5, ["","AHUBAP"])
df["ReportingEntity"] = ReportingEntity.astype(str)

Account = list(map(lambda x: "0"+generate_from_pattern("####")+"-"+choose_once_from(["A", "L"]), record_type))
Account = corrupt_some_with(Account, 5, ["", "x9999-L", "03400-x"])
df["Account"] = Account.astype(str)

OriginalCCY = np.random.choice(["AUD", "SGD", "CNY"], record_count, p=[0.8, 0.1, 0.1])
OriginalCCY = corrupt_some_with(OriginalCCY, 5, ["", "xxx"])
df["OriginalCCY"] = OriginalCCY.astype(str)

AmountInOriginalCCY = list(map(lambda x: gen_and_flip(x), Account))
AmountInOriginalCCY = corrupt_some_with(AmountInOriginalCCY, 5, ["-999.990"])
df["AmountInOriginalCCY"] = AmountInOriginalCCY.astype(float)

BalanceType = list(map(lambda x: "B", record_type))
BalanceType = corrupt_some_with(BalanceType, 5, ["", "x"])
df["BalanceType"] = BalanceType.astype(str)

Unknown2 = list(map(lambda x: generate_from_pattern("$$#####"), record_type))
Unknown2 = corrupt_some_with(Unknown2, 5, ["", "AA00000"])
df["Unknown2"] = Unknown2.astype(str)

ccy_rates_for_aud = {
    "AUD": 1,
    "CNY": 5.23,
    "SGD": 1.08
}
AmountInReportingCCY = np.asarray(list(map(lambda x: ccy_convert_with_error(x, ccy_rates_for_aud, 3), \
                                           list(zip(AmountInOriginalCCY, OriginalCCY)))))
df["AmountInReportingCCY"] = AmountInReportingCCY.astype(float)

ReportingCCY = np.random.choice(["AUD"], record_count)
ReportingCCY = corrupt_some_with(ReportingCCY, 5, [""])
df["ReportingCCY"] = ReportingCCY.astype(str)

df

Unnamed: 0,RecordType,LegalEntity,SaracenCode,Unknown1,ReportingEntity,Account,OriginalCCY,AmountInOriginalCCY,BalanceType,Unknown2,AmountInReportingCCY,ReportingCCY
0,B,HUBAU,4435,,,08576-A,AUD,23103.64,B,GM26894,19800.599,AUD
1,B,HUBAU,4435,,AUHBAP,06568-L,AUD,-234470.63,B,VY45277,-251994.012,AUD
2,B,HUBAU,4435,,AUHBAP,03400-x,AUD,171381.94,B,YO53316,169606.564,
3,B,,4435,,AUHBAP,06218-A,AUD,267909.15,B,AA00000,247551.907,AUD
4,B,HUBAU,4435,,AUHBAP,02293-L,AUD,-2074117.05,B,CC54014,-1813842.820,AUD
5,B,HUBAU,4435,,AUHBAP,06941-L,AUD,-32063.26,x,IE83926,-32423.995,AUD
6,B,HUBAU,4435,,AUHBAP,08257-L,AUD,-995084.91,B,OO30695,-1107935.140,AUD
7,B,HUBAU,4435,,AUHBAP,05286-L,AUD,-645657.54,B,VQ76475,-627903.619,AUD
8,B,HUBAU,4435,,AUHBAP,08567-A,AUD,459878.63,B,XE42859,428645.046,AUD
9,B,HBEU,,,AUHBAP,06681-A,AUD,18388.37,B,VF09775,25275.632,AUD


![](IMG_3674.JPG)

**Create the file**

In [199]:
df.to_csv("ledger.sdi.log", header=None, sep="|", index=False)
df.to_csv("enhanced.ledger.sdi.log", sep="|", index=False)

## Wait there's more!...##

**How about formatting it as AVRO**?

**Initial Avro schema**

In [200]:
initial_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "ledger",
    "fields": [
        {"name": "RecordType", "type": ["string", "null"]},
        {"name": "SystemId", "type": ["string", "null"]},
        {"name": "SaracenCode", "type": ["string", "null"]},
        {"name": "Unknown1", "type": ["string", "null"]},
        {"name": "ReportingEntity", "type": ["string", "null"]},
        {"name": "Account", "type": ["string", "null"]},
        {"name": "OriginalCCY", "type": ["string", "null"]},
        {"name": "AmountInOriginalCCY", "type": ["float", "null"]},
        {"name": "BalanceType", "type": ["string", "null"]},
        {"name": "Unknown2", "type": ["string", "null"]},
        {"name": "AmountInReportingCCY", "type": ["float", "null"]},
        {"name": "ReportingCCY", "type": ["string", "null"]}
    ]
}

**Write Avro with initial schema**

In [201]:
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

In [202]:
schema = avro.schema.Parse(json.dumps(initial_schema))
writer = DataFileWriter(open("ledger.sdi.avro", "wb"), DatumWriter(), schema)

for _, row in df.iterrows():
    dict = row.to_dict()
    writer.append(dict)
writer.close()

**Read the avro file**

In [203]:
reader = DataFileReader(open("ledger.sdi.avro", "rb"), DatumReader())
avro_data = []  # the whole structure isn't needed in memory
for data in reader:
    avro_data.append(data)
reader.close()
df1 = pandas.DataFrame.from_records(avro_data)
df1.head()

Unnamed: 0,Account,AmountInOriginalCCY,AmountInReportingCCY,BalanceType,OriginalCCY,RecordType,ReportingCCY,ReportingEntity,SaracenCode,SystemId,Unknown1,Unknown2
0,08576-A,23103.64,19800.6,B,AUD,B,AUD,,4435,,,GM26894
1,06568-L,-234470.6,-251994.0,B,AUD,B,AUD,AUHBAP,4435,,,VY45277
2,03400-x,171381.9,169606.6,B,AUD,B,,AUHBAP,4435,,,YO53316
3,06218-A,267909.2,247551.9,B,AUD,B,AUD,AUHBAP,4435,,,AA00000
4,02293-L,-2074117.0,-1813843.0,B,AUD,B,AUD,AUHBAP,4435,,,CC54014


In [204]:
df1.describe()

Unnamed: 0,AmountInOriginalCCY,AmountInReportingCCY
count,100.0,100.0
mean,-22427.45,6092.952
std,792530.5,890329.6
min,-3729006.0,-3449462.0
25%,-273428.7,-293960.4
50%,13441.28,22538.12
75%,340921.4,349599.8
max,2203936.0,3596641.0


In [205]:
df1.shape

(100, 12)

In [206]:
df1.Account.describe()

count         100
unique         98
top       x9999-L
freq            2
Name: Account, dtype: object

# Back to Pandas and the csv#

**Read csv and inspect result**

In [208]:
data = pandas.DataFrame.from_csv("enhanced.ledger.sdi.log", sep="|")
data.describe()

Unnamed: 0,SaracenCode,Unknown1,AmountInOriginalCCY,AmountInReportingCCY
count,95.0,0.0,100.0,100.0
mean,4435.0,,-22427.45,6092.955
std,0.0,,792530.5,890329.6
min,4435.0,,-3729007.0,-3449462.0
25%,4435.0,,-273428.7,-293960.3
50%,4435.0,,13441.28,22538.12
75%,4435.0,,340921.4,349599.8
max,4435.0,,2203937.0,3596641.0


In [209]:
data.head()

Unnamed: 0_level_0,LegalEntity,SaracenCode,Unknown1,ReportingEntity,Account,OriginalCCY,AmountInOriginalCCY,BalanceType,Unknown2,AmountInReportingCCY,ReportingCCY
RecordType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
B,HUBAU,4435.0,,,08576-A,AUD,23103.64,B,GM26894,19800.599,AUD
B,HUBAU,4435.0,,AUHBAP,06568-L,AUD,-234470.63,B,VY45277,-251994.012,AUD
B,HUBAU,4435.0,,AUHBAP,03400-x,AUD,171381.94,B,YO53316,169606.564,
B,,4435.0,,AUHBAP,06218-A,AUD,267909.15,B,AA00000,247551.907,AUD
B,HUBAU,4435.0,,AUHBAP,02293-L,AUD,-2074117.05,B,CC54014,-1813842.82,AUD
