# Mock Data Creator

**Setup Helpers**

In [119]:
import numpy as np
import datetime as dt
import pandas
import random
import re

def random_date_range(start_tuple:tuple, end_tuple:tuple, nr:int, format_string:str):
    '''
    Generates nr random dates between two dates and returns them as strings.

    start_tuple / end_tuple are (year, month, day) tuples passed to dt.date.
    Each date is start + k days with k drawn from np.random.randint, i.e.
    0 <= k < (end - start).days, so the end date itself is never produced.
    When start and end are the same day every generated date is that day.

    Returns a list of nr strings formatted with format_string (strftime syntax).
    Raises ValueError if the end date lies before the start date.
    '''
    start_date = dt.date(*start_tuple)
    end_date = dt.date(*end_tuple)
    delta_in_days = (end_date - start_date).days
    if delta_in_days < 0:
        raise ValueError("end date %s is before start date %s" % (end_date, start_date))
    if delta_in_days == 0:
        # np.random.randint(0) raises, so a zero-length range is handled explicitly.
        dates = [start_date] * nr
    else:
        dates = [start_date + dt.timedelta(days=int(np.random.randint(delta_in_days)))
                 for _ in range(nr)]
    return [d.strftime(format_string) for d in dates]

def corrupt_some_with(x:list, percent:float, wrong:list):
    '''
    Returns a copy of x (as a numpy array) in which percent percent of the
    positions have been overwritten with values drawn from wrong.

    The number of overwritten positions is int(percent / 100 * len(x)); the
    positions are chosen without replacement, the replacement values with
    replacement. The input x itself is not modified.
    '''
    new_x = np.array(x)
    n = int( percent / 100 * len(x))
    index = np.random.choice(len(x), n, replace=False)
    target = np.random.choice(wrong, n)
    # numpy string arrays are fixed-width: assigning a longer string into a
    # narrower array silently truncates it, so widen the copy first.
    same_text_kind = new_x.dtype.kind in "US" and new_x.dtype.kind == target.dtype.kind
    if n > 0 and same_text_kind and target.dtype.itemsize > new_x.dtype.itemsize:
        new_x = new_x.astype(target.dtype)
    new_x[index] = target
    return new_x

def generate_from_pattern(pattern:str):
    '''
    Expands pattern into a random string of the same length.

    Each "#" becomes a random digit 0-9, each "$" a random uppercase letter
    A-Z, and every other character is copied through unchanged.
    '''
    numbers = "0123456789"
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    chars = []
    for p in pattern:
        if p == "#":
            chars.append(random.choice(numbers))
        elif p == "$":
            chars.append(random.choice(letters))
        else:
            chars.append(p)
    return "".join(chars)

def choose_once_from(choice):
    '''
    Returns one element of choice, selected by a random index between 0 and
    len(choice) - 1 inclusive.
    '''
    last_index = len(choice) - 1
    picked = random.randint(0, last_index)
    return choice[picked]

def generate_float_to_precision(mx:float, p:int):
    '''
    Draws a value from random.gammavariate(1, 0.5), scales it by mx and
    returns the result rounded to p decimal places.
    '''
    draw = random.gammavariate(1, 0.5)
    scaled = draw * mx
    return round(scaled, p)

def gen_and_flip(x):
    '''
    Generates a random amount via generate_float_to_precision(1e6, 2) and
    returns it negated when x is non-empty and its last element equals "L";
    otherwise the amount is returned as generated.
    '''
    amount = generate_float_to_precision(1e6, 2)
    if len(x) > 0 and x[-1] == "L":
        return -amount
    return amount
    
def ccy_convert_with_error(x:tuple, ccy_rates_for_aud, p):
    '''
    Converts the amount x[0] using the rate looked up for currency x[1].

    A normally distributed error (mean 0, sigma 0.1) is added to the rate
    before multiplying. If x[1] has no entry in ccy_rates_for_aud, the
    amount is returned with its sign flipped instead. The result is rounded
    to p decimal places.
    '''
    # Drawn up front so the random stream advances whether or not the
    # currency is known.
    delta = random.normalvariate(0, 0.1)
    value, ccy = x[0], x[1]
    if ccy not in ccy_rates_for_aud:
        return round(-1 * float(value), p)
    noisy_rate = ccy_rates_for_aud[ccy] + delta
    return round(float(value) * noisy_rate, p)

**Now for a real example**

![](IMG_3674.JPG)

**Set number of records**

In [120]:
# Number of mock records to generate; every column below is built to this length.
record_count = 100
# Empty frame that the following cells fill one column at a time.
df = pandas.DataFrame()

# Seed both random sources used by the helpers (the stdlib `random` module and
# numpy's global generator) so that Restart & Run All reproduces the same data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

**Record type**

In [121]:
# Start from record_count copies of "B", then overwrite 4 percent of the
# positions with values drawn from ["", "X"].
RecordType = np.random.choice(["B"], record_count)
RecordType = corrupt_some_with(RecordType, 4, ["","X"])
df["RecordType"] = RecordType.astype(str)
# Display the resulting column.
df["RecordType"]

0     B
1     B
2     B
3     B
4     B
5      
6     B
7     B
8     B
9     B
10    B
11    B
12    B
13    B
14    B
15    B
16    B
17    B
18    B
19    B
20    B
21    B
22    B
23    B
24    X
25    B
26    B
27    B
28    B
29    B
     ..
70    B
71    B
72    B
73    B
74    B
75    B
76    B
77    B
78    B
79    B
80     
81    B
82     
83    B
84    B
85    B
86    B
87    B
88    B
89    B
90    B
91    B
92    B
93    B
94    B
95    B
96    B
97    B
98    B
99    B
Name: RecordType, Length: 100, dtype: object

**Source System**

In [122]:
# Every record starts as "HUBAU"; 5 percent of the positions are then
# overwritten with values drawn from ["", "HBEU", "xxx"].
SourceSystemId = np.random.choice(["HUBAU"], record_count)
SourceSystemId = corrupt_some_with(SourceSystemId, 5, ["", "HBEU", "xxx"])
df["SourceSystemId"] = SourceSystemId.astype(str)
# Preview the first rows of the frame built so far.
df.head()

Unnamed: 0,RecordType,SourceSystemId
0,B,HUBAU
1,B,HUBAU
2,B,HUBAU
3,B,HUBAU
4,B,HUBAU


In [123]:
# Summary of the columns added so far (count / unique / top / freq for string columns).
df.describe()

Unnamed: 0,RecordType,SourceSystemId
count,100,100
unique,3,3
top,B,HUBAU
freq,96,95


**Moving faster**

In [124]:
# Legal entity: one quoted code everywhere, 5 percent of positions blanked.
LegalEntity = np.random.choice(["'4435'"], record_count)
LegalEntity = corrupt_some_with(LegalEntity, 5, [""])
df["LegalEntity"] = LegalEntity.astype(str)

# Reporting entity is an empty string on every record.
ReportingEntity = [""] * record_count
df["ReportingEntity"] = ReportingEntity

# Cost centre: one code everywhere, 5 percent replaced by "" or "AHUBAP".
CostCentre = np.random.choice(["AUHBAP"], record_count)
CostCentre = corrupt_some_with(CostCentre, 5, ["", "AHUBAP"])
df["CostCentre"] = CostCentre.astype(str)

# GL key: "0" + four random digits + "-" + "A" or "L", one per entry of RecordType.
GLKey = [
    "0" + generate_from_pattern("####") + "-" + choose_once_from(["A", "L"])
    for _ in RecordType
]
GLKey = corrupt_some_with(GLKey, 5, ["", "x9999-L", "03400-x"])
df["GLKey"] = GLKey.astype(str)

# Balance currency: weighted choice between three codes, then 5 percent corrupted.
GLBalanceCurrencyCode = np.random.choice(
    ["AUD", "SGD", "CNY"], record_count, p=[0.8, 0.1, 0.1]
)
GLBalanceCurrencyCode = corrupt_some_with(GLBalanceCurrencyCode, 5, ["", "xxx"])
df["GLBalanceCurrencyCode"] = GLBalanceCurrencyCode.astype(str)

# Balance: one random amount per (already corrupted) GL key; gen_and_flip
# negates it for keys ending in "L".
GLBalance = [gen_and_flip(key) for key in GLKey]
GLBalance = corrupt_some_with(GLBalance, 5, ["-999.990"])
df["GLBalance"] = GLBalance.astype(float)

# Banking / trading book flag: "B" or "T", 5 percent replaced by "" or "x".
BankingORTradingBook = np.random.choice(["B", "T"], record_count)
BankingORTradingBook = corrupt_some_with(BankingORTradingBook, 5, ["", "x"])
df["BankingORTradingBook"] = BankingORTradingBook.astype(str)

# Reconciliation key: two random letters followed by five random digits.
GroupReconciliationKey = [generate_from_pattern("$$#####") for _ in RecordType]
GroupReconciliationKey = corrupt_some_with(GroupReconciliationKey, 5, ["", "AA00"])
df["GroupReconciliationKey"] = GroupReconciliationKey.astype(str)

# Rates applied by ccy_convert_with_error; currencies missing from this
# mapping take that helper's sign-flip branch instead.
ccy_rates_for_aud = {
    "AUD": 1,
    "CNY": 5.23,
    "SGD": 1.08
}
GLBalanceInReportingCCY = np.asarray([
    ccy_convert_with_error(balance_and_ccy, ccy_rates_for_aud, 3)
    for balance_and_ccy in zip(GLBalance, GLBalanceCurrencyCode)
])
df["GLBalanceInReportingCCY"] = GLBalanceInReportingCCY.astype(float)

# Reporting currency: "AUD" everywhere, 5 percent replaced by "" or "xxx".
ReportingBalanceCurrencyCode = np.random.choice(["AUD"], record_count)
ReportingBalanceCurrencyCode = corrupt_some_with(ReportingBalanceCurrencyCode, 5, ["", "xxx"])
df["ReportingBalanceCurrencyCode"] = ReportingBalanceCurrencyCode.astype(str)

df

Unnamed: 0,RecordType,SourceSystemId,LegalEntity,ReportingEntity,CostCentre,GLKey,GLBalanceCurrencyCode,GLBalance,BankingORTradingBook,GroupReconciliationKey,GLBalanceInReportingCCY,ReportingBalanceCurrencyCode
0,B,HUBAU,'4435',,AUHBAP,06764-A,AUD,326172.28,T,AO27875,367922.137,AUD
1,B,HUBAU,'4435',,AUHBAP,03404-A,AUD,-999.99,T,AS40306,-950.623,AUD
2,B,HUBAU,,,AUHBAP,06413-L,AUD,-251780.78,T,ZZ41246,-266514.437,AUD
3,B,HUBAU,'4435',,AUHBAP,08116-L,AUD,-43457.35,T,WT99771,-49510.600,AUD
4,B,HUBAU,'4435',,AUHBAP,00690-A,CNY,235924.21,T,HC43842,1236167.478,AUD
5,,HUBAU,'4435',,AUHBAP,06758-L,AUD,-164781.37,T,XA97065,-159472.647,AUD
6,B,HUBAU,,,AUHBAP,00366-L,AUD,-158247.19,T,FD88378,-184802.748,AUD
7,B,HUBAU,'4435',,AUHBAP,08216-L,AUD,-775019.11,T,CW42518,-927775.152,AUD
8,B,HBEU,'4435',,AUHBAP,05609-L,AUD,-66798.14,T,FC05267,-57044.519,AUD
9,B,HUBAU,'4435',,AUHBAP,,xxx,108041.28,B,KU41120,-108041.280,AUD


![](IMG_3674.JPG)

**Create the file**

In [125]:
# Pipe-delimited export without a header row. `header=False` is the documented
# way to suppress the header (the parameter accepts a bool or a list of names).
df.to_csv("ledger.sdi.log", header=False, sep="|", index=False)
# Same data, pipe-delimited, with the column names as the first row.
df.to_csv("enhanced.ledger.sdi.log", sep="|", index=False)

## Wait, there's more!...

**How about formatting it as Avro?**

**Initial Avro schema**

In [126]:
# Avro record schema with one field per DataFrame column. Every field is a
# union of its value type and "null".
# NOTE(review): Avro "float" is a 32-bit single-precision type, so the two
# balance columns lose precision on the round trip through the file; "double"
# (64-bit) would preserve them. Confirm whether the loss is intended here.
initial_schema = {
    "namespace": "example.avro",
    "type": "record",
    "name": "ledger",
    "fields": [
        {"name": "RecordType", "type": ["string", "null"]},
        {"name": "SourceSystemId", "type": ["string", "null"]},
        {"name": "LegalEntity", "type": ["string", "null"]},
        {"name": "ReportingEntity", "type": ["string", "null"]},
        {"name": "CostCentre", "type": ["string", "null"]},
        {"name": "GLKey", "type": ["string", "null"]},
        {"name": "GLBalanceCurrencyCode", "type": ["string", "null"]},
        {"name": "GLBalance", "type": ["float", "null"]},
        {"name": "BankingORTradingBook", "type": ["string", "null"]},
        {"name": "GroupReconciliationKey", "type": ["string", "null"]},
        {"name": "GLBalanceInReportingCCY", "type": ["float", "null"]},
        {"name": "ReportingBalanceCurrencyCode", "type": ["string", "null"]}
    ]
}

**Write Avro with initial schema**

In [127]:
import avro.schema
import json
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

In [128]:
# Parse the schema dict (via its JSON form) and write one Avro record per row.
schema = avro.schema.Parse(json.dumps(initial_schema))
with open("ledger.sdi.avro", "wb") as avro_file:
    writer = DataFileWriter(avro_file, DatumWriter(), schema)
    try:
        for _, row in df.iterrows():
            # Named `record`, not `dict`: rebinding `dict` would shadow the
            # builtin for every later cell in the kernel session.
            record = row.to_dict()
            writer.append(record)
    finally:
        # Always flush and close, even if a row fails to serialise.
        writer.close()

**Read the Avro file**

In [129]:
# Read every record back from the Avro file. All records are held in memory at
# once, which is fine for this small mock file.
with open("ledger.sdi.avro", "rb") as avro_file:
    reader = DataFileReader(avro_file, DatumReader())
    try:
        avro_data = list(reader)
    finally:
        # Close the reader even if decoding a record fails.
        reader.close()
df1 = pandas.DataFrame.from_records(avro_data)
df1.head()

Unnamed: 0,BankingORTradingBook,CostCentre,GLBalance,GLBalanceCurrencyCode,GLBalanceInReportingCCY,GLKey,GroupReconciliationKey,LegalEntity,RecordType,ReportingBalanceCurrencyCode,ReportingEntity,SourceSystemId
0,T,AUHBAP,326172.28125,AUD,367922.1,06764-A,AO27875,'4435',B,AUD,,HUBAU
1,T,AUHBAP,-999.98999,AUD,-950.623,03404-A,AS40306,'4435',B,AUD,,HUBAU
2,T,AUHBAP,-251780.78125,AUD,-266514.4,06413-L,ZZ41246,,B,AUD,,HUBAU
3,T,AUHBAP,-43457.351562,AUD,-49510.6,08116-L,WT99771,'4435',B,AUD,,HUBAU
4,T,AUHBAP,235924.203125,CNY,1236168.0,00690-A,HC43842,'4435',B,AUD,,HUBAU


In [130]:
# Summary statistics of the frame rebuilt from the Avro file.
df1.describe()

Unnamed: 0,GLBalance,GLBalanceInReportingCCY
count,100.0,100.0
mean,24097.09,112324.0
std,621748.7,1458348.0
min,-1962756.0,-3163277.0
25%,-254628.2,-289190.8
50%,-38944.55,-38828.59
75%,225618.9,282629.1
max,2070709.0,11050480.0


In [131]:
# (rows, columns) of the frame read back from Avro.
df1.shape

(100, 12)

In [132]:
# Count, distinct values and most frequent value of the GLKey column after the round trip.
df1.GLKey.describe()

count     100
unique     97
top          
freq        3
Name: GLKey, dtype: object

# Back to pandas and the CSV...

**Refer to DQpy Notebook**