In [80]:
import pandas as pd
from faker import Faker
import random
import math

## Load the data from data source

In [81]:
df = pd.read_csv('./finance.csv')

In [82]:
df.head(5)

Unnamed: 0,customer,email,age,gender,merchant,category,amount,fraud,timestamp
0,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,35.13,0,2023-01-01 00:00:00
1,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,27.63,0,2023-01-01 08:00:00
2,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,13.46,0,2023-01-01 16:00:00
3,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,28.86,0,2023-01-02 00:00:00
4,C100045114,nmontgomery@example.net,4,M,M151143676,barsandrestaurants,64.99,0,2023-01-02 08:00:00


In [83]:
len(df)

33688

In [84]:
len(df[df['fraud'] == 1])

524

## Goal: Need 50000 rows of synthetic data, with higher fraud and privacy protection

### Step 1: clean data

In [85]:
df = df.dropna()

In [86]:
df = df[df['age'].str.isnumeric()]
df['age'] = df['age'].astype(int)

In [87]:
df['amount'] = df['amount'].astype(float)

In [88]:
df = df[df['gender'].isin(['M', 'F'])]

### Step 2: create a mapping for dependent fields: customer id and email 

In [89]:
# create a zip of customer id and email
customer = dict(zip(df['customer'], df['email']))

In [90]:
for i, c in enumerate(customer.keys()):
    customer[c] = [customer[c]] + [{'id': i}]

In [91]:
customer

{'C100045114': ['nmontgomery@example.net', {'id': 0}],
 'C1000699316': ['knightlauren@example.org', {'id': 1}],
 'C1002759277': ['morgan84@example.net', {'id': 2}],
 'C1004109477': ['smithscott@example.com', {'id': 3}],
 'C1004300450': ['shaunallison@example.net', {'id': 4}],
 'C1005126300': ['andrea42@example.net', {'id': 5}],
 'C1005495267': ['masonjames@example.net', {'id': 6}],
 'C1005806982': ['bmills@example.net', {'id': 7}],
 'C1006176917': ['todd26@example.org', {'id': 8}],
 'C1007572087': ['chris28@example.com', {'id': 9}],
 'C1007790716': ['wallerbrandon@example.net', {'id': 10}],
 'C1008918174': ['isimmons@example.net', {'id': 11}],
 'C1009080922': ['jessicafigueroa@example.net', {'id': 12}],
 'C100992504': ['marywilson@example.net', {'id': 13}],
 'C10105795': ['mckenzieerickson@example.org', {'id': 14}],
 'C1010589026': ['zacharysantos@example.net', {'id': 15}],
 'C1010865894': ['keith50@example.net', {'id': 16}],
 'C1010936270': ['codydeleon@example.net', {'id': 17}],
 'C1

### Step 3: create a mapping for dependent fields: merchant and category

In [92]:
merchant = dict(zip(df['merchant'], df['category']))

In [93]:
for i, m in enumerate(merchant.keys()):
    merchant[m] = [merchant[m]] + [{'id': i}]

In [94]:
merchant

{'M348934600': ['transportation', {'id': 0}],
 'M151143676': ['barsandrestaurants', {'id': 1}],
 'M1198415165': ['wellnessandbeauty', {'id': 2}],
 'M480139044': ['health', {'id': 3}],
 'M1823072687': ['transportation', {'id': 4}],
 'M85975013': ['food', {'id': 5}],
 'M209847108': ['wellnessandbeauty', {'id': 6}],
 'M855959430': ['hyper', {'id': 7}],
 'M1535107174': ['wellnessandbeauty', {'id': 8}],
 'M1600850729': ['fashion', {'id': 9}],
 'M349281107': ['fashion', {'id': 10}],
 'M1946091778': ['wellnessandbeauty', {'id': 11}],
 'M1872033263': ['home', {'id': 12}],
 'M1888755466': ['otherservices', {'id': 13}],
 'M348875670': ['hotelservices', {'id': 14}],
 'M840466850': ['tech', {'id': 15}],
 'M1053599405': ['health', {'id': 16}],
 'M547558035': ['fashion', {'id': 17}],
 'M2122776122': ['home', {'id': 18}],
 'M923029380': ['home', {'id': 19}],
 'M692898500': ['health', {'id': 20}],
 'M1649169323': ['sportsandtoys', {'id': 21}],
 'M980657600': ['sportsandtoys', {'id': 22}],
 'M131368696

### Step 4: Analyse the min-max numerical ID for the customer, merchant, age, and amount

In [95]:
pd.Series(customer.keys()).apply(lambda x: len(x)).min()

8

In [18]:
pd.Series(customer.keys()).apply(lambda x: len(x)).max()

11

In [19]:
pd.Series(merchant.keys()).apply(lambda x: len(x)).min()

8

In [20]:
pd.Series(merchant.keys()).apply(lambda x: len(x)).max()

11

In [21]:
df.age.min(), df.age.max()

(0, 6)

In [22]:
df.amount.min(), df.amount.max()

(0.0, 6888.3)

In [56]:
df.timestamp.min(), df.timestamp.max()

('2023-01-01 00:00:00', '2023-01-26 08:42:03.889555822')

In [96]:
customer_range = pd.Series(customer.keys()).apply(lambda x: len(x)).min(), pd.Series(customer.keys()).apply(lambda x: len(x)).max()
customer_range=[math.pow(10,x) for x in customer_range]

In [97]:
merchant_range = pd.Series(merchant.keys()).apply(lambda x: len(x)).min(), pd.Series(merchant.keys()).apply(lambda x: len(x)).max()
merchant_range = [math.pow(10,x) for x in merchant_range]

In [25]:
age_range = df.age.min(), df.age.max()

In [26]:
amount_range = df.amount.min(), df.amount.max()

In [57]:
timestamp_range = df.timestamp.min(), df.timestamp.max()

In [58]:
customer_range, merchant_range, age_range, amount_range, timestamp_range

((8, 11),
 (8, 11),
 (0, 6),
 (0.0, 6888.3),
 ('2023-01-01 00:00:00', '2023-01-26 08:42:03.889555822'))

### Step 5: Generate 50000 rows of syn data

In [28]:
syn_df = pd.DataFrame(columns=df.columns, index=range(50000))

In [29]:
syn_df

Unnamed: 0,customer,email,age,gender,merchant,category,amount,fraud,timestamp
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
49995,,,,,,,,,
49996,,,,,,,,,
49997,,,,,,,,,
49998,,,,,,,,,


### Step 6: Generate random data for dependent fields using original mapping

In [74]:
total_customers = max([x[1]['id'] for x in customer.values()])+1

In [75]:
total_merchants = max([x[1]['id'] for x in merchant.values()]) +1

In [77]:
fake = Faker()

In [103]:
syn_customer = {
    'id': [f'C{round(random.uniform(*customer_range))}' for _ in range(50000)],
    'email': [fake.email() for _ in range(50000)]
}

In [104]:
syn_merchant = {
    'id': [f'M{round(random.uniform(*merchant_range))}' for _ in range(50000)],
    'category': [random.choice(tuple([x[0] for x in merchant.values()])) for _ in range(50000)]
}

### Step 7: Generate random data for each field

In [36]:
age = [round(random.uniform(*age_range)) for _ in range(50000)]

In [37]:
gender = [random.choice(['M', 'F']) for _ in range(50000)]

In [44]:
amount = [random.uniform(*amount_range) for _ in range(50000)]

In [54]:
fraud = random.choices([0, 1], weights=[0.15, 0.85],k=50000)

In [67]:
timestamp = pd.date_range(*timestamp_range, 50000)

### Step 8: Combine all the data

In [105]:
syn_df['customer'] = syn_customer['id']
syn_df['email'] = syn_customer['email']
syn_df['age'] = age
syn_df['gender'] = gender
syn_df['merchant'] = syn_merchant['id']
syn_df['category'] = syn_merchant['category']
syn_df['amount'] = amount
syn_df['fraud'] = fraud
syn_df['timestamp'] = timestamp

In [106]:
syn_df

Unnamed: 0,customer,email,age,gender,merchant,category,amount,fraud,timestamp
0,C77395562331,ingramjoseph@example.org,2,M,M94843641225,wellnessandbeauty,6407.074444,0,2023-01-01 00:00:00.000000000
1,C67914289182,dcompton@example.org,1,F,M79408136014,leisure,1324.917208,1,2023-01-01 00:00:43.827354338
2,C42794279423,penacharles@example.net,4,M,M4860306650,wellnessandbeauty,3094.339417,0,2023-01-01 00:01:27.654708676
3,C44216027936,william89@example.net,1,F,M47437737467,tech,1665.283673,1,2023-01-01 00:02:11.482063014
4,C31997663343,barnettmichaela@example.org,1,F,M25372697670,contents,3130.694459,1,2023-01-01 00:02:55.309417352
...,...,...,...,...,...,...,...,...,...
49995,C99510420281,tarablack@example.org,1,F,M27691809714,wellnessandbeauty,2656.259104,1,2023-01-26 08:39:08.580138469
49996,C45559484738,thomasashley@example.org,0,M,M34564484727,hotelservices,1451.187341,1,2023-01-26 08:39:52.407492807
49997,C45331207584,edward63@example.org,6,M,M72097655547,travel,4966.442921,1,2023-01-26 08:40:36.234847145
49998,C17668889870,samantha92@example.org,5,F,M90065275742,home,2279.201673,1,2023-01-26 08:41:20.062201483
