In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import timedelta

In [3]:
# Generating fake data
#
# Builds `df`, a synthetic transactions table with one row per transaction:
# Transaction ID, Timestamp, Amount, Merchants, Location, Customer ID, Fraud.

# Seed both random sources so Restart & Run All regenerates the same draws.
# (The date window is still relative to the day the notebook is run.)
SEED = 42
Faker.seed(SEED)
np.random.seed(SEED)

faker = Faker()
n = 500000

df = pd.DataFrame(np.arange(n), columns=['Transaction ID'])

# Timestamps
# Pick the window start at least one year back so that, after adding up to
# 363 days below, no transaction is dated in the future.
start_date = faker.date_between('-2y','-1y')
start_timestamp = pd.Timestamp(start_date)
# randint's upper bound is exclusive: day offsets are 0..363.
df['Timestamp'] = start_timestamp + pd.to_timedelta(np.random.randint(0,364,n), unit='d')

# Amount
# Normal draw with mean 50 and standard deviation 20; the tail of this
# distribution dips below zero, which is handled after the noise step.
df['Amount'] = np.random.normal(50,20,n)

# Merchants
# A pool of 100 fake company names, sampled uniformly with replacement.
merchants = [faker.company() for _ in range(100)]
df['Merchants'] = np.random.choice(merchants, n)

# Location
# A pool of 50 fake city names, sampled uniformly with replacement.
locations = [faker.city() for _ in range(50)]
df['Location'] = np.random.choice(locations, n)

# Customer ID
# Upper bound is exclusive: IDs fall in 1..49999.
df['Customer ID'] = np.random.randint(1, 50000, n)

# Fraud
# Binary label, drawn independently of every other column: 1 with probability 0.05.
df['Fraud'] = np.random.choice([0,1], n, p=[0.95,0.05])

# Add Noise
# Multiplicative noise (mean 1, std 0.1), then floor at 0.01 so that no
# transaction ends up with a zero or negative amount.
df['Amount'] = (df['Amount'] * np.random.normal(1,0.1,n)).clip(lower=0.01)

In [4]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of wrapped plain text.
df.head()

   Transaction ID  Timestamp     Amount                    Merchants  \
0               0 2024-05-25  56.278477  Martinez, Nolan and Garrett   
1               1 2023-10-06  34.128422  Martinez, Nolan and Garrett   
2               2 2024-04-14  41.022213     Case, Fields and Herring   
3               3 2023-10-18  22.616066                  Taylor-Leon   
4               4 2023-10-09  16.698159     Case, Fields and Herring   

             Location  Customer ID  Fraud  
0   Lake Brandonmouth        29459      0  
1           North Amy        23874      0  
2       Lake Judybury        20749      0  
3          New Pamela        45546      0  
4  North Kimberlystad        34586      1  


In [5]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Transaction ID  500000 non-null  int32         
 1   Timestamp       500000 non-null  datetime64[ns]
 2   Amount          500000 non-null  float64       
 3   Merchants       500000 non-null  object        
 4   Location        500000 non-null  object        
 5   Customer ID     500000 non-null  int32         
 6   Fraud           500000 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(3), object(2)
memory usage: 21.0+ MB


In [6]:
# Write the generated transactions to a CSV file in the working directory.
# The RangeIndex is left out; 'Transaction ID' already identifies each row.
file_name = "transactions.csv"
df.to_csv(path_or_buf=file_name, index=False)

In [11]:
# Show only the transactions whose Fraud label is 1.
df.loc[df['Fraud'].eq(1)]

Unnamed: 0,Transaction ID,Timestamp,Amount,Merchants,Location,Customer ID,Fraud
4,4,2023-10-09,16.698159,"Case, Fields and Herring",North Kimberlystad,34586,1
8,8,2023-06-23,47.422570,Thompson-Jones,Longchester,10424,1
25,25,2023-09-22,57.722784,Nguyen PLC,Crosbymouth,14936,1
34,34,2023-11-28,36.072375,Burton LLC,Kellyshire,22702,1
39,39,2023-09-14,56.134302,Benson-Santiago,Longchester,32805,1
...,...,...,...,...,...,...,...
499962,499962,2023-08-18,45.204056,Diaz-Tate,Tammyville,3654,1
499974,499974,2023-11-07,23.898166,"Sanchez, Myers and Ingram",North Stephanie,16855,1
499978,499978,2023-11-20,63.540268,Thompson-Jones,North Kimberlystad,45557,1
499992,499992,2023-12-02,24.122283,"Romero, Myers and Mcpherson",Port Josemouth,13989,1
