In [1]:
import random
import numpy as np
import pandas as pd
from faker import Faker

In [2]:
faker = Faker()

# One seed value is pushed into every random source this notebook draws from:
# Python's `random` module, NumPy's legacy global generator and Faker's
# class-level generator.
seed = 42
for seed_source in (random.seed, np.random.seed, Faker.seed):
    seed_source(seed)

In [3]:
# Load the raw transaction export and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="online_retail_sales_dataset.csv")
df.head(n=5)

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,customer_gender,customer_location,total_amount
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,Female,North America,636.33
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,Other,South America,1119.54
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,Other,North America,416.87
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,Other,Australia,705.6
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,Female,Australia,2690.82


In [4]:
# Rows belonging to one customer may carry different ages/genders.
# Keep the first recorded value per customer and write it back onto every
# row of that customer (this updates `df` in place).
customer_ages = df.groupby("customer_id")["customer_age"].first().to_dict()
customer_genders = df.groupby("customer_id")["customer_gender"].first().to_dict()
df["customer_age"] = df["customer_id"].map(customer_ages)
df["customer_gender"] = df["customer_id"].map(customer_genders)

# One generated first/last name per distinct customer, visited in order of
# first appearance so the Faker draws happen in a fixed sequence.
customer_names = {
    customer_id: {
        "first_name": faker.first_name(),
        "last_name": faker.last_name(),
    }
    for customer_id in df["customer_id"].unique()
}

# `assign` returns a new frame, so `df` itself does not gain the name columns.
df_1 = df.assign(
    customer_first_name=df["customer_id"].map(lambda cid: customer_names[cid]["first_name"]),
    customer_last_name=df["customer_id"].map(lambda cid: customer_names[cid]["last_name"]),
)
df_1.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,customer_gender,customer_location,total_amount,customer_first_name,customer_last_name
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,Female,North America,636.33,Danielle,Johnson
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,Other,South America,1119.54,Joshua,Walker
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,Other,North America,416.87,Jill,Rhodes
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,Other,Australia,705.6,Patricia,Miller
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,Female,Australia,2690.82,Robert,Johnson


In [5]:
# Candidate sub-categories for each top-level product category.
category_subcategories = {
    "Beauty & Personal Care": ["Skincare", "Haircare", "Makeup", "Fragrances", "Bath & Body", "Personal Hygiene"],
    "Books": ["Fiction", "Non-Fiction", "Science", "History", "Biography", "Children Literature", "Academic"],
    "Clothing": ["Shirts", "Pants", "Dresses", "Outerwear", "Activewear", "Underwear", "Accessories"],
    "Sports & Outdoors": ["Fitness Equipment", "Camping Gear", "Team Sports", "Water Sports", "Hiking", "Cycling"],
    "Home & Kitchen": ["Cookware", "Appliances", "Furniture", "Bedding", "Storage", "Decor"],
    "Electronics": ["Smartphones", "Laptops", "Audio", "Gaming", "Cameras", "Accessories"],
}

# Draw one sub-category per product from the category on that product's first
# row. `drop_duplicates` keeps first occurrences in their original order, so
# products are visited in order of first appearance.
first_row_per_product = df_1.drop_duplicates(subset="product_id")
product_subcategories = {
    product_id: faker.random_element(category_subcategories[main_category])
    for product_id, main_category in zip(
        first_row_per_product["product_id"],
        first_row_per_product["product_category"],
    )
}

df_2 = df_1.assign(product_subcategory=df_1["product_id"].map(product_subcategories))
df_2.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,customer_gender,customer_location,total_amount,customer_first_name,customer_last_name,product_subcategory
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,Female,North America,636.33,Danielle,Johnson,Furniture
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,Other,South America,1119.54,Joshua,Walker,Accessories
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,Other,North America,416.87,Jill,Rhodes,Fragrances
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,Other,Australia,705.6,Patricia,Miller,Outerwear
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,Female,Australia,2690.82,Robert,Johnson,Children Literature


In [6]:
# A product id may be listed under more than one category. Keep the first
# category recorded for each product on all of its rows (updates `df_2` in place).
product_categories = df_2.groupby("product_id")["product_category"].first().to_dict()
df_2["product_category"] = df_2["product_id"].map(product_categories)

# Categories whose product names follow the "<company> <subcategory> <word>" pattern.
company_branded_categories = {
    "Electronics",
    "Beauty & Personal Care",
    "Sports & Outdoors",
    "Home & Kitchen",
}

# Build one title-cased name per product from its first row. A product whose
# category matches none of the patterns below gets no entry in `product_names`.
product_names = {}
first_row_per_product = df_2.drop_duplicates(subset="product_id")
for product_id, category, subcategory in zip(
    first_row_per_product["product_id"],
    first_row_per_product["product_category"],
    first_row_per_product["product_subcategory"],
):
    if category in company_branded_categories:
        raw_name = f"{faker.company()} {subcategory} {faker.word()}"
    elif category == "Books":
        raw_name = f"The {faker.word()} {faker.word()}"
    elif category == "Clothing":
        raw_name = f"{faker.color_name()} {subcategory} {faker.word()}"
    else:
        continue
    product_names[product_id] = raw_name.title()

df_3 = df_2.assign(product_name=df_2["product_id"].map(lambda pid: product_names[pid]))
df_3.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,customer_gender,customer_location,total_amount,customer_first_name,customer_last_name,product_subcategory,product_name
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,Female,North America,636.33,Danielle,Johnson,Furniture,Perez And Sons Furniture Institution
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,Other,South America,1119.54,Joshua,Walker,Accessories,Blue Accessories Eye
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,Other,North America,416.87,Jill,Rhodes,Fragrances,Hickman Group Fragrances Affect
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,Other,Australia,705.6,Patricia,Miller,Outerwear,Green Outerwear Benefit
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,Female,Australia,2690.82,Robert,Johnson,Children Literature,The Treatment Approach


In [7]:
# Countries available within each region value found in `customer_location`.
location_mapping = {
    "Australia": ["Australia", "New Zealand"],
    "South America": ["Brazil", "Argentina", "Chile", "Colombia", "Peru"],
    "Europe": ["Germany", "France", "UK", "Italy", "Spain", "Netherlands"],
    "Africa": ["South Africa", "Nigeria", "Kenya", "Egypt", "Morocco"],
    "North America": ["USA", "Canada", "Mexico"],
    "Asia": ["Japan", "China", "South Korea", "India", "Singapore"],
}

# region -> country -> city -> [two street addresses].
# For each country the five city names are generated first, then the
# addresses for each of those cities.
detailed_locations = {}
for region, countries in location_mapping.items():
    cities_by_country = {}
    for country in countries:
        city_names = [faker.city() for _ in range(5)]
        cities_by_country[country] = {
            city_name: [faker.street_address() for _ in range(2)]
            for city_name in city_names
        }
    detailed_locations[region] = cities_by_country

# Pick a random country, then city, then address for every row, each choice
# constrained by the previous one. Each column is filled completely (in row
# order) before the next column is started.
df_4 = df_3.copy()
df_4["customer_country"] = [
    faker.random_element(location_mapping[region])
    for region in df_4["customer_location"]
]
df_4["customer_city"] = [
    faker.random_element(detailed_locations[region][country].keys())
    for region, country in zip(df_4["customer_location"], df_4["customer_country"])
]
df_4["customer_address"] = [
    faker.random_element(detailed_locations[region][country][city])
    for region, country, city in zip(
        df_4["customer_location"],
        df_4["customer_country"],
        df_4["customer_city"],
    )
]
df_4.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,customer_gender,customer_location,total_amount,customer_first_name,customer_last_name,product_subcategory,product_name,customer_country,customer_city,customer_address
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,Female,North America,636.33,Danielle,Johnson,Furniture,Perez And Sons Furniture Institution,USA,South Perryborough,459 Nicole Centers
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,Other,South America,1119.54,Joshua,Walker,Accessories,Blue Accessories Eye,Brazil,North Randyborough,8195 John Stravenue
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,Other,North America,416.87,Jill,Rhodes,Fragrances,Hickman Group Fragrances Affect,Canada,Mayfurt,1958 Randy Drive
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,Other,Australia,705.6,Patricia,Miller,Outerwear,Green Outerwear Benefit,New Zealand,Jessicaside,64248 Cassandra Falls Suite 406
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,Female,Australia,2690.82,Robert,Johnson,Children Literature,The Treatment Approach,New Zealand,East Jessicafort,2773 Emily Glens Suite 091


In [8]:
# Candidate manufacturers for each top-level product category.
category_manufacturers = {
    "Beauty & Personal Care": [
        "L'Oreal", "Estee Lauder", "Procter & Gamble", "Johnson & Johnson",
        "Unilever", "Shiseido", "Revlon",
    ],
    "Books": [
        "Penguin Random House", "HarperCollins", "Simon & Schuster",
        "Macmillan", "Hachette", "Scholastic",
    ],
    "Clothing": [
        "Nike", "Adidas", "H&M", "Zara", "Gap", "Under Armour", "Ralph Lauren", "Levi's",
    ],
    "Sports & Outdoors": [
        "Columbia", "The North Face", "Patagonia", "REI", "Coleman", "Wilson", "Callaway",
    ],
    "Home & Kitchen": [
        "KitchenAid", "Whirlpool", "Bosch", "Philips", "Dyson", "Cuisinart", "Breville",
    ],
    "Electronics": [
        "Samsung", "Apple", "Sony", "LG", "Dell", "HP", "Lenovo", "Asus",
    ],
}

# Draw one manufacturer per product from the category on that product's first
# row, visiting products in order of first appearance.
first_row_per_product = df_4.drop_duplicates(subset="product_id")
product_manufacturers = {
    product_id: faker.random_element(category_manufacturers[category])
    for product_id, category in zip(
        first_row_per_product["product_id"],
        first_row_per_product["product_category"],
    )
}

df_5 = df_4.assign(product_manufacturer=df_4["product_id"].map(product_manufacturers))
df_5.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,...,customer_location,total_amount,customer_first_name,customer_last_name,product_subcategory,product_name,customer_country,customer_city,customer_address,product_manufacturer
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,...,North America,636.33,Danielle,Johnson,Furniture,Perez And Sons Furniture Institution,USA,South Perryborough,459 Nicole Centers,Bosch
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,...,South America,1119.54,Joshua,Walker,Accessories,Blue Accessories Eye,Brazil,North Randyborough,8195 John Stravenue,Nike
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,...,North America,416.87,Jill,Rhodes,Fragrances,Hickman Group Fragrances Affect,Canada,Mayfurt,1958 Randy Drive,Procter & Gamble
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,...,Australia,705.6,Patricia,Miller,Outerwear,Green Outerwear Benefit,New Zealand,Jessicaside,64248 Cassandra Falls Suite 406,Levi's
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,...,Australia,2690.82,Robert,Johnson,Children Literature,The Treatment Approach,New Zealand,East Jessicafort,2773 Emily Glens Suite 091,HarperCollins


In [9]:
# Flatten region -> country -> city into one list of places an employee can be based.
cities = [
    {"region": region, "country": country, "city": city}
    for region, countries in location_mapping.items()
    for country in countries
    for city in detailed_locations[region][country]
]

# 2000 employees with ids 1000..2999, each given a random base city and a generated name.
employees = {}
for idx in range(1000, 3000):
    home_base = faker.random_element(cities)
    first_name = faker.first_name()
    last_name = faker.last_name()
    employees[idx] = {
        "employee_first_name": first_name,
        "employee_last_name": last_name,
        "region": home_base["region"],
        "country": home_base["country"],
        "city": home_base["city"],
    }

# Period boundaries: the first and last timestamps of the data, with the
# "3ME" date-range points in between each shifted by a random -28..+28 days.
start_date = pd.to_datetime(df_5["timestamp"].min())
end_date = pd.to_datetime(df_5["timestamp"].max())
base_periods = pd.date_range(start=start_date, end=end_date, freq="3ME")
time_periods = [start_date]
for period in base_periods:
    time_periods.append(period + pd.Timedelta(days=random.randint(-28, 28)))
time_periods.append(end_date)

# For every employee and every consecutive (start, end) pair of boundaries,
# pick one of the addresses in the employee's base city.
period_windows = list(zip(time_periods[:-1], time_periods[1:]))
employee_assignments = {
    emp_id: {
        window: faker.random_element(
            detailed_locations[emp_data["region"]][emp_data["country"]][emp_data["city"]]
        )
        for window in period_windows
    }
    for emp_id, emp_data in employees.items()
}


# Employee ids grouped by (country, city). The grouping is rebuilt whenever the
# module-level name `employees` refers to a different dict object than the one
# it was built from. In-place edits to `employees` after the first lookup are
# NOT detected; re-run this cell if `employees` is modified.
_employee_location_index = {"source": None, "by_location": {}}


def _employee_ids_in(country, city):
    """Return ids of employees based in (country, city), in `employees` insertion order."""
    if _employee_location_index["source"] is not employees:
        by_location = {}
        for emp_id, data in employees.items():
            by_location.setdefault((data["country"], data["city"]), []).append(emp_id)
        _employee_location_index["source"] = employees
        _employee_location_index["by_location"] = by_location
    return _employee_location_index["by_location"].get((country, city), [])


def find_employee_for_sale(row):
    """Pick the id of an employee to credit with the sale described by `row`.

    Parameters
    ----------
    row : mapping
        Must provide "customer_country", "customer_city", "customer_address"
        and "timestamp" (anything `pd.to_datetime` accepts).

    Returns
    -------
    The id of a randomly chosen employee based in the sale's country and city.
    Employees whose assignment covering the sale timestamp is the sale address
    are preferred; if there are none, any employee based in that city is used.

    Raises
    ------
    IndexError
        If no employee is based in the sale's country and city.
    """
    country = row["customer_country"]
    city = row["customer_city"]
    address = row["customer_address"]
    timestamp = pd.to_datetime(row["timestamp"])

    # Only employees based in the sale's city can match, so look them up
    # directly instead of scanning every employee for every sale.
    local_employee_ids = _employee_ids_in(country, city)
    if not local_employee_ids:
        raise IndexError(f"No employee is based in {city}, {country}")

    matching_employees = [
        emp_id
        for emp_id in local_employee_ids
        if any(
            period_start <= timestamp <= period_end and assigned_address == address
            for (period_start, period_end), assigned_address
            in employee_assignments.get(emp_id, {}).items()
        )
    ]
    return random.choice(matching_employees if matching_employees else local_employee_ids)


# Attach an employee to every sale, then copy that employee's name onto the row.
sale_lookup_columns = ["timestamp", "customer_country", "customer_city", "customer_address"]

df_6 = df_5.copy()
df_6["employee_id"] = df_6[sale_lookup_columns].apply(find_employee_for_sale, axis=1)

for name_column in ("employee_first_name", "employee_last_name"):
    names_by_employee = {emp_id: record[name_column] for emp_id, record in employees.items()}
    df_6[name_column] = df_6["employee_id"].map(names_by_employee)
df_6.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,...,customer_last_name,product_subcategory,product_name,customer_country,customer_city,customer_address,product_manufacturer,employee_id,employee_first_name,employee_last_name
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,...,Johnson,Furniture,Perez And Sons Furniture Institution,USA,South Perryborough,459 Nicole Centers,Bosch,1212,Brittany,Brown
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,...,Walker,Accessories,Blue Accessories Eye,Brazil,North Randyborough,8195 John Stravenue,Nike,2711,Cynthia,Warner
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,...,Rhodes,Fragrances,Hickman Group Fragrances Affect,Canada,Mayfurt,1958 Randy Drive,Procter & Gamble,2636,Brandon,Coleman
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,...,Miller,Outerwear,Green Outerwear Benefit,New Zealand,Jessicaside,64248 Cassandra Falls Suite 406,Levi's,1096,John,Molina
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,...,Johnson,Children Literature,The Treatment Approach,New Zealand,East Jessicafort,2773 Emily Glens Suite 091,HarperCollins,1674,Edward,Tyler


In [10]:
email_domains = ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "company.com"]

# Address pattern: "<first>.<last><id>@<random domain>", built from the first
# row on which each customer / employee appears. All customer addresses are
# generated before any employee address.
first_row_per_customer = df_6.drop_duplicates(subset="customer_id")
customer_emails = {
    customer_id: f"{first_name.lower()}.{last_name.lower()}{customer_id}@{faker.random_element(email_domains)}"
    for customer_id, first_name, last_name in zip(
        first_row_per_customer["customer_id"],
        first_row_per_customer["customer_first_name"],
        first_row_per_customer["customer_last_name"],
    )
}

first_row_per_employee = df_6.drop_duplicates(subset="employee_id")
employee_emails = {
    employee_id: f"{first_name.lower()}.{last_name.lower()}{employee_id}@{faker.random_element(email_domains)}"
    for employee_id, first_name, last_name in zip(
        first_row_per_employee["employee_id"],
        first_row_per_employee["employee_first_name"],
        first_row_per_employee["employee_last_name"],
    )
}

df_7 = df_6.assign(
    customer_email=df_6["customer_id"].map(customer_emails),
    employee_email=df_6["employee_id"].map(employee_emails),
)
df_7.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,...,product_name,customer_country,customer_city,customer_address,product_manufacturer,employee_id,employee_first_name,employee_last_name,customer_email,employee_email
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,...,Perez And Sons Furniture Institution,USA,South Perryborough,459 Nicole Centers,Bosch,1212,Brittany,Brown,danielle.johnson1993@outlook.com,brittany.brown1212@yahoo.com
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,...,Blue Accessories Eye,Brazil,North Randyborough,8195 John Stravenue,Nike,2711,Cynthia,Warner,joshua.walker3474@yahoo.com,cynthia.warner2711@gmail.com
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,...,Hickman Group Fragrances Affect,Canada,Mayfurt,1958 Randy Drive,Procter & Gamble,2636,Brandon,Coleman,jill.rhodes4564@yahoo.com,brandon.coleman2636@yahoo.com
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,...,Green Outerwear Benefit,New Zealand,Jessicaside,64248 Cassandra Falls Suite 406,Levi's,1096,John,Molina,patricia.miller1133@gmail.com,john.molina1096@gmail.com
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,...,The Treatment Approach,New Zealand,East Jessicafort,2773 Emily Glens Suite 091,HarperCollins,1674,Edward,Tyler,robert.johnson3626@outlook.com,edward.tyler1674@hotmail.com


In [11]:
# Card brands and verification methods exist only for card payments; any other
# payment method gets None in both columns.
card_types = {
    "Credit Card": ["Visa", "Mastercard", "American Express", "Discover", "JCB"],
    "Debit Card": ["Visa Debit", "Mastercard Debit", "Maestro"],
}
verification_methods = {
    "Credit Card": ["3D Secure", "SMS Verification", "Biometric", "PIN"],
    "Debit Card": ["PIN", "SMS Verification", "Chip and PIN"],
}

df_8 = df_7.copy()
# The whole card_type column is drawn first, then the whole
# verification_method column, each in row order.
df_8["card_type"] = [
    faker.random_element(card_types[method]) if method in card_types else None
    for method in df_8["payment_method"]
]
df_8["verification_method"] = [
    faker.random_element(verification_methods[method]) if method in verification_methods else None
    for method in df_8["payment_method"]
]
df_8.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,...,customer_city,customer_address,product_manufacturer,employee_id,employee_first_name,employee_last_name,customer_email,employee_email,card_type,verification_method
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,...,South Perryborough,459 Nicole Centers,Bosch,1212,Brittany,Brown,danielle.johnson1993@outlook.com,brittany.brown1212@yahoo.com,,
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,...,North Randyborough,8195 John Stravenue,Nike,2711,Cynthia,Warner,joshua.walker3474@yahoo.com,cynthia.warner2711@gmail.com,,
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,...,Mayfurt,1958 Randy Drive,Procter & Gamble,2636,Brandon,Coleman,jill.rhodes4564@yahoo.com,brandon.coleman2636@yahoo.com,Maestro,PIN
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,...,Jessicaside,64248 Cassandra Falls Suite 406,Levi's,1096,John,Molina,patricia.miller1133@gmail.com,john.molina1096@gmail.com,Mastercard Debit,Chip and PIN
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,...,East Jessicafort,2773 Emily Glens Suite 091,HarperCollins,1674,Edward,Tyler,robert.johnson3626@outlook.com,edward.tyler1674@hotmail.com,American Express,SMS Verification


In [12]:
device_types = ["Mobile", "Desktop", "Tablet", "Smart TV"]
browsers = ["Chrome", "Firefox", "Safari", "Edge", "Opera", "Brave", "Vivaldi"]

# Independent random device and browser per row; every device is drawn before
# the first browser.
row_count = len(df_8)
df_9 = df_8.assign(
    device_type=[faker.random_element(device_types) for _ in range(row_count)],
    browser_used=[faker.random_element(browsers) for _ in range(row_count)],
)
df_9.head()

Unnamed: 0,transaction_id,timestamp,customer_id,product_id,product_category,quantity,price,discount,payment_method,customer_age,...,product_manufacturer,employee_id,employee_first_name,employee_last_name,customer_email,employee_email,card_type,verification_method,device_type,browser_used
0,1,2023-01-01 00:00:00,1993,915,Home & Kitchen,8,103.3,0.23,Gift Card,27,...,Bosch,1212,Brittany,Brown,danielle.johnson1993@outlook.com,brittany.brown1212@yahoo.com,,,Desktop,Chrome
1,2,2023-01-01 00:01:00,3474,553,Clothing,9,180.28,0.31,Gift Card,53,...,Nike,2711,Cynthia,Warner,joshua.walker3474@yahoo.com,cynthia.warner2711@gmail.com,,,Desktop,Safari
2,3,2023-01-01 00:02:00,4564,248,Beauty & Personal Care,7,81.58,0.27,Debit Card,34,...,Procter & Gamble,2636,Brandon,Coleman,jill.rhodes4564@yahoo.com,brandon.coleman2636@yahoo.com,Maestro,PIN,Mobile,Chrome
3,4,2023-01-01 00:03:00,1133,948,Clothing,3,235.2,0.0,Debit Card,50,...,Levi's,1096,John,Molina,patricia.miller1133@gmail.com,john.molina1096@gmail.com,Mastercard Debit,Chip and PIN,Smart TV,Safari
4,5,2023-01-01 00:04:00,3626,284,Books,9,453.0,0.34,Credit Card,23,...,HarperCollins,1674,Edward,Tyler,robert.johnson3626@outlook.com,edward.tyler1674@hotmail.com,American Express,SMS Verification,Tablet,Brave


In [13]:
# Final column order, grouped by what each column describes.
output_columns = [
    # transaction
    "transaction_id", "timestamp",
    # customer
    "customer_id", "customer_first_name", "customer_last_name",
    "customer_email", "customer_age", "customer_gender",
    # product
    "product_id", "product_name", "price", "product_manufacturer",
    "product_category", "product_subcategory",
    # location (renamed to purchase_* below)
    "customer_location", "customer_country", "customer_city", "customer_address",
    # employee
    "employee_id", "employee_first_name", "employee_last_name", "employee_email",
    # amounts
    "quantity", "discount", "total_amount",
    # payment
    "payment_method", "card_type", "verification_method",
    # client
    "device_type", "browser_used",
]

# The generated location columns are exported as the place of purchase.
purchase_column_names = {
    "customer_location": "purchase_region",
    "customer_country": "purchase_country",
    "customer_city": "purchase_city",
    "customer_address": "purchase_address",
}

df_10 = df_9[output_columns].rename(columns=purchase_column_names)
df_10.head()

Unnamed: 0,transaction_id,timestamp,customer_id,customer_first_name,customer_last_name,customer_email,customer_age,customer_gender,product_id,product_name,...,employee_last_name,employee_email,quantity,discount,total_amount,payment_method,card_type,verification_method,device_type,browser_used
0,1,2023-01-01 00:00:00,1993,Danielle,Johnson,danielle.johnson1993@outlook.com,27,Female,915,Perez And Sons Furniture Institution,...,Brown,brittany.brown1212@yahoo.com,8,0.23,636.33,Gift Card,,,Desktop,Chrome
1,2,2023-01-01 00:01:00,3474,Joshua,Walker,joshua.walker3474@yahoo.com,53,Other,553,Blue Accessories Eye,...,Warner,cynthia.warner2711@gmail.com,9,0.31,1119.54,Gift Card,,,Desktop,Safari
2,3,2023-01-01 00:02:00,4564,Jill,Rhodes,jill.rhodes4564@yahoo.com,34,Other,248,Hickman Group Fragrances Affect,...,Coleman,brandon.coleman2636@yahoo.com,7,0.27,416.87,Debit Card,Maestro,PIN,Mobile,Chrome
3,4,2023-01-01 00:03:00,1133,Patricia,Miller,patricia.miller1133@gmail.com,50,Other,948,Green Outerwear Benefit,...,Molina,john.molina1096@gmail.com,3,0.0,705.6,Debit Card,Mastercard Debit,Chip and PIN,Smart TV,Safari
4,5,2023-01-01 00:04:00,3626,Robert,Johnson,robert.johnson3626@outlook.com,23,Female,284,The Treatment Approach,...,Tyler,edward.tyler1674@hotmail.com,9,0.34,2690.82,Credit Card,American Express,SMS Verification,Tablet,Brave


In [14]:
# Shuffle all rows before splitting. Passing `random_state=seed` makes the
# shuffle independent of NumPy's global random state, so re-running this cell
# (or running cells out of order) gives the same online/offline split.
df_shuffled = df_10.sample(frac=1, random_state=seed).reset_index(drop=True)

# First half of the shuffled rows becomes the online set, the rest the offline set.
split_point = len(df_shuffled) // 2
df_online = df_shuffled[:split_point].reset_index(drop=True)
df_offline = df_shuffled[split_point:].reset_index(drop=True)

# The online export omits the purchase location and employee columns.
df_online = df_online.drop(
    columns=[
        "purchase_region",
        "purchase_country",
        "purchase_city",
        "purchase_address",
        "employee_id",
        "employee_first_name",
        "employee_last_name",
        "employee_email",
    ]
)
# The offline export omits the customer identity and device/browser columns.
df_offline = df_offline.drop(
    columns=[
        "customer_id",
        "customer_first_name",
        "customer_last_name",
        "customer_email",
        "customer_age",
        "customer_gender",
        "device_type",
        "browser_used",
    ]
)

# Write both halves without the positional index.
df_online.to_csv("online_retail_sales.csv", index=False)
df_offline.to_csv("offline_retail_sales.csv", index=False)