# Simulated Amazon E-commerce Dataset
This notebook generates synthetic e-commerce data to simulate Amazon Seller Central analytics. The dataset includes:
- Sales data
- Returns data
- Review data
- Product catalog

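It applies simplified business rules: daily ad spend is 10–20% of that day's revenue, promotions (about 15% of product-days) double the expected units sold, roughly 10% of sales rows produce a return of 1–3 units, and star ratings skew positive (half of all reviews are 5-star).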

In [1]:

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Seed both random sources used in this notebook (NumPy's global generator
# and the stdlib `random` module) so the sampled values repeat on every run.
np.random.seed(42)
random.seed(42)

# Simulation parameters
num_products, num_days = 15, 180
# The window is anchored to the moment this cell runs, so the generated dates
# shift with the run date even though the sampled values are seeded.
start_date = datetime.today() - timedelta(days=num_days)

# Product catalog: every product gets a sequential ASIN, a randomly chosen
# category and a random price between 15 and 120 rounded to two decimals.
categories = ['Bedding', 'Kitchen', 'Home Decor', 'Electronics', 'Bath']
product_catalog = [
    {
        'ASIN': f"ASIN{idx:03d}",
        # Dict entries are evaluated top to bottom: category is drawn before price.
        'Category': random.choice(categories),
        'Price': round(random.uniform(15, 120), 2),
    }
    for idx in range(num_products)
]

product_df = pd.DataFrame(product_catalog)
product_df.head()


Unnamed: 0,ASIN,Category,Price
0,ASIN000,Bedding,17.63
1,ASIN001,Home Decor,40.71
2,ASIN002,Kitchen,92.33
3,ASIN003,Bath,24.13
4,ASIN004,Electronics,18.34


In [2]:

# Daily sales data: one record per product for each day of the simulated window.
sales_data = []
for asin, price in zip(product_df['ASIN'], product_df['Price']):
    for day_offset in range(num_days):
        # Roughly 15% of product-days are flagged as promotions.
        is_promotion = random.choices([0, 1], weights=[0.85, 0.15])[0]

        # Units follow a Poisson distribution: mean 8 on promotion days, 4 otherwise.
        mean_units = 8 if is_promotion else 4
        base_units = np.random.poisson(mean_units)

        # Revenue is units times price; ad spend is a random 10-20% share of it.
        gross = base_units * price
        ad_share = 0.10 + 0.10 * random.random()

        sales_data.append({
            'Date': (start_date + timedelta(days=day_offset)).date(),
            'ASIN': asin,
            'Units_Sold': base_units,
            'Revenue': round(gross, 2),
            'Price': price,
            'Ad_Spend': round(gross * ad_share, 2),
            'Is_Promotion': is_promotion,
        })

sales_df = pd.DataFrame(sales_data)
sales_df.head()


Unnamed: 0,Date,ASIN,Units_Sold,Revenue,Price,Ad_Spend,Is_Promotion
0,2025-01-22,ASIN000,5,88.15,17.63,11.98,0
1,2025-01-23,ASIN000,4,70.52,17.63,8.92,0
2,2025-01-24,ASIN000,4,70.52,17.63,10.29,0
3,2025-01-25,ASIN000,4,70.52,17.63,13.56,0
4,2025-01-26,ASIN000,2,35.26,17.63,4.56,0


In [3]:

# Returns data: about one sales row in ten produces a return record.
return_reasons = ['Defective', 'Not as Described', 'Changed Mind', 'Late Delivery']
returns_data = []
for sale in sales_df.itertuples(index=False):
    # The 10% chance is drawn for every sales row, including zero-unit days.
    if random.random() >= 0.10:
        continue
    # Nothing can be returned on a day with no units sold.
    if sale.Units_Sold <= 0:
        continue

    # Return between 1 and 3 units, never more than were sold that day.
    return_qty = random.randint(1, min(3, sale.Units_Sold))
    returns_data.append({
        'Date': sale.Date,
        'ASIN': sale.ASIN,
        'Return_Qty': return_qty,
        'Return_Reason': random.choice(return_reasons),
    })

returns_df = pd.DataFrame(returns_data)
returns_df.head()


Unnamed: 0,Date,ASIN,Return_Qty,Return_Reason
0,2025-01-26,ASIN000,1,Defective
1,2025-02-22,ASIN000,1,Late Delivery
2,2025-02-28,ASIN000,1,Late Delivery
3,2025-03-10,ASIN000,3,Changed Mind
4,2025-03-12,ASIN000,2,Not as Described


In [4]:

# Review snippets keyed by star rating, so the text always matches the score.
review_texts = {
    5: [
        "Exceeded expectations.",
        "Perfect product.",
        "Super fast delivery!",
        "Excellent value!",
        "Highly recommend.",
        "Amazing quality!",
    ],
    4: [
        "Very good, but not perfect.",
        "Satisfied with the purchase.",
        "Works as expected.",
        "Happy overall.",
        "Would buy again.",
        "Decent value for money.",
    ],
    3: [
        "Average quality.",
        "It’s okay.",
        "Not bad, not great.",
        "Neutral experience.",
        "Acceptable performance.",
        "Fine for the price.",
    ],
    2: [
        "Could be better.",
        "Not impressed.",
        "Had some issues.",
        "Poor packaging.",
        "Feels cheap.",
        "Wouldn’t recommend.",
    ],
    1: [
        "Very disappointed.",
        "Defective item.",
        "Would not buy again.",
        "Product not as described.",
        "Terrible experience.",
        "Arrived broken.",
    ],
}

review_data = []
for asin in product_df['ASIN']:
    # Each product receives between 15 and 40 reviews.
    review_count = random.randint(15, 40)
    for _ in range(review_count):
        # Pick a random day inside the simulated window.
        day_offset = random.randint(0, num_days - 1)
        # Ratings are weighted towards the top: 50% five-star, 25% four-star, ...
        star_rating = random.choices([5, 4, 3, 2, 1], weights=[50, 25, 15, 5, 5])[0]
        review_data.append({
            'Date': (start_date + timedelta(days=day_offset)).date(),
            'ASIN': asin,
            'Star_Rating': star_rating,
            'Review_Text': random.choice(review_texts[star_rating]),
        })

reviews_df = pd.DataFrame(review_data)
reviews_df.head()


Unnamed: 0,Date,ASIN,Star_Rating,Review_Text
0,2025-01-24,ASIN000,5,Super fast delivery!
1,2025-04-04,ASIN000,5,Excellent value!
2,2025-03-26,ASIN000,4,Decent value for money.
3,2025-06-01,ASIN000,5,Super fast delivery!
4,2025-02-20,ASIN000,3,Fine for the price.


In [5]:
import os

# Destination folder for the generated CSV files.
# Set the AMAZON_BI_OUTPUT_DIR environment variable to write somewhere else.
# NOTE: the fallback below is a machine-specific absolute Windows path (kept so
# existing runs behave as before); on any other machine, set the variable.
# A raw string is used so the backslashes are not treated as escapes.
output_dir = os.environ.get(
    "AMAZON_BI_OUTPUT_DIR",
    r"C:\Users\Sinah\OneDrive\Email attachments\Desktop\projects\amazon_bi_showcase\data\raw",
)
os.makedirs(output_dir, exist_ok=True)

# File name -> DataFrame to export (written in this order).
exports = {
    "sales_data.csv": sales_df,
    "returns_data.csv": returns_df,
    "reviews_data.csv": reviews_df,
    "product_catalog.csv": product_df,
}
for file_name, frame in exports.items():
    # os.path.join uses the platform's separator; a hard-coded backslash would
    # place the file next to (not inside) output_dir on non-Windows systems.
    frame.to_csv(os.path.join(output_dir, file_name), index=False)

print("Files saved in:", output_dir)


Files saved in: C:\Users\Sinah\OneDrive\Email attachments\Desktop\projects\amazon_bi_showcase\data\raw
