## Customer Purchase Data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Sample product data (for 6 products as an example)
products = {
    1001: 'Mamaearth Onion Hair Oil',
    1002: 'Mamaearth Vitamin C Face Cream',
    1003: 'Mamaearth Charcoal Body Wash',
    1004: 'Mamaearth Tea Tree Face Wash',
    1005: 'Mamaearth Anti-Hair Fall Shampoo',
    1006: 'Mamaearth Ubtan Face Mask'
}

# Sample customer data (180-250 unique customers)
num_customers = random.randint(180, 250)
customers = [f'C{i:03d}' for i in range(1, num_customers + 1)]

# Generate dates for 122 weeks (854 days)
start_date = datetime(2023, 4, 1)
end_date = start_date + timedelta(days=854-1)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Generate random purchases
purchase_data = []
for customer in customers:
    # Randomize purchase frequency (some customers buy every 7 days, others every 30 days)
    purchase_frequency = random.choice([7, 14, 30])  # Can buy every week, two weeks, or month
    
    # Generate random number of purchases per customer over 122 weeks
    num_purchases = random.randint(5, 12)  # Each customer will make 5-12 purchases over the span
    
    # Select random purchase dates for the customer
    purchase_dates = [start_date + timedelta(days=random.randint(0, 854)) for _ in range(num_purchases)]
    purchase_dates = sorted(list(set(purchase_dates)))  # Remove duplicates and sort
    
    for date in purchase_dates:
        num_products = np.random.randint(1, 3)  # Randomly choose 1 or 2 products per purchase
        for _ in range(num_products):
            product_id = np.random.choice(list(products.keys()))
            quantity = np.random.randint(1, 3)  # Random quantity
            price = 399 if product_id == 1001 else 499 if product_id == 1002 else 349  # Example pricing logic
            total_price = price * quantity
            payment_method = np.random.choice(['Credit Card', 'Debit Card', 'UPI', 'Net Banking'])
            discount = np.random.choice([0, 5, 10, 15, 20])  # Random discount
            location = np.random.choice(['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Pune'])
            rating = np.random.uniform(3.5, 5.0)  # Random rating between 3.5 and 5.0
            purchase_data.append([f"Purchase_{len(purchase_data)+1}", customer, product_id, products[product_id], date, quantity, total_price, payment_method, f"{discount}%", location, round(rating, 1)])

# Create DataFrame
df = pd.DataFrame(purchase_data, columns=['Purchase ID', 'Customer ID', 'Product ID', 'Product Name', 'Purchase Date', 'Quantity', 'Total Price (INR)', 'Payment Method', 'Discount Applied', 'Location', 'Customer Rating'])

# Show a sample
print(df.head())

# Save to CSV
df.to_csv('customer_purchase_data.csv', index=False)


  Purchase ID Customer ID  Product ID                      Product Name  \
0  Purchase_1        C001        1004      Mamaearth Tea Tree Face Wash   
1  Purchase_2        C001        1004      Mamaearth Tea Tree Face Wash   
2  Purchase_3        C001        1005  Mamaearth Anti-Hair Fall Shampoo   
3  Purchase_4        C001        1005  Mamaearth Anti-Hair Fall Shampoo   
4  Purchase_5        C001        1005  Mamaearth Anti-Hair Fall Shampoo   

  Purchase Date  Quantity  Total Price (INR) Payment Method Discount Applied  \
0    2023-08-12         2                698    Credit Card               5%   
1    2023-08-12         2                698    Net Banking              20%   
2    2023-10-23         2                698    Net Banking              15%   
3    2023-10-23         1                349    Net Banking              20%   
4    2023-11-19         1                349    Net Banking              15%   

    Location  Customer Rating  
0      Delhi              4.6  
1   

In [2]:
df.shape

(2825, 11)

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Sample product data (for 6 products as an example)
products = {
    1001: 'Mamaearth Onion Hair Oil',
    1002: 'Mamaearth Vitamin C Face Cream',
    1003: 'Mamaearth Charcoal Body Wash',
    1004: 'Mamaearth Tea Tree Face Wash',
    1005: 'Mamaearth Anti-Hair Fall Shampoo',
    1006: 'Mamaearth Ubtan Face Mask'
}

# Sample customer data (180-250 unique customers)
num_customers = random.randint(180, 250)
customers = [f'C{i:03d}' for i in range(1, num_customers + 1)]

# Generate dates for 122 weeks (854 days)
start_date = datetime(2023, 4, 1)
end_date = start_date + timedelta(days=854-1)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Number of transactions per day
transactions_per_day = 50

# Generate random purchases
purchase_data = []
purchase_id = 1
for date in date_range:
    for _ in range(transactions_per_day):  # 50 transactions per day
        customer = random.choice(customers)
        num_products = np.random.randint(1, 3)  # Randomly choose 1 or 2 products per purchase
        for _ in range(num_products):
            product_id = np.random.choice(list(products.keys()))
            quantity = np.random.randint(1, 3)  # Random quantity
            price = 399 if product_id == 1001 else 499 if product_id == 1002 else 349  # Example pricing logic
            total_price = price * quantity
            payment_method = np.random.choice(['Credit Card', 'Debit Card', 'UPI', 'Net Banking'])
            discount = np.random.choice([0, 5, 10, 15, 20])  # Random discount
            location = np.random.choice(['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Pune'])
            rating = np.random.uniform(3.5, 5.0)  # Random rating between 3.5 and 5.0
            purchase_data.append([f"Purchase_{purchase_id}", customer, product_id, products[product_id], date, quantity, total_price, payment_method, f"{discount}%", location, round(rating, 1)])
            purchase_id += 1

# Create DataFrame
df = pd.DataFrame(purchase_data, columns=['Purchase ID', 'Customer ID', 'Product ID', 'Product Name', 'Purchase Date', 'Quantity', 'Total Price (INR)', 'Payment Method', 'Discount Applied', 'Location', 'Customer Rating'])

# Show a sample
print(df.head())

# Save to CSV
df.to_csv('customer_purchase_data_large.csv', index=False)


  Purchase ID Customer ID  Product ID                  Product Name  \
0  Purchase_1        C054        1006     Mamaearth Ubtan Face Mask   
1  Purchase_2        C111        1006     Mamaearth Ubtan Face Mask   
2  Purchase_3        C090        1006     Mamaearth Ubtan Face Mask   
3  Purchase_4        C090        1003  Mamaearth Charcoal Body Wash   
4  Purchase_5        C129        1001      Mamaearth Onion Hair Oil   

  Purchase Date  Quantity  Total Price (INR) Payment Method Discount Applied  \
0    2023-04-01         2                698    Credit Card              15%   
1    2023-04-01         1                349            UPI               5%   
2    2023-04-01         2                698     Debit Card              10%   
3    2023-04-01         1                349    Net Banking              10%   
4    2023-04-01         2                798            UPI              15%   

    Location  Customer Rating  
0  Bangalore              4.6  
1    Chennai              4.

In [4]:
df.shape

(64000, 11)

In [5]:
df['Customer ID'].nunique()

233

In [6]:
!pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [7]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Initialize Faker
fake = Faker('en_IN')  # Using 'en_IN' locale to generate Indian names

# Number of customers
num_customers = 233

# Sample locations
locations = ['Delhi', 'Mumbai', 'Bangalore', 'Chennai', 'Pune']

# Sample skin types
skin_types = ['Dry', 'Oily', 'Combination', 'Normal']

# Sample hair types
hair_types = ['Straight', 'Curly', 'Wavy', 'Bald']

# Sample purchase frequencies
purchase_frequencies = ['Weekly', 'Monthly', 'Bi-Monthly', 'Occasionally']

# Sample preferred categories
categories = ['Skincare', 'Hair Care', 'Body Care']

# Create Customer IDs
customers = [f'C{i:03d}' for i in range(1, num_customers + 1)]

# Generate customer demographic data
customer_data = []
for customer in customers:
    # Generate random name using Faker
    name = fake.name()

    age = random.randint(18, 60)  # Random age between 18 and 60
    gender = random.choice(['Male', 'Female', 'Other'])
    location = random.choice(locations)
    skin_type = random.choice(skin_types)
    hair_type = random.choice(hair_types)
    purchase_frequency = random.choice(purchase_frequencies)
    preferred_category = random.choice(categories)
    avg_spend = random.randint(800, 2000)  # Random average spend between 800 and 2000 INR

    customer_data.append([customer, name, age, gender, location, skin_type, hair_type, purchase_frequency, preferred_category, avg_spend])

# Create DataFrame
df_customers = pd.DataFrame(customer_data, columns=['Customer ID', 'Name', 'Age', 'Gender', 'Location', 'Skin Type', 'Hair Type', 'Purchase Frequency', 'Preferred Category', 'Average Spend (INR)'])

# Show a sample
print(df_customers.head())

# Save to CSV
df_customers.to_csv('customer_demographic_data_with_real_names.csv', index=False)


  Customer ID              Name  Age  Gender   Location    Skin Type  \
0        C001   Faris Chaudhari   50    Male    Chennai         Oily   
1        C002      Jagrati Nori   56  Female  Bangalore  Combination   
2        C003     Anjali Talwar   41    Male     Mumbai         Oily   
3        C004       Jairaj Gill   21   Other      Delhi         Oily   
4        C005  Zehaan Varughese   44   Other     Mumbai  Combination   

  Hair Type Purchase Frequency Preferred Category  Average Spend (INR)  
0      Wavy             Weekly           Skincare                 1058  
1      Bald             Weekly          Hair Care                 1410  
2      Bald         Bi-Monthly           Skincare                 1436  
3     Curly            Monthly           Skincare                 1463  
4      Wavy            Monthly          Body Care                 1982  


In [8]:
df_customers.shape

(233, 10)

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Sample product data (for 6 products as an example)
products = {
    1001: 'Mamaearth Onion Hair Oil',
    1002: 'Mamaearth Vitamin C Face Cream',
    1003: 'Mamaearth Charcoal Body Wash',
    1004: 'Mamaearth Tea Tree Face Wash',
    1005: 'Mamaearth Anti-Hair Fall Shampoo',
    1006: 'Mamaearth Ubtan Face Mask'
}

# Generate a list of promotions
promotion_types = ['Flash Sale', 'Seasonal Discount', 'Holiday Sale', 'Weekend Offer', 'Buy 1 Get 1 Free']
promotion_duration = [7, 10, 14, 30]  # Promotion duration in days

# We assume 3-5 promotions per product over a year
promotion_data = []
promotion_id = 1

for product_id in products.keys():
    # Random number of promotions for this product
    num_promotions = random.randint(3, 5)
    
    for _ in range(num_promotions):
        promotion_type = random.choice(promotion_types)
        discount = random.choice([10, 15, 20, 25, 30])  # Random discount between 10% and 30%
        
        # Random start and end dates for the promotion
        start_date = datetime(2023, random.randint(1, 12), random.randint(1, 28))
        duration = random.choice(promotion_duration)
        end_date = start_date + timedelta(days=duration)
        
        # Random units sold during promotion (simulating demand)
        units_sold = random.randint(100, 1000)  # Random units sold per promotion
        normal_sales = random.randint(500, 2000)  # Normal sales without promotion
        sales_revenue = units_sold * 399 * (1 - discount / 100)  # Revenue with discount
        revenue_growth = ((sales_revenue - normal_sales) / normal_sales) * 100  # Revenue growth percentage
        
        # Promo effectiveness (based on revenue growth)
        if revenue_growth > 50:
            promo_effectiveness = 'High'
        elif revenue_growth > 20:
            promo_effectiveness = 'Medium'
        else:
            promo_effectiveness = 'Low'
        
        promotion_data.append([promotion_id, product_id, promotion_type, start_date, end_date, discount, units_sold, sales_revenue, revenue_growth, duration, promo_effectiveness])
        promotion_id += 1

# Create DataFrame
df_promotions = pd.DataFrame(promotion_data, columns=['Promotion ID', 'Product ID', 'Promotion Type', 'Start Date', 'End Date', 'Discount (%)', 'Units Sold', 'Sales (INR)', 'Revenue Growth (%)', 'Promotion Duration (Days)', 'Promo Effectiveness'])

# Show a sample
print(df_promotions.head())

# Save to CSV
df_promotions.to_csv('sales_trends_and_promotions.csv', index=False)

   Promotion ID  Product ID    Promotion Type Start Date   End Date  \
0             1        1001  Buy 1 Get 1 Free 2023-07-26 2023-08-09   
1             2        1001     Weekend Offer 2023-11-09 2023-11-19   
2             3        1001  Buy 1 Get 1 Free 2023-12-18 2024-01-01   
3             4        1001        Flash Sale 2023-10-14 2023-11-13   
4             5        1002      Holiday Sale 2023-01-14 2023-01-28   

   Discount (%)  Units Sold  Sales (INR)  Revenue Growth (%)  \
0            30         358      99989.4        11059.531250   
1            10         714     256397.4        35810.000000   
2            15         684     231978.6        20054.526499   
3            15         222      75291.3         7466.964824   
4            20         933     297813.6        15954.641509   

   Promotion Duration (Days) Promo Effectiveness  
0                         14                High  
1                         10                High  
2                         14       