# Data Generator for Assignment 7

## Overview
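This notebook generates three sample datasets (sales transactions, customers, and products) for the data visualization assignment and saves each one as a CSV file in the `data/` directory.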

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# Seed NumPy's global random generator so the np.random draws in this
# notebook repeat on a fresh kernel run
SEED = 42
np.random.seed(SEED)

# Make sure the folder that receives the generated CSV files exists;
# exist_ok=True makes re-running this cell harmless
OUTPUT_DIR = 'data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Generate Sales Data

In [2]:
# Generate sales transactions
n_transactions = 1000
transaction_ids = [f'T{i:04d}' for i in range(1, n_transactions + 1)]
# Customer and product IDs are drawn with replacement (C0001-C0200, P001-P100)
customer_ids = [f'C{i:04d}' for i in np.random.randint(1, 201, n_transactions)]
product_ids = [f'P{i:03d}' for i in np.random.randint(1, 101, n_transactions)]

# Generate transaction dates (180-day window ending at a fixed reference date).
# Anchoring on a constant instead of datetime.now() keeps the generated CSV
# identical from one run to the next, in line with the fixed random seed.
reference_date = datetime(2025, 10, 15)
start_date = reference_date - timedelta(days=180)
# Scalar randint calls return plain Python ints, which timedelta accepts
dates = [start_date + timedelta(days=np.random.randint(0, 180)) for _ in range(n_transactions)]

# Generate quantities and prices
quantities = np.random.randint(1, 11, n_transactions)  # 1-10 units per transaction
unit_prices = np.random.uniform(10, 500, n_transactions)
total_amounts = quantities * unit_prices  # element-wise line total

# Generate store locations
store_locations = np.random.choice(['North', 'South', 'East', 'West'], n_transactions)

# Create sales DataFrame
sales_data = pd.DataFrame({
    'transaction_id': transaction_ids,
    'customer_id': customer_ids,
    'product_id': product_ids,
    'quantity': quantities,
    'unit_price': unit_prices,
    'total_amount': total_amounts,
    'transaction_date': [d.strftime('%Y-%m-%d') for d in dates],  # stored as ISO date strings
    'store_location': store_locations
})

# Save to CSV
sales_data.to_csv('data/sales_data.csv', index=False)
print(f"Generated {len(sales_data)} sales transactions")
print(sales_data.head())

Generated 1000 sales transactions
  transaction_id customer_id product_id  quantity  unit_price  total_amount  \
0          T0001       C0103       P040        10  229.359978   2293.599785   
1          T0002       C0180       P083         3   75.794677    227.384031   
2          T0003       C0093       P042         5  479.387719   2396.938597   
3          T0004       C0015       P041         5  269.533265   1347.666324   
4          T0005       C0107       P006         4  128.527714    514.110856   

  transaction_date store_location  
0       2025-10-08          South  
1       2025-07-20          North  
2       2025-06-24           East  
3       2025-07-26          South  
4       2025-08-01          South  


## Generate Customer Data

In [3]:
# Generate customer information
n_customers = 200
customer_ids = [f'C{i:04d}' for i in range(1, n_customers + 1)]
names = [f'Customer {i}' for i in range(1, n_customers + 1)]

# Generate demographics
ages = np.random.randint(18, 80, n_customers)  # upper bound is exclusive: ages 18-79
genders = np.random.choice(['M', 'F'], n_customers)
# NOTE(review): city and state are sampled independently of each other, so a
# row can pair a city with a state it is not located in (e.g. San Diego / NC).
cities = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 
                          'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose'], n_customers)
states = np.random.choice(['CA', 'NY', 'TX', 'FL', 'WA', 'IL', 'PA', 'OH', 'GA', 'NC'], n_customers)

# Generate registration dates (0-364 days before a fixed reference date).
# Anchoring on a constant instead of datetime.now() keeps the generated CSV
# identical from one run to the next, in line with the fixed random seed.
reference_date = datetime(2025, 10, 15)
reg_dates = [reference_date - timedelta(days=np.random.randint(0, 365)) for _ in range(n_customers)]

# Create customer DataFrame
customer_data = pd.DataFrame({
    'customer_id': customer_ids,
    'customer_name': names,
    'age': ages,
    'gender': genders,
    'city': cities,
    'state': states,
    'registration_date': [d.strftime('%Y-%m-%d') for d in reg_dates]  # stored as ISO date strings
})

# Save to CSV
customer_data.to_csv('data/customer_data.csv', index=False)
print(f"Generated {len(customer_data)} customers")
print(customer_data.head())

Generated 200 customers
  customer_id customer_name  age gender          city state registration_date
0       C0001    Customer 1   59      F     San Diego    NC        2025-05-04
1       C0002    Customer 2   76      M   San Antonio    CA        2025-04-15
2       C0003    Customer 3   64      M   San Antonio    PA        2025-06-24
3       C0004    Customer 4   70      M  Philadelphia    NC        2025-10-10
4       C0005    Customer 5   71      M       Phoenix    NY        2024-12-06


## Generate Product Data

In [4]:
# Build the product catalog: one row per product, numbered 1..n_products
n_products = 100
product_numbers = range(1, n_products + 1)
product_ids = [f'P{number:03d}' for number in product_numbers]
product_names = [f'Product {number}' for number in product_numbers]

# Category and brand are each sampled uniformly from a fixed list of options
category_options = ['Electronics', 'Clothing', 'Home & Garden', 'Books', 'Sports']
brand_options = ['Brand A', 'Brand B', 'Brand C', 'Brand D', 'Brand E']
categories = np.random.choice(category_options, n_products)
brands = np.random.choice(brand_options, n_products)

# Price is a float in [10, 500); stock is an integer in 0-99
unit_prices = np.random.uniform(10, 500, n_products)
stock_quantities = np.random.randint(0, 100, n_products)

# Assemble the columns in their final CSV order
product_columns = {
    'product_id': product_ids,
    'product_name': product_names,
    'category': categories,
    'brand': brands,
    'unit_price': unit_prices,
    'stock_quantity': stock_quantities,
}
product_data = pd.DataFrame(product_columns)

# Write the catalog to disk and show a short preview
product_csv_path = 'data/product_data.csv'
product_data.to_csv(product_csv_path, index=False)
print(f"Generated {len(product_data)} products")
print(product_data.head())

Generated 100 products
  product_id product_name     category    brand  unit_price  stock_quantity
0       P001    Product 1       Sports  Brand D  462.096118              36
1       P002    Product 2     Clothing  Brand D  256.162192              22
2       P003    Product 3       Sports  Brand C  170.663157              92
3       P004    Product 4  Electronics  Brand C  371.194883              84
4       P005    Product 5     Clothing  Brand D   27.989342              90


## Data Summary

In [5]:
# Report the row count of each generated dataset
summary_lines = [
    "=== DATA SUMMARY ===",
    f"Sales Data: {len(sales_data)} transactions",
    f"Customer Data: {len(customer_data)} customers",
    f"Product Data: {len(product_data)} products",
]
print("\n".join(summary_lines))

# List the column names of each dataset (the first heading starts with a blank line)
for heading, frame in [
    ("\nSales Data Columns:", sales_data),
    ("Customer Data Columns:", customer_data),
    ("Product Data Columns:", product_data),
]:
    print(heading, frame.columns.tolist())

# Preview the first rows of each dataset under its own banner
for banner, frame in [
    ("\n=== SAMPLE SALES DATA ===", sales_data),
    ("\n=== SAMPLE CUSTOMER DATA ===", customer_data),
    ("\n=== SAMPLE PRODUCT DATA ===", product_data),
]:
    print(banner)
    print(frame.head())

=== DATA SUMMARY ===
Sales Data: 1000 transactions
Customer Data: 200 customers
Product Data: 100 products

Sales Data Columns: ['transaction_id', 'customer_id', 'product_id', 'quantity', 'unit_price', 'total_amount', 'transaction_date', 'store_location']
Customer Data Columns: ['customer_id', 'customer_name', 'age', 'gender', 'city', 'state', 'registration_date']
Product Data Columns: ['product_id', 'product_name', 'category', 'brand', 'unit_price', 'stock_quantity']

=== SAMPLE SALES DATA ===
  transaction_id customer_id product_id  quantity  unit_price  total_amount  \
0          T0001       C0103       P040        10  229.359978   2293.599785   
1          T0002       C0180       P083         3   75.794677    227.384031   
2          T0003       C0093       P042         5  479.387719   2396.938597   
3          T0004       C0015       P041         5  269.533265   1347.666324   
4          T0005       C0107       P006         4  128.527714    514.110856   

  transaction_date store_

## Validation

In [6]:
# Validate data integrity
print("=== DATA VALIDATION ===")

# Check for missing values
print("Missing values in sales_data:", sales_data.isnull().sum().sum())
print("Missing values in customer_data:", customer_data.isnull().sum().sum())
print("Missing values in product_data:", product_data.isnull().sum().sum())

# Check data types
print("\nSales data types:")
print(sales_data.dtypes)
print("\nCustomer data types:")
print(customer_data.dtypes)
print("\nProduct data types:")
print(product_data.dtypes)

# Enforce the integrity rules instead of only reporting them: any violation
# raises AssertionError here, before the success message below is printed.
# The checks are silent when they pass, so the cell output is unchanged.
for frame_name, frame in [
    ('sales_data', sales_data),
    ('customer_data', customer_data),
    ('product_data', product_data),
]:
    assert frame.isnull().sum().sum() == 0, f"{frame_name} contains missing values"

# Primary keys must be unique within their own table
assert sales_data['transaction_id'].is_unique, "duplicate transaction_id in sales_data"
assert customer_data['customer_id'].is_unique, "duplicate customer_id in customer_data"
assert product_data['product_id'].is_unique, "duplicate product_id in product_data"

# Every sale must reference a customer and a product that actually exist
assert sales_data['customer_id'].isin(customer_data['customer_id']).all(), \
    "sales_data references a customer_id missing from customer_data"
assert sales_data['product_id'].isin(product_data['product_id']).all(), \
    "sales_data references a product_id missing from product_data"

# Line totals must equal quantity * unit_price (tolerance covers float round-off)
assert np.allclose(sales_data['total_amount'],
                   sales_data['quantity'] * sales_data['unit_price']), \
    "total_amount does not equal quantity * unit_price"

print("\nData generation completed successfully!")

=== DATA VALIDATION ===
Missing values in sales_data: 0
Missing values in customer_data: 0
Missing values in product_data: 0

Sales data types:
transaction_id       object
customer_id          object
product_id           object
quantity              int64
unit_price          float64
total_amount        float64
transaction_date     object
store_location       object
dtype: object

Customer data types:
customer_id          object
customer_name        object
age                   int64
gender               object
city                 object
state                object
registration_date    object
dtype: object

Product data types:
product_id         object
product_name       object
category           object
brand              object
unit_price        float64
stock_quantity      int64
dtype: object

Data generation completed successfully!
