In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Function to generate dummy data for the fact table (Sales)
def generate_sales_data(num_rows=100, seed=42):
    """Generate a dummy Sales fact table.

    Every column is sampled independently, with replacement, from NumPy's
    global random generator, which is re-seeded at the start of each call.

    Args:
        num_rows: Number of fact rows to generate. It is also the number of
            consecutive calendar days (starting 2023-01-01) that the
            ``Date`` column is sampled from.
        seed: Value passed to ``np.random.seed``. The default of 42
            reproduces the original hard-coded behaviour.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date``, ``ProductID``
        (1-3), ``CustomerID`` (101-103), ``LocationID`` (201-203),
        ``Quantity`` (50-199) and ``Revenue`` (1000-4999).
    """
    # NOTE: this re-seeds NumPy's *global* generator, so random draws made
    # by the caller after this function returns are affected as well.
    np.random.seed(seed)

    # Pool of candidate days; the order of the random draws below is kept
    # fixed so that a given seed always yields the same table.
    dates = pd.date_range(start='2023-01-01', periods=num_rows, freq='D')

    sales_data = {
        'Date': np.random.choice(dates, num_rows),
        'ProductID': np.random.choice([1, 2, 3], num_rows),
        'CustomerID': np.random.choice([101, 102, 103], num_rows),
        'LocationID': np.random.choice([201, 202, 203], num_rows),
        # randint's upper bound is exclusive.
        'Quantity': np.random.randint(50, 200, num_rows),
        'Revenue': np.random.randint(1000, 5000, num_rows)
    }

    sales_df = pd.DataFrame(sales_data)
    # Make sure the column has a datetime dtype regardless of how the
    # sampled array was materialised.
    sales_df['Date'] = pd.to_datetime(sales_df['Date'])

    return sales_df

# Function to generate dummy data for the dimension tables
def generate_dimension_data(table_name, num_rows=10, seed=42,
                            start_date='2022-01-01'):
    """Generate a dummy dimension table.

    Args:
        table_name: One of ``'Products'``, ``'Customers'``, ``'Dates'`` or
            ``'Locations'``.
        num_rows: Number of rows to generate.
        seed: Value passed to ``np.random.seed`` before any random column
            is drawn. The default of 42 reproduces the original hard-coded
            behaviour.
        start_date: First day of the ``'Dates'`` table; ignored for the
            other tables. The default reproduces the original hard-coded
            calendar start.

    Returns:
        A ``pandas.DataFrame`` with a sequential key column plus
        descriptive columns for the requested dimension.

    Raises:
        ValueError: If ``table_name`` is not one of the supported names.
    """
    # NOTE: this re-seeds NumPy's *global* generator on every call.
    np.random.seed(seed)

    if table_name == 'Products':
        data = {
            'ProductID': range(1, num_rows + 1),
            'ProductName': [f'Product {i}' for i in range(1, num_rows + 1)],
            'Category': np.random.choice(['Electronics', 'Clothing', 'Stationery'], num_rows)
        }
    elif table_name == 'Customers':
        data = {
            'CustomerID': range(101, 101 + num_rows),
            'CustomerName': [f'Customer {i}' for i in range(1, num_rows + 1)],
            'Segment': np.random.choice(['Individual', 'Corporate'], num_rows)
        }
    elif table_name == 'Dates':
        # Build the calendar once and derive every column from it instead
        # of recomputing the same date range for each column.
        dates = pd.date_range(start=start_date, periods=num_rows, freq='D')
        data = {
            'Date': dates,
            'Day': [date.day for date in dates],
            'Month': [date.month for date in dates],
            'Year': [date.year for date in dates]
        }
    elif table_name == 'Locations':
        data = {
            'LocationID': range(201, 201 + num_rows),
            'LocationName': [f'Location {i}' for i in range(1, num_rows + 1)],
            'Region': np.random.choice(['North', 'South', 'East', 'West'], num_rows)
        }
    else:
        raise ValueError(f"Unknown table name: {table_name}")

    return pd.DataFrame(data)

# Build the Sales fact table
sales_df = generate_sales_data(num_rows=100)

# Build the four dimension tables, ten rows each
products_df, customers_df, dates_df, locations_df = (
    generate_dimension_data(dimension, num_rows=10)
    for dimension in ('Products', 'Customers', 'Dates', 'Locations')
)

# Preview the first rows of every table (fact table first)
for _preview in (sales_df, products_df, customers_df, dates_df, locations_df):
    display(_preview.head())


Unnamed: 0,Date,ProductID,CustomerID,LocationID,Quantity,Revenue
0,2023-02-21,2,102,202,62,2015
1,2023-04-03,1,103,203,109,2348
2,2023-01-15,1,101,202,184,1515
3,2023-03-13,1,102,201,106,4087
4,2023-03-02,1,101,203,85,3839


Unnamed: 0,ProductID,ProductName,Category
0,1,Product 1,Stationery
1,2,Product 2,Electronics
2,3,Product 3,Stationery
3,4,Product 4,Stationery
4,5,Product 5,Electronics


Unnamed: 0,CustomerID,CustomerName,Segment
0,101,Customer 1,Individual
1,102,Customer 2,Corporate
2,103,Customer 3,Individual
3,104,Customer 4,Individual
4,105,Customer 5,Individual


Unnamed: 0,Date,Day,Month,Year
0,2022-01-01,1,1,2022
1,2022-01-02,2,1,2022
2,2022-01-03,3,1,2022
3,2022-01-04,4,1,2022
4,2022-01-05,5,1,2022


Unnamed: 0,LocationID,LocationName,Region
0,201,Location 1,East
1,202,Location 2,West
2,203,Location 3,North
3,204,Location 4,East
4,205,Location 5,East
