## ETL RETAIL DATA
### 1. EXTRACTION

#### Imports and Setting Seeds

In [10]:
import pandas as pd       # For data manipulation and analysis
import numpy as np        # For numerical operations
from faker import Faker   # For generating realistic fake data
import random             # For randomization
from datetime import datetime




# Fix the randomness so the output is the same every time  the script runs
faker = Faker()
random.seed(42)



#### Configuration and Product Catalog


In [11]:
NUM_RECORDS = 1000        # Total rows in dataset
START_YEAR = 2023         # Start of date range
END_YEAR = 2025           # End of date range


products = {
    "Electronics": [
        ("E101", "Bluetooth Speaker"),
        ("E102", "Smartphone Tripod"),
        ("E103", "Noise Cancelling Headphones")
    ],
    "Clothing": [
        ("C201", "Denim Jacket"),
        ("C202", "Sports Socks"),
        ("C203", "Wool Scarf")
    ],
    "Home Decor": [
        ("H301", "Ceramic Mug"),
        ("H302", "Decorative Cushion"),
        ("H303", "Floor Rug")
    ],
    "Toys": [
        ("T401", "Lego Set"),
        ("T402", "RC Car"),
        ("T403", "Stuffed Bear")
    ]
}


#### Flatten Product List
Purpose: Convert the nested products dictionary into a single flat list.

Before: Products are stored by category → each category has multiple (code, name) pairs.

After: A list where each entry is (ProductCode, ProductName, Category).

Why? Makes it easier to randomly select any product without first choosing a category.

In [12]:
product_list = [
    (code, name, cat)
    for cat, items in products.items()
    for code, name in items
]


#### Function to generate a random datetime between two years
A small function is made to give a random date between two years. This keeps dates realistic.

In [13]:



def random_date(start_year, end_year):
    return faker.date_time_between(
        start_date=datetime(start_year, 1, 1),   # Earliest date
        end_date=datetime(end_year, 8, 12)       # Latest date
    )

    


#### Generate Synthetic Data
We loop to create each record. Products, prices, and other details are chosen randomly. Some values are made wrong or missing on purpose to make the dataset messy.It Keeps data realistic but imperfect for transformation practice, mimicking real world messy data.

In [14]:
data = []
for _ in range(NUM_RECORDS):
    pid, pname, category = random.choice(product_list)  # Random product
    data.append({
        "InvoiceNo": faker.uuid4(),  # Unique invoice ID
        "ProductID": pid,
        "ProductName": pname if random.random() > 0.02 else None,  # 2% missing
        "Category": category,
        "Quantity": random.choice([random.randint(1, 20), None]),  # Missing some
        "UnitPrice": random.choice([round(random.uniform(5, 300), 2), -10.0]),  # Outlier
        "InvoiceDate": random_date(START_YEAR, END_YEAR),  # Random date
        "CustomerID": random.choice([random.randint(1, 100), None]),  # Missing IDs
        "Country": random.choice([faker.country(), ""])  # Empty country
    })


#### Create DataFrame

In [15]:
# Convert list of dictionaries into a pandas DataFrame
df = pd.DataFrame(data)

# Display first 5 rows for quick verification
print(df.head())


                              InvoiceNo ProductID        ProductName  \
0  9ac549b0-4a3c-4b18-9b5a-ac557634a295      T402             RC Car   
1  fe87fe9a-6a49-469f-b1a2-6bcc12843561      E101  Bluetooth Speaker   
2  e1a899ee-3677-4c49-9fe3-c90bc61ad7c9      C201       Denim Jacket   
3  e054706d-2329-4912-a7b0-44048c630fad      C202       Sports Socks   
4  612dc3c3-9dbe-45ab-9b07-e316fcf43ec6      E101  Bluetooth Speaker   

      Category  Quantity  UnitPrice         InvoiceDate  CustomerID Country  
0         Toys       9.0      70.85 2023-01-08 10:34:34        87.0          
1  Electronics       7.0     154.08 2024-06-20 13:55:55        72.0          
2     Clothing       9.0     228.85 2024-05-05 02:13:01         NaN          
3     Clothing      11.0      32.36 2024-08-13 12:12:22         NaN          
4  Electronics      18.0     -10.00 2024-06-08 18:58:08         NaN          


In [17]:
# Save to CSV
df.to_csv("retail_data.csv", index=False)
print("Synthetic retail data generated and saved to 'retail_data.csv'.")



Synthetic retail data generated and saved to 'retail_data.csv'.


### TRANSFORMATION 

#### Handle Outliers

In [18]:
# Remove rows with negative quantity or zero/negative prices
df = df[(df['Quantity'] >= 0) & (df['UnitPrice'] > 0)]


#### Handle Missing Values


In [21]:
# Fill missing product names and empty countries
df['ProductName'] = df['ProductName'].fillna('Unknown Product')
df['Country'] = df['Country'].replace('', 'Unknown')

# Remove rows without CustomerID
df = df.dropna(subset=['CustomerID'])


#### Remove Outliers

In [22]:
# Remove rows with negative quantity or non-positive prices
df = df[(df['Quantity'] >= 0) & (df['UnitPrice'] > 0)]


#### Calculate Total Sales

In [23]:
# Compute revenue per transaction
df['TotalSales'] = df['Quantity'] * df['UnitPrice']


#### Create Customer Salary

In [24]:
# Summarize total purchases per customer
customer_summary = df.groupby('CustomerID').agg({
    'TotalSales': 'sum',
    'Country': 'first'
}).reset_index()


#### Filter Last Year Sales

In [25]:
# Keep only sales from Aug 12, 2024 onwards
cutoff_date = datetime(2024, 8, 12)
df_last_year = df[df['InvoiceDate'] >= cutoff_date]


#### Check Results

In [26]:
df_last_year.info()           # Info about filtered dataset
customer_summary.head()       # First few rows of customer summary


<class 'pandas.core.frame.DataFrame'>
Index: 48 entries, 28 to 960
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   InvoiceNo    48 non-null     object        
 1   ProductID    48 non-null     object        
 2   ProductName  48 non-null     object        
 3   Category     48 non-null     object        
 4   Quantity     48 non-null     float64       
 5   UnitPrice    48 non-null     float64       
 6   InvoiceDate  48 non-null     datetime64[ns]
 7   CustomerID   48 non-null     float64       
 8   Country      48 non-null     object        
 9   TotalSales   48 non-null     float64       
dtypes: datetime64[ns](1), float64(4), object(5)
memory usage: 4.1+ KB


Unnamed: 0,CustomerID,TotalSales,Country
0,3.0,570.28,Thailand
1,4.0,3887.24,Bulgaria
2,6.0,1579.6,Tokelau
3,8.0,1892.64,Marshall Islands
4,9.0,253.32,Gabon


### Loading

#### Connect to SQLite

In [41]:
import sqlite3

conn = sqlite3.connect("retail_dw.db")
cursor = conn.cursor()


#### Create Dimension and Fact Table

In [42]:
# Create Customer, Time, Product dimensions and SalesFact table
cursor.execute("""
CREATE TABLE IF NOT EXISTS CustomerDim (
    CustomerPK INTEGER PRIMARY KEY,
    CustomerCode TEXT,
    Country TEXT
)
""")

cursor.execute("""
CREATE TABLE IF NOT EXISTS TimeDim (
    TimePK INTEGER PRIMARY KEY,
    InvoiceDate TEXT,
    Year INTEGER,
    Month INTEGER,
    Day INTEGER
)
""")

cursor.execute("""
CREATE TABLE IF NOT EXISTS ProductDim (
    ProductPK INTEGER PRIMARY KEY,
    ProductID TEXT,
    ProductName TEXT,
    Category TEXT
)
""")

cursor.execute("""
CREATE TABLE IF NOT EXISTS SalesFact (
    InvoiceNo TEXT PRIMARY KEY,
    TimePK INTEGER,
    ProductPK INTEGER,
    CustomerPK INTEGER,
    Quantity INTEGER,
    UnitPrice REAL,
    TotalSales REAL,
    FOREIGN KEY(CustomerPK) REFERENCES CustomerDim(CustomerPK),
    FOREIGN KEY(ProductPK) REFERENCES ProductDim(ProductPK),
    FOREIGN KEY(TimePK) REFERENCES TimeDim(TimePK)
)
""")

conn.commit()
print("Tables created successfully!")


Tables created successfully!


#### Prepare Dimension Data

In [43]:
# Customer dimension
df_customer = df_last_year[['CustomerID', 'Country']].drop_duplicates().reset_index(drop=True)
df_customer['CustomerPK'] = df_customer.index + 1  # Primary key
df_customer.to_sql('CustomerDim', conn, if_exists='replace', index=False)

# Time dimension
df_time = df_last_year[['InvoiceDate']].drop_duplicates().reset_index(drop=True)
df_time['TimePK'] = df_time.index + 1
df_time['Year'] = df_time['InvoiceDate'].dt.year
df_time['Month'] = df_time['InvoiceDate'].dt.month
df_time['Day'] = df_time['InvoiceDate'].dt.day
df_time.to_sql('TimeDim', conn, if_exists='replace', index=False)

# Product dimension
df_product = df_last_year[['ProductID','ProductName','Category']].drop_duplicates().reset_index(drop=True)
df_product['ProductPK'] = df_product.index + 1
df_product.to_sql('ProductDim', conn, if_exists='replace', index=False)
print(f"ProductDim loaded: {len(df_product)} rows")
display(df_product.head())


ProductDim loaded: 12 rows


Unnamed: 0,ProductID,ProductName,Category,ProductPK
0,E102,Smartphone Tripod,Electronics,1
1,H303,Floor Rug,Home Decor,2
2,T401,Lego Set,Toys,3
3,C202,Sports Socks,Clothing,4
4,T403,Stuffed Bear,Toys,5


#### Prepare and Load Fact Table

In [44]:
# Merge dimension keys into fact table
df_fact = df_last_year.merge(df_customer, on='CustomerID')
df_fact = df_fact.merge(df_time, on='InvoiceDate')
df_fact = df_fact.merge(df_product, on=['ProductID','ProductName','Category'])

# Calculate TotalSales
df_fact['TotalSales'] = df_fact['Quantity'] * df_fact['UnitPrice']

# Select and rename columns for SalesFact
df_fact_final = df_fact[['InvoiceNo','TimePK','ProductPK','CustomerPK','Quantity','UnitPrice','TotalSales']]
df_fact_final.to_sql('SalesFact', conn, if_exists='replace', index=False)

# Commit and close connection
conn.commit()
conn.close()
print(f"FactSales loaded: {len(df_fact_final)} rows")
display(df_fact_final.head())


FactSales loaded: 66 rows


Unnamed: 0,InvoiceNo,TimePK,ProductPK,CustomerPK,Quantity,UnitPrice,TotalSales
0,6097536c-f044-4427-8497-b0339e47baa1,1,1,1,19.0,175.78,3339.82
1,6097536c-f044-4427-8497-b0339e47baa1,1,1,25,19.0,175.78,3339.82
2,15e55b70-f7ea-4301-8097-03cc4ac5d802,2,1,2,5.0,251.21,1256.05
3,15e55b70-f7ea-4301-8097-03cc4ac5d802,2,1,7,5.0,251.21,1256.05
4,2194f7fc-74e4-4f8e-9d76-ee968053e2ac,3,2,3,5.0,118.0,590.0
