# OLTP - Database
The database, as shown in the OLTP.jpg diagram, is meant to represent activity related to an e-commerce site. 

In this file I will:

1- Create the database (create_db.py)

2- Populate it with synthetic data (populate.py)

3- Inject errors into the synthetic data (generate_invalid.py)

The idea is to have dirty data to prove that our incremental ETL pipeline is resilient.

In [1]:
import pandas as pd

import sqlite3
from datetime import date

INITIAL_DATE = date(2026, 2, 2)

In [2]:
# Open the SQLite database file (sqlite3 creates the file if it does not exist yet).
# This connection is the one passed to the populate helper in the cell below.
conn = sqlite3.connect('ecommerce-OLTP.db')

In [3]:
# Creates db in sqlite.
# NOTE(review): main() is called without arguments, so the target database is
# chosen inside scripts/create_db.py — presumably the same 'ecommerce-OLTP.db'
# file opened above; confirm.
import scripts.create_db as create_db

create_db.main()

In [4]:
# Populates our db.
# Requests 100 users and 20 products for the initial load, using INITIAL_DATE
# (2026-02-02) as the join date.
import scripts.populate as populate

populate.populate_db_first_time(conn, num_users=100, num_products=20, join_date=INITIAL_DATE)


In [None]:
# Import error injection module
import scripts.generate_invalid as generate_invalid
from datetime import date

In [5]:
# Connect to SQLite database
# NOTE(review): this rebinds `conn` while the connection opened earlier in the
# notebook is never explicitly closed; consider closing it first or reusing it.
conn = sqlite3.connect('ecommerce-OLTP.db')

## Testing change functionality

### Users

In [6]:
# Snapshot of the users table before the update below, for comparison.
df_users = pd.read_sql_query('SELECT * FROM users', conn)
df_users

Unnamed: 0,user_id,name,email,join_date
0,1,Elijah Mcknight,desireekent@example.com,2026-02-02
1,2,Jessica Jones,qwalker@example.com,2026-02-02
2,3,Andrea Cole,shawncastaneda@example.net,2026-02-02
3,4,Dennis Mora,lisamcdonald@example.net,2026-02-02
4,5,Charles Fields,gshields@example.org,2026-02-02
...,...,...,...,...
95,96,Ernest Wilcox,bonillalarry@example.net,2026-02-02
96,97,Ronald Curtis,ngalvan@example.net,2026-02-02
97,98,Eddie Arroyo,james76@example.net,2026-02-02
98,99,Jimmy Thornton,solsen@example.net,2026-02-02


In [7]:
# Modify existing users in place. The before/after outputs show the emails
# changing while user_id, name and join_date stay the same.
# NOTE(review): 100 is presumably the number of users to modify — confirm
# against scripts/populate.py.
populate.change_existent_users(conn, 100)

In [8]:
# Re-read the users table to check the effect of the update above.
df_users = pd.read_sql_query('SELECT * FROM users', conn)
df_users

Unnamed: 0,user_id,name,email,join_date
0,1,Elijah Mcknight,patricia87@example.com,2026-02-02
1,2,Jessica Jones,torresheather@example.org,2026-02-02
2,3,Andrea Cole,dylanyoung@example.com,2026-02-02
3,4,Dennis Mora,deborah00@example.com,2026-02-02
4,5,Charles Fields,rflynn@example.org,2026-02-02
...,...,...,...,...
95,96,Ernest Wilcox,jmonroe@example.com,2026-02-02
96,97,Ronald Curtis,reginaneal@example.org,2026-02-02
97,98,Eddie Arroyo,whiteryan@example.net,2026-02-02
98,99,Jimmy Thornton,brandon41@example.net,2026-02-02


### Products

In [9]:
# Snapshot of the products table before the update below, for comparison.
df_products = pd.read_sql_query('SELECT * FROM products', conn)
df_products

Unnamed: 0,product_id,name,category,price,stock
0,1,No Commercial,Clothing,213.71,150
1,2,Hair Health,Clothing,223.45,69
2,3,Put Price,Beauty,209.83,138
3,4,Learn Study,Accessories,19.84,147
4,5,Officer Business,Footwear,434.99,35
5,6,Chance Increase,Clothing,27.57,108
6,7,Later Police,Beauty,182.92,178
7,8,Opportunity How,Footwear,14.38,49
8,9,Person Food,Clothing,201.49,198
9,10,Table Officer,Electronics,340.4,162


In [10]:
# Modify existing products in place. The before/after outputs show price and
# stock changing while product_id, name and category stay the same.
# NOTE(review): 20 is presumably the number of products to modify and
# price_range the bounds for the new prices — confirm against scripts/populate.py.
populate.change_existent_products(conn, 20, price_range=[20, 1000])

In [15]:
# Re-read the products table to check the effect of the update above.
df_products = pd.read_sql_query('SELECT * FROM products', conn)
df_products

Unnamed: 0,product_id,name,category,price,stock
0,1,No Commercial,Clothing,632.64,111
1,2,Hair Health,Clothing,434.31,29
2,3,Put Price,Beauty,360.73,93
3,4,Learn Study,Accessories,330.82,103
4,5,Officer Business,Footwear,67.72,17
5,6,Chance Increase,Clothing,420.77,65
6,7,Later Police,Beauty,326.51,154
7,8,Opportunity How,Footwear,904.41,13
8,9,Person Food,Clothing,185.41,163
9,10,Table Officer,Electronics,651.11,153


In [12]:
# Read the transactions table before any transactions are generated; the
# output below shows only the column headers, i.e. the table is still empty.
df_transactions = pd.read_sql_query('SELECT * FROM transactions', conn)
df_transactions


Unnamed: 0,transaction_id,date,user_id,product_id,quantity,price,payment_type,status


In [13]:
# Generate the first batch of transactions.
# INITIAL_DATE is reused instead of repeating date(2026, 2, 2), so the
# transaction date stays in sync with the join date used to seed the users.
# NOTE(review): the remaining arguments (300, 300, [0.7, 0.3]) are positional and
# their meaning is defined by populate.create_new_transactions — consider
# passing them by keyword once the signature is confirmed.
populate.create_new_transactions(conn, 300, INITIAL_DATE, 300, [0.7, 0.3])


Current ID: None, therefore, next_transaction_id: 1


291

In [16]:
# Re-read the transactions table to inspect the generated rows. The output
# shows that one transaction_id can span several rows (one per product).
df_transactions = pd.read_sql_query('SELECT * FROM transactions', conn)
df_transactions


Unnamed: 0,transaction_id,date,user_id,product_id,quantity,price,payment_type,status
0,1,2026-02-02,92,19,1,139.79,mastercard,failed
1,1,2026-02-02,92,11,1,896.73,mastercard,failed
2,1,2026-02-02,92,15,5,562.45,mastercard,failed
3,2,2026-02-02,61,5,3,203.16,mastercard,failed
4,3,2026-02-02,4,7,1,326.51,visa,success
...,...,...,...,...,...,...,...,...
286,236,2026-02-02,48,6,4,1683.08,mastercard,success
287,236,2026-02-02,48,1,4,2530.56,mastercard,success
288,237,2026-02-02,77,16,4,104.48,wire transfer,success
289,238,2026-02-02,8,5,2,135.44,other,success


In [None]:
# Capture the row count of every table before any errors are injected, so the
# verification cell further down can report how many rows were added.
print("Current database state before error injection:")
cursor = conn.cursor()  # notebook-level name; the post-injection cell reuses it
user_count, product_count, transaction_count = (
    cursor.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
    for table in ("users", "products", "transactions")
)

for label, total in (
    ("Users", user_count),
    ("Products", product_count),
    ("Transactions", transaction_count),
):
    print(f"  {label}: {total}")
print()

In [None]:
# Generate invalid records using our error injection script.
# This creates 50 invalid records with various data quality issues.
# `today` is derived from INITIAL_DATE (same '2026-02-02' string as before) so
# the injected rows stay aligned with the date used to seed the database
# instead of repeating the literal.
invalid_records = generate_invalid.generate_invalid_records(
    count=50,
    today=INITIAL_DATE.isoformat(),
)

# Display summary of what was generated
generate_invalid.print_summary(invalid_records)

In [None]:
# Insert the invalid records into the database
# NOTE(review): insert_invalid_records is not given `conn`, so it presumably
# opens its own connection — confirm it targets the same database file and
# commits, otherwise the counts read through `conn` afterwards will not
# reflect the inserted rows.
print("Inserting invalid records into OLTP database...")
inserted_count = generate_invalid.insert_invalid_records(invalid_records)
print(f"Successfully inserted {inserted_count} invalid records")

In [None]:
# Re-count every table after the injection and report the growth relative to
# the counts captured before it (reuses `cursor` and the *_count variables
# from the pre-injection cell).
print("\nDatabase state after error injection:")
new_user_count, new_product_count, new_transaction_count = (
    cursor.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
    for table in ("users", "products", "transactions")
)

users_added = new_user_count - user_count
products_added = new_product_count - product_count
transactions_added = new_transaction_count - transaction_count

print(f"  Users: {new_user_count} (+{users_added})")
print(f"  Products: {new_product_count} (+{products_added})")
print(f"  Transactions: {new_transaction_count} (+{transactions_added})")
print(f"  Total new records: {users_added + products_added + transactions_added}")

## Data Quality Issues Injected

The following types of invalid data were injected to test ETL robustness:

1. **Orphan Transactions** - Transactions referencing non-existent users/products
2. **Invalid Quantities** - Zero or negative quantity transactions  
3. **Price Violations** - Products with prices >= $10,000
4. **Price Mismatches** - Transaction prices not matching product prices
5. **Invalid Payment Types** - Unsupported payment methods (Bitcoin, PayPal, etc.)
6. **Invalid Status Values** - Transaction statuses outside success/failed
7. **Bad Date Formats** - Incorrectly formatted transaction dates
8. **Duplicate Transaction IDs** - Multiple transactions with same ID
9. **Invalid User Data** - Empty names, bad emails, NULL join dates
10. **Invalid Product Data** - Negative stock values

These data quality issues will test the ETL pipeline's ability to:
- Handle orphan records gracefully
- Validate data constraints
- Normalize inconsistent formats
- Detect and handle duplicates
- Apply business rules and data contracts

In [None]:
# Peek at up to ten transactions that trip the simplest data-quality checks:
# a transaction_id shared by several rows, a non-positive quantity, or a
# payment_type / status outside the accepted values.
# NOTE(review): the regular data already reuses a transaction_id across rows
# (one row per product in the earlier output), so the shared-ID check can also
# surface valid rows and crowd the 10-row sample; rows with a NULL
# payment_type or status are not matched by NOT IN either.
sample_sql = '''
    SELECT transaction_id, date, user_id, product_id, quantity, price, payment_type, status
    FROM transactions 
    WHERE transaction_id IN (
        SELECT transaction_id 
        FROM transactions 
        GROUP BY transaction_id 
        HAVING COUNT(*) > 1
    )
    OR quantity <= 0
    OR payment_type NOT IN ('visa', 'mastercard', 'wire transfer', 'other')
    OR status NOT IN ('success', 'failed')
    LIMIT 10
'''
divider = "\n" + "=" * 50

print("Sample Invalid Records Created:")
print(divider)

# Show some transactions with potential issues
df_sample_transactions = pd.read_sql_query(sample_sql, conn)

if df_sample_transactions.empty:
    print("No obvious problematic transactions found (may be in other error categories)")
else:
    print("Sample problematic transactions:")
    print(df_sample_transactions)

print(divider)

In [None]:
# Close the connection
# After this, `conn` and the `cursor` created from it can no longer be used;
# reconnect before re-running any of the query cells.
conn.close()