# OLTP - Database
The database, as shown in the OLTP.jpg diagram, is meant to represent activity related to an e-commerce site. 

In this file I will:

1- Create the database (create_db.py)

2- Populate it with synthetic data (populate.py)

3- Inject errors into the synthetic data (generate_invalid.py)

The goal is to produce dirty data so we can verify that our incremental ETL pipeline is resilient.

In [1]:
import pandas as pd

import sqlite3
from datetime import date

# Date used for the seed data: passed as join_date to populate_db_first_time further down.
INITIAL_DATE = date(2026, 2, 2)

In [2]:
# Open the SQLite file backing the OLTP database; this handle is passed to the populate helpers.
conn = sqlite3.connect('ecommerce-OLTP.db')

In [3]:
# Build the OLTP database by running the project's create_db script.
# NOTE(review): presumably creates the users/products/transactions tables queried later
# in this notebook — confirm in scripts/create_db.py.
import scripts.create_db as create_db

create_db.main()

In [4]:
# Seed the database with the initial synthetic data set: 100 users and 20 products,
# using INITIAL_DATE as the join_date (the users output below shows 2026-02-02 on every row).
import scripts.populate as populate

populate.populate_db_first_time(conn, num_users=100, num_products=20, join_date=INITIAL_DATE)


In [5]:
# Import error injection module
import scripts.generate_invalid as generate_invalid
from datetime import date

In [6]:
# Reconnect to the SQLite database.
# The handle opened earlier in the notebook is closed first so it is not left dangling
# (and unclosed) when `conn` is rebound to the new connection.
conn.close()
conn = sqlite3.connect('ecommerce-OLTP.db')

## Testing change functionality

### Users

In [7]:
# Baseline snapshot of the users table, taken before change_existent_users is applied.
df_users = pd.read_sql_query('SELECT * FROM users', conn)
df_users

Unnamed: 0,user_id,name,email,join_date
0,1,Jessica Baker,dweber@example.com,2026-02-02
1,2,Kelly Morales,chandleranthony@example.com,2026-02-02
2,3,Candice Duncan,fergusondarlene@example.net,2026-02-02
3,4,James Schultz,hjohnson@example.org,2026-02-02
4,5,Kristen Horne,brian24@example.com,2026-02-02
...,...,...,...,...
95,96,Kayla Stevens,gwendolynanthony@example.com,2026-02-02
96,97,Sean Myers,doris75@example.com,2026-02-02
97,98,Roberta Black,danny29@example.net,2026-02-02
98,99,Tyler Roberts,isaac12@example.net,2026-02-02


In [8]:
# Modify existing users. Comparing the outputs before and after this cell, e-mail addresses
# change while user_id, name and join_date stay the same.
# NOTE(review): the second argument (100) is presumably how many users to change —
# confirm in scripts/populate.py.
populate.change_existent_users(conn, 100)

In [9]:
# Re-read the users table to inspect the effect of change_existent_users.
df_users = pd.read_sql_query('SELECT * FROM users', conn)
df_users

Unnamed: 0,user_id,name,email,join_date
0,1,Jessica Baker,carlawilliams@example.com,2026-02-02
1,2,Kelly Morales,wardkayla@example.net,2026-02-02
2,3,Candice Duncan,samanthacuevas@example.com,2026-02-02
3,4,James Schultz,kbrown@example.net,2026-02-02
4,5,Kristen Horne,donnaclarke@example.net,2026-02-02
...,...,...,...,...
95,96,Kayla Stevens,delgadotaylor@example.net,2026-02-02
96,97,Sean Myers,frenchshaun@example.org,2026-02-02
97,98,Roberta Black,rogersjohn@example.net,2026-02-02
98,99,Tyler Roberts,jenny69@example.org,2026-02-02


### Products

In [10]:
# Baseline snapshot of the products table, taken before change_existent_products is applied.
df_products = pd.read_sql_query('SELECT * FROM products', conn)
df_products

Unnamed: 0,product_id,name,category,price,stock
0,1,Night Reach,Accessories,397.53,169
1,2,Spring South,Beauty,37.92,136
2,3,There American,Accessories,161.92,101
3,4,Stand Opportunity,Clothing,17.59,16
4,5,Again American,Clothing,140.46,186
5,6,Range Increase,Footwear,487.48,95
6,7,Character From,Electronics,106.52,23
7,8,Son Home,Accessories,341.07,145
8,9,Suddenly Gas,Accessories,334.61,66
9,10,Suddenly Hold,Footwear,38.61,65


In [11]:
# Modify existing products. Comparing the outputs before and after this cell, prices change
# while product_id, name, category and stock stay the same.
# NOTE(review): the second argument (20) is presumably how many products to change and
# price_range the [min, max] bounds for the new prices — confirm in scripts/populate.py.
populate.change_existent_products(conn, 20, price_range=[20, 1000])

In [12]:
# Re-read the products table to inspect the effect of change_existent_products.
df_products = pd.read_sql_query('SELECT * FROM products', conn)
df_products

Unnamed: 0,product_id,name,category,price,stock
0,1,Night Reach,Accessories,75.75,169
1,2,Spring South,Beauty,506.91,136
2,3,There American,Accessories,114.93,101
3,4,Stand Opportunity,Clothing,97.22,16
4,5,Again American,Clothing,893.07,186
5,6,Range Increase,Footwear,683.57,95
6,7,Character From,Electronics,898.6,23
7,8,Son Home,Accessories,354.92,145
8,9,Suddenly Gas,Accessories,877.23,66
9,10,Suddenly Hold,Footwear,126.67,65


## Generate transactions

In [13]:
# Transactions table before any transactions are generated (the output below has no rows).
df_transactions = pd.read_sql_query('SELECT * FROM transactions', conn)
df_transactions


Unnamed: 0,transaction_id,date,user_id,product_id,quantity,price,payment_type,status


In [14]:
# Generate the first batch of transactions, dated INITIAL_DATE (2026-02-02) so the date
# stays in sync with the seed data instead of being repeated as a literal. `date` is
# already imported at the top of the notebook, so it is not re-imported here.
# NOTE(review): the meaning of the other positional arguments (300, 300, [0.7, 0.3]) is
# defined in scripts/populate.py — confirm there. The displayed return value (272) matches
# the transactions row count reported later in this notebook.
populate.create_new_transactions(conn, 300, INITIAL_DATE, 300, [0.7, 0.3])


272

In [15]:
# Re-read the transactions table to inspect the rows created by create_new_transactions.
df_transactions = pd.read_sql_query('SELECT * FROM transactions', conn)
df_transactions


Unnamed: 0,transaction_id,date,user_id,product_id,quantity,price,payment_type,status
0,1,2026-02-02,46,1,3,227.25,visa,success
1,2,2026-02-02,91,2,1,506.91,mastercard,success
2,3,2026-02-02,53,15,1,552.31,visa,failed
3,3,2026-02-02,53,20,5,396.25,visa,failed
4,3,2026-02-02,53,8,5,1774.60,visa,failed
...,...,...,...,...,...,...,...,...
267,212,2026-02-02,98,1,3,227.25,wire transfer,failed
268,213,2026-02-02,6,10,4,506.68,mastercard,failed
269,214,2026-02-02,14,1,1,75.75,visa,success
270,215,2026-02-02,61,17,3,202.17,visa,success


In [16]:
# Capture baseline row counts for each table before any invalid records are injected.
# `cursor` and the three *_count variables are reused by the post-injection comparison cell.
print("Current database state before error injection:")
cursor = conn.cursor()

# sqlite3's Cursor.execute returns the cursor itself, so the fetch can be chained onto it.
user_count = cursor.execute("SELECT COUNT(*) FROM users").fetchone()[0]
product_count = cursor.execute("SELECT COUNT(*) FROM products").fetchone()[0]
transaction_count = cursor.execute("SELECT COUNT(*) FROM transactions").fetchone()[0]

# One print call: the three report lines plus the trailing blank line.
print(
    f"  Users: {user_count}",
    f"  Products: {product_count}",
    f"  Transactions: {transaction_count}",
    "",
    sep="\n",
)

Current database state before error injection:
  Users: 100
  Products: 20
  Transactions: 272



In [23]:
# Build a batch of deliberately invalid records with the error-injection script.
# `count` is (per its name) the number of invalid records requested; `today` is the
# reference date handed to the generator as an ISO string ('2026-02-02'), derived from
# INITIAL_DATE so it stays in sync with the seed data.
# The result is displayed below; it is a dict keyed by table name (e.g. 'users').
invalid_records = generate_invalid.generate_invalid_records(
    count=100,
    today=INITIAL_DATE.isoformat(),
)

invalid_records

{'users': [{'user_id': 116,
   'name': 'User 37',
   'email': 'user116@example.com',
   'join_date': None},
  {'user_id': 117,
   'name': 'User 83',
   'email': 'user@',
   'join_date': '2025-01-15'},
  {'user_id': 118,
   'name': 'User 13',
   'email': 'user @email.com',
   'join_date': '2025-01-15'},
  {'user_id': 119,
   'name': 'User 73',
   'email': 'user.example.com',
   'join_date': '2025-01-15'},
  {'user_id': 120,
   'name': 'User 90',
   'email': 'user120@example.com',
   'join_date': None},
  {'user_id': 121,
   'name': 'User 9',
   'email': 'user @email.com',
   'join_date': '2025-01-15'},
  {'user_id': 122,
   'name': 'User 40',
   'email': 'user.example.com',
   'join_date': '2025-01-15'},
  {'user_id': 123,
   'name': 'User 42',
   'email': 'user123@example.com',
   'join_date': None},
  {'user_id': 124,
   'name': 'User 37',
   'email': 'user@',
   'join_date': '2025-01-15'},
  {'user_id': 125,
   'name': 'User 47',
   'email': 'user125@example.com',
   'join_date': Non

In [19]:
# Insert the invalid records into the database and report how many were written.
# NOTE(review): insert_invalid_records is not given `conn`, so it presumably opens its own
# connection to the database file — confirm in scripts/generate_invalid.py.
print("Inserting invalid records into OLTP database...")
inserted_count = generate_invalid.insert_invalid_records(invalid_records)
print(f"Successfully inserted {inserted_count} invalid records")

Inserting invalid records into OLTP database...
Successfully inserted 50 invalid records


In [20]:
# Re-count every table and report how many rows the error injection added.
# Relies on `cursor` and the baseline *_count variables from the pre-injection cell.
print("\nDatabase state after error injection:")
new_user_count = cursor.execute("SELECT COUNT(*) FROM users").fetchone()[0]
new_product_count = cursor.execute("SELECT COUNT(*) FROM products").fetchone()[0]
new_transaction_count = cursor.execute("SELECT COUNT(*) FROM transactions").fetchone()[0]

# Per-table growth relative to the baseline.
users_added = new_user_count - user_count
products_added = new_product_count - product_count
transactions_added = new_transaction_count - transaction_count

print(f"  Users: {new_user_count} (+{users_added})")
print(f"  Products: {new_product_count} (+{products_added})")
print(f"  Transactions: {new_transaction_count} (+{transactions_added})")
print(f"  Total new records: {users_added + products_added + transactions_added}")


Database state after error injection:
  Users: 115 (+15)
  Products: 28 (+8)
  Transactions: 299 (+27)
  Total new records: 50


## Data Quality Issues Injected

The following types of invalid data were injected to test ETL robustness:

1. **Orphan Transactions** - Transactions referencing non-existent users/products
2. **Invalid Quantities** - Zero or negative quantity transactions  
3. **Price Violations** - Products with prices >= $10,000
4. **Price Mismatches** - Transaction prices not matching product prices
5. **Invalid Payment Types** - Unsupported payment methods (Bitcoin, PayPal, etc.)
6. **Invalid Status Values** - Transaction statuses outside success/failed
7. **Bad Date Formats** - Incorrectly formatted transaction dates
8. **Duplicate Transaction IDs** - Multiple transactions with same ID
9. **Invalid User Data** - Empty names, bad emails, NULL join dates
10. **Invalid Product Data** - Negative stock values

These data quality issues will test the ETL pipeline's ability to:
- Handle orphan records gracefully
- Validate data constraints
- Normalize inconsistent formats
- Detect and handle duplicates
- Apply business rules and data contracts

Sample Invalid Records Created:

Sample problematic transactions:
   transaction_id        date  user_id  product_id  quantity    price  \
0               3  2026-02-02       53          15         1   552.31   
1               3  2026-02-02       53          20         5   396.25   
2               3  2026-02-02       53           8         5  1774.60   
3               3  2026-02-02       53          12         2   220.30   
4               5  2026-02-02       84          11         4  2579.00   
5               5  2026-02-02       84          13         4  3926.68   
6               5  2026-02-02       84          12         2   220.30   
7               5  2026-02-02       84           8         2   709.84   
8               9  2026-02-02       84          16         2  1815.72   
9               9  2026-02-02       84           4         3   291.66   

    payment_type   status  
0           visa   failed  
1           visa   failed  
2           visa   failed  
3           visa  

In [None]:
# Close the notebook's shared connection; `conn` (and cursors created from it) cannot be
# used for further queries after this point.
conn.close()

In [28]:
# Final look at the transactions table, now including the injected invalid rows.
# The shared `conn` is closed by the preceding cell, and querying a closed connection
# raises sqlite3.ProgrammingError, so this cell opens (and always closes) its own
# short-lived connection instead of reusing `conn`.
final_conn = sqlite3.connect('ecommerce-OLTP.db')
try:
    df_transactions = pd.read_sql_query('SELECT * FROM transactions', final_conn)
finally:
    final_conn.close()

df_transactions


Unnamed: 0,transaction_id,date,user_id,product_id,quantity,price,payment_type,status
0,1,2026-02-02,46,1,3,227.25,visa,success
1,2,2026-02-02,91,2,1,506.91,mastercard,success
2,3,2026-02-02,53,15,1,552.31,visa,failed
3,3,2026-02-02,53,20,5,396.25,visa,failed
4,3,2026-02-02,53,8,5,1774.60,visa,failed
...,...,...,...,...,...,...,...,...
294,5734,02-02-2026,9,13,5,16.70,other,failed
295,8491,2026-02-02,66,9,2,75.47,mastercard,unknown
296,3718,2026-02-02,59,1,-4,72.90,wire transfer,success
297,7990,2026-02-02,50,9,-2,22.84,visa,success
