# Welcome to the Data Engineering Lab
Run the cells below to generate the datasets needed for the advanced modules.

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import os

# 1. Setup Data Directory
# The datasets are written to /home/jovyan/data by default. Set the
# LAB_DATA_DIR environment variable to write them somewhere else (for example
# when /home/jovyan does not exist on the machine running this notebook).
data_dir = os.environ.get('LAB_DATA_DIR', '/home/jovyan/data')

# exist_ok=True keeps this cell re-runnable: no error if the folder already exists.
os.makedirs(data_dir, exist_ok=True)

# Faker instance. NOTE(review): nothing in this cell uses it; kept in case a
# later cell relies on the `fake` name.
fake = Faker()
print("Generating Synthetic Data... This may take a minute.")

# 2. Generate 'Big' Data (1 Million Rows for Performance Tuning)
# We will simulate a 'Trips' dataset similar to NYC Taxi to teach OLAP concepts
row_count = 1_000_000

# Fixed seed: every run (and every student) gets exactly the same dataset,
# so results in later modules are comparable and Restart & Run All is reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# Create a DataFrame with random data.
# Columns are drawn from `rng` in the order listed below, so do not reorder
# them unless a changed dataset is acceptable.
df = pd.DataFrame({
    # Sequential surrogate key 0 .. row_count-1
    'trip_id': range(row_count),
    'vendor_id': rng.choice(['V1', 'V2'], row_count),
    # One pickup per second starting 2024-01-01 00:00:00.
    # Lower-case 's' is the current seconds alias; upper-case 'S' is deprecated
    # and emits a FutureWarning on recent pandas.
    'pickup_at': pd.date_range(start='2024-01-01', periods=row_count, freq='s'),
    # integers() excludes the upper bound, so this yields 1..6 passengers
    'passenger_count': rng.integers(1, 7, row_count),
    'trip_distance': rng.uniform(0.5, 20.0, row_count),
    'fare_amount': rng.uniform(5.0, 100.0, row_count)
})

# 3. Introduce 'Dirty Data' (Crucial for Data Engineering Drills)
# The frame is corrupted on purpose so students have something to clean later.
#
# df uses the default 0..N-1 index, and .loc label slices include BOTH
# endpoints, so slice(0, 100) covers rows 0 through 100.
missing_fare_rows = slice(0, 100)   # rows 0..100   -> 101 NaN (NULL) fares
outlier_rows = slice(101, 110)      # rows 101..110 -> 10 impossible trips

# Missing values: blank out the fare on the leading rows
df.loc[missing_fare_rows, 'fare_amount'] = np.nan

# Outliers: make the next rows impossibly long (5000 miles)
df.loc[outlier_rows, 'trip_distance'] = 5000

print(f"Generated {row_count} rows in memory.")

# 4-5. Persist the dataset in the two formats the later modules load.
# Both files are written into data_dir under fixed names.
parquet_path = f"{data_dir}/trips_1m.parquet"
csv_path = f"{data_dir}/trips_1m.csv"

# 4. Parquet (For DuckDB / Analytical Modules)
# Column-oriented storage, which is what the DuckDB lessons are built around.
df.to_parquet(parquet_path)
print(f"Saved Parquet: {parquet_path}")

# 5. CSV (For Postgres Loading Drills)
# Row-oriented plain text, used to demonstrate COPY command bottlenecks.
# index=False keeps the DataFrame index out of the file.
df.to_csv(csv_path, index=False)
print(f"Saved CSV: {csv_path}")

Generating Synthetic Data... This may take a minute.
Generated 1000000 rows in memory.


  'pickup_at': pd.date_range(start='2024-01-01', periods=row_count, freq='S'),


Saved Parquet: /home/jovyan/data/trips_1m.parquet
Saved CSV: /home/jovyan/data/trips_1m.csv
