# Week 2 — ETL/ELT Mini Pipeline

**Objectives**
- Extract from CSV/JSON/APIs
- Transform: clean, cast, enrich
- Load into SQLite or DuckDB, run analytics queries

## 0) Setup

In [None]:
# Colab users can install any missing packages first. Prefer the %pip magic
# over !pip so the install targets the running kernel's environment:
# %pip install -q pandas duckdb requests
import pandas as pd, duckdb, sqlite3, io, json
# Print the pandas version so the run records which release produced the outputs.
print("pandas:", pd.__version__)

## 1) Extract

In [None]:
# Inline sample of the orders feed, standing in for a CSV file on disk.
# Order 1003 has the literal token NaN as its amount, which pandas parses
# as a missing value.
csv_text = '''order_id,customer,country,amount,ts
1001,Alice,US,120.50,2024-06-01
1002,Bob,UK,85.00,2024-06-02
1003,Chandra,IN,NaN,2024-06-02
1004,Diego,US,43.20,2024-06-03
1005,Eva,DE,69.99,2024-06-03
'''

# Present the text to pandas through an in-memory, file-like buffer.
with io.StringIO(csv_text) as csv_buffer:
    df_raw = pd.read_csv(csv_buffer)

# Preview the first rows of the raw extract.
df_raw.head()

## 2) Transform

In [None]:
# Work on a copy so the raw extract stays untouched and this cell can be
# re-run from df_raw at any time.
df = df_raw.copy()

# Clean: coerce amounts to numeric first (unparseable values become NaN),
# then impute the gaps with the median of the *coerced* column. Taking the
# median before coercion fails as soon as the raw column holds a non-numeric
# string, because the column is then object dtype.
df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
df['amount'] = df['amount'].fillna(df['amount'].median())

# Cast: parse the timestamp strings into datetime64 values.
df['ts'] = pd.to_datetime(df['ts'])

# Enrich: calendar day for daily roll-ups, and a 0/1 flag for US orders.
df['day'] = df['ts'].dt.date
df['is_us'] = (df['country'] == 'US').astype(int)
df

## 3A) Load → SQLite + queries

In [None]:
# Create a throwaway in-memory SQLite database and write the cleaned frame
# into a table named "orders" (without the DataFrame index).
conn = sqlite3.connect(":memory:")
df.to_sql(name="orders", con=conn, if_exists="replace", index=False)

# Revenue per country, rounded to 2 decimals, highest revenue first.
q_rev_by_country = pd.read_sql_query(
    sql="""
SELECT country, ROUND(SUM(amount),2) as revenue
FROM orders GROUP BY country ORDER BY revenue DESC
""",
    con=conn,
)

# Order count and rounded revenue per day, in day order.
q_daily = pd.read_sql_query(
    sql="""
SELECT day, COUNT(*) as orders, ROUND(SUM(amount),2) as revenue
FROM orders GROUP BY day ORDER BY day
""",
    con=conn,
)

# Render both result frames, country breakdown first.
display(q_rev_by_country)
display(q_daily)

## 3B) Load → DuckDB + query

In [None]:
# Open an in-memory DuckDB database and register the pandas frame so SQL can
# refer to it by the name orders_df.
con = duckdb.connect(database=':memory:')
con.register('orders_df', df)

# Revenue per country, rounded to 2 decimals and sorted descending; the
# result is fetched back as a pandas DataFrame.
duckdb_rev = con.execute(
    'SELECT country, ROUND(SUM(amount),2) AS revenue FROM orders_df GROUP BY country ORDER BY revenue DESC'
).fetchdf()
duckdb_rev

## 4) Save artifacts

In [None]:
# Write the cleaned rows to CSV, leaving out the DataFrame index.
df.to_csv('orders_clean.csv', index=False)

# Write a small JSON sidecar mapping each column name to its dtype string.
with open('orders_schema.json','w') as schema_file:
    schema_file.write(
        json.dumps({"columns": df.dtypes.astype(str).to_dict()}, indent=2)
    )

print("Saved: orders_clean.csv, orders_schema.json")