In [6]:
import boto3
import pandas as pd
from io import BytesIO

# Where the raw Excel export lives (date-partitioned prefix in the dev bucket).
BUCKET = "online-retail-churn-siqi-dev"
KEY = "raw/online_retail/dt=2026-01-12/Online Retail.xlsx"

# S3 client bound to a named AWS profile instead of the default credentials.
session = boto3.Session(profile_name="siqi-dev")
s3 = session.client("s3")

# Pull the whole object into memory and parse the workbook from that buffer.
obj = s3.get_object(Bucket=BUCKET, Key=KEY)
with BytesIO(obj["Body"].read()) as workbook:
    df_raw = pd.read_excel(workbook, engine="openpyxl")

# (rows, columns) of the raw frame.
df_raw.shape

(541909, 8)

In [11]:
# Summary statistics for the numeric and datetime columns of the raw frame.
df_raw.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2011-07-04 13:34:57.156386048,4.611114,15287.69057
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


In [7]:
# Show the dtype pandas inferred for each column when reading the Excel file.
df_raw.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [9]:
# Tally the Python type of every StockCode cell to expose mixed-type values
# hiding behind the column's `object` dtype.
stock_code_types = df_raw["StockCode"].map(type)
stock_code_types.value_counts()

StockCode
<class 'int'>    487036
<class 'str'>     54873
Name: count, dtype: int64

In [10]:
# Tally the Python type of every InvoiceNo cell to expose mixed-type values
# hiding behind the column's `object` dtype.
invoice_no_types = df_raw["InvoiceNo"].map(type)
invoice_no_types.value_counts()

InvoiceNo
<class 'int'>    532618
<class 'str'>      9291
Name: count, dtype: int64

In [12]:
# Eyeball a handful of raw StockCode values.
# The seed is fixed so the same 20 rows appear on every Restart & Run All;
# an unseeded sample shows different rows each run and cannot be reproduced.
df_raw["StockCode"].sample(20, random_state=42)

134884    17012C
137310     21668
487583     22535
403296     23389
189880    79030G
316477    35911A
75488      21866
327647     21891
79096      22108
401885     22950
271240     22725
231416     47566
463322     22939
372082     22456
331580    84029G
288159     82600
136843     84817
9931       21202
402816     23117
351144     23356
Name: StockCode, dtype: object

In [23]:
# Eyeball a handful of raw InvoiceNo values.
# The seed is fixed so the same 20 rows appear on every Restart & Run All;
# an unseeded sample shows different rows each run and cannot be reproduced.
df_raw["InvoiceNo"].sample(20, random_state=42)

182062     552520
509492     579328
430739     573585
74737      542521
8194       537129
314345     564632
168878     551148
70933      542107
306532     563764
90022      543984
499200     578653
73742     C542376
298339     563016
540279     581475
133281     547772
506652     579147
141875     548522
364901     568676
181719     552508
238200     557937
Name: InvoiceNo, dtype: object

In [24]:
# Summary statistics for just the quantity and price columns, to check
# their ranges for implausible values.
numeric_cols = ["Quantity", "UnitPrice"]
df_raw[numeric_cols].describe()

Unnamed: 0,Quantity,UnitPrice
count,541909.0,541909.0
mean,9.55225,4.611114
std,218.081158,96.759853
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


In [25]:
# Share of rows with no CustomerID: the mean of a boolean mask is the
# fraction of True values.
customer_id_missing = df_raw["CustomerID"].isna()
customer_id_missing.mean()

np.float64(0.249266943342886)

**Findings from raw data profiling**

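- StockCode and InvoiceNo are business identifiers stored as `object` dtype with mixed underlying Python types (`int` and `str`): values containing letters (e.g. `17012C`, `C542376`) are strings, while purely numeric values are integers.
- Both columns therefore require explicit schema normalization (e.g. casting to string) before writing to Parquet.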
- Negative quantities and zero or negative unit prices exist (Quantity min = -80995, UnitPrice min = -11062.06) and represent returns or invalid transactions.
- About 24.9% of rows (135,080 of 541,909) lack CustomerID, making them unsuitable for customer-level modeling.