In [129]:
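# Packages required by this notebook (uncomment a line to install into the kernel):
# %pip install kagglehub    # downloads datasets from Kaggle
# %pip install pyarrow      # Parquet export (imported below as pa / pq)
# %pip install fastparquet  # optional alternative Parquet engine; not imported here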

import kagglehub
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

# Fetch the latest version of the Olist dataset; `path` is the folder that
# holds the downloaded CSV files (printed below)
path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")

# Cleaned tables are written to <parent of working directory>/data_cleaned
PROJECT_ROOT = Path("..")
DATA_CLEANED_DIR = PROJECT_ROOT.joinpath("data_cleaned")
DATA_CLEANED_DIR.mkdir(exist_ok=True)

print(f"Path to dataset files: {path}")



Path to dataset files: C:\Users\simon\.cache\kagglehub\datasets\olistbr\brazilian-ecommerce\versions\2


In [130]:
def data_profile(df):
    """
    Print a quick exploratory profile of a pandas DataFrame.

    The sections are printed in this order: shape, column names, dtypes,
    describe() statistics, missing values per column, and the number of
    fully duplicated rows. Nothing is returned.
    """

    # Each entry pairs a label with a deferred computation, so a section is
    # only evaluated right before it is printed (same order as listed here).
    sections = (
        # (rows, columns)
        ("Data shape", lambda: df.shape),
        # Column names as a plain list
        ("Data columns", lambda: df.columns.tolist()),
        # dtype of every column (int, float, object, datetime, etc.)
        ("Data types", lambda: df.dtypes),
        # Descriptive statistics as produced by DataFrame.describe()
        ("Data description", lambda: df.describe()),
        # Missing values per column, to surface data quality issues
        ("Data missing values", lambda: df.isnull().sum()),
        # Count of rows that duplicate an earlier row
        ("Data duplicates", lambda: df.duplicated().sum()),
    )

    for position, (label, compute) in enumerate(sections):
        # Every section after the first is preceded by a blank line
        separator = "" if position == 0 else "\n"
        print(f"{separator}{label}: {compute()}")


def report(df, name):
    """
    Print a short validation report for a DataFrame.

    Output, in order:
    - a confirmation line built from `name` (this function only prints;
      it does not write or check any file itself)
    - the DataFrame shape as (rows, columns)
    - the first rows returned by df.head()
    """

    # Confirmation line, labelled with the dataset name for traceability
    print("✅ {}.csv saved!".format(name))

    # Size check: lets the reader compare against the expected row/column counts
    print(f"Shape: {df.shape}")

    # Visual check of a small sample
    preview = df.head()
    print(preview)


def export_clean(df, name, out_dir=DATA_CLEANED_DIR):
    """
    Export a cleaned DataFrame to both CSV and Parquet formats.

    Purpose:
    - CSV: human-readable format for inspection and versioning
    - Parquet: optimized columnar format for performance and type safety

    Parameters:
    - df: pandas DataFrame to export (the index is not written to either file)
    - name: base file name, without extension
    - out_dir: target directory, as a str or Path (defaults to DATA_CLEANED_DIR);
      it is created, including missing parents, if it does not exist yet

    Raises:
    - FileExistsError: if out_dir exists but is not a directory
    """

    # Ensure output directory exists.
    # Accept plain strings as well as Path objects, and create the directory
    # instead of relying on `assert` (assertions are skipped under `python -O`).
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Build output file paths
    csv_path = os.path.join(out_dir, f"{name}.csv")
    parquet_path = os.path.join(out_dir, f"{name}.parquet")

    # Export to CSV (universal, easy to inspect)
    df.to_csv(csv_path, index=False)

    # Export to Parquet using pyarrow
    # This approach bypasses some pandas/pyarrow compatibility issues
    # and ensures a robust Parquet write
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, parquet_path)

    # Confirmation message
    print(f"✅ Saved: {csv_path} and {parquet_path}")



### 1. Orders Table

#### 1.1 Load raw data




In [131]:
# Read the raw orders table from the downloaded dataset folder
orders_raw = pd.read_csv(Path(path) / "olist_orders_dataset.csv")


#### 1.2 Profiling

- Reviewed dataset structure, columns, and data types
- Analyzed order status distribution
- Identified key timestamps for order lifecycle analysis
- Flagged early-stage statuses not relevant for delivery performance analysis

In [132]:
# Quick data profile of the raw orders table:
# shape, columns, dtypes, describe(), missing values and duplicated rows
data_profile(orders_raw)

Data shape: (99441, 8)

Data columns: ['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

Data types: order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

Data description:                                 order_id                       customer_id  \
count                              99441                             99441   
unique                             99441                             99441   
top     66dea50a8b16d9b4dee7af250b4be1a5  edb027a75a1449115f6b43211ae02a24   
freq                                   1                                 1   

       order_s

#### 1.3 Data Cleaning

- Selected columns required for order lifecycle and delivery analysis
- Converted timestamp fields to datetime format
- Standardized order_status values to lowercase
- Excluded early-stage orders (`created`, `approved`)
- Applied data quality checks (primary key uniqueness, non-null status)


In [133]:
# ------------------------------------------------------------------------------
# Settings for the orders cleaning step
# date_cols: timestamp columns parsed to pandas datetime
# EXCLUDED_ORDER_STATUSES: early-stage statuses dropped by the business rule
# ------------------------------------------------------------------------------
date_cols = [
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date"
]
EXCLUDED_ORDER_STATUSES = {"created", "approved"}

# ------------------------------------------------------------------------------
# Cleaning pipeline (each step returns a new frame; orders_raw is not modified)
# 1. Keep only the columns needed for order lifecycle and delivery analysis
# 2. Parse the timestamp columns; values that cannot be parsed become NaT
# 3. Lowercase order_status to prevent case inconsistencies
# 4. Drop orders whose status is in EXCLUDED_ORDER_STATUSES
# ------------------------------------------------------------------------------
orders_clean = (
    orders_raw
    .loc[:, [
        "order_id",
        "customer_id",
        "order_status",
        "order_purchase_timestamp",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ]]
    .assign(
        **{
            # col=col binds the current column name to each lambda
            col: (lambda frame, col=col: pd.to_datetime(frame[col], errors="coerce"))
            for col in date_cols
        },
        order_status=lambda frame: frame["order_status"].str.lower(),
    )
    .loc[lambda frame: ~frame["order_status"].isin(EXCLUDED_ORDER_STATUSES)]
)

#### 1.4 Data quality checks

In [134]:
# ------------------------------------------------------------------------------
# Data quality checks (assertions)
# Abort early when a key assumption about the cleaned orders does not hold
# ------------------------------------------------------------------------------

# Primary key: order_id must be present on every row and must never repeat
assert not orders_clean["order_id"].isna().any(), "order_id contains missing values"
assert orders_clean["order_id"].is_unique, "order_id is not unique"

# Every order must carry a status
assert not orders_clean["order_status"].isna().any(), "order_status contains missing values"

#### 1.5 Export

**Output**
- `data_cleaned/orders_clean.csv`
- `data_cleaned/orders_clean.parquet`

In [135]:
# ------------------------------------------------------------------------------
# Export cleaned dataset
# Save the cleaned table for downstream analysis and dashboarding.
# Writes orders_clean.csv and orders_clean.parquet into export_clean's default
# output directory (DATA_CLEANED_DIR).
# ------------------------------------------------------------------------------
export_clean(orders_clean, "orders_clean")

# ------------------------------------------------------------------------------
# Sanity check report
# Quick visual validation: shape + head.
# report() only prints; it does not write or re-read the exported files.
# ------------------------------------------------------------------------------
report(orders_clean, "orders_clean")

✅ Saved: ..\data_cleaned\orders_clean.csv and ..\data_cleaned\orders_clean.parquet
✅ orders_clean.csv saved!
Shape: (99434, 6)
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp order_delivered_customer_date  \
0    delivered      2017-10-02 10:56:33           2017-10-10 21:25:13   
1    delivered      2018-07-24 20:41:37           2018-08-07 15:27:45   
2    delivered      2018-08-08 08:38:49           2018-08-17 18:06:29   
3    delivered      2017-11-18 19:28:06           2017-12-02 00:28:42   
4    delivered      2018-02-13 21:18:39           2018-02-16 18:17:02

### 2. Customers Table

#### 2.1 Load raw data

In [136]:
# Read the raw customers table from the downloaded dataset folder
customers_raw = pd.read_csv(Path(path) / "olist_customers_dataset.csv")

#### 2.2 Profiling

- Inspected dataset structure and data types
- Checked for missing values
- Identified customer_id as the primary key
- Identified customer_state for regional analysis

In [137]:
# Quick data profile of the raw customers table:
# shape, columns, dtypes, describe(), missing values and duplicated rows
data_profile(customers_raw) 

Data shape: (99441, 5)

Data columns: ['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']

Data types: customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

Data description:        customer_zip_code_prefix
count              99441.000000
mean               35137.474583
std                29797.938996
min                 1003.000000
25%                11347.000000
50%                24416.000000
75%                58900.000000
max                99990.000000

Data missing values: customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

Data duplicates: 0


#### 2.3 Data cleaning

- Retained only customer_id and customer_state
- Applied data quality checks (primary key uniqueness, non-null state codes)
- Validated customer_state format

In [138]:
# ------------------------------------------------------------------------------
# Column selection
# Retain the join key (customer_id) and the regional dimension (customer_state).
# .copy() detaches the result from customers_raw so later edits cannot
# trigger chained-assignment warnings.
# ------------------------------------------------------------------------------
customers_clean = customers_raw.loc[:, ["customer_id", "customer_state"]].copy()

#### 2.4 Data quality checks

In [139]:
# ------------------------------------------------------------------------------
# Data quality checks (assertions)
# Confirm the expectations below before the customers table is exported
# ------------------------------------------------------------------------------

# Primary key: customer_id must be present on every row and must never repeat
assert not customers_clean["customer_id"].isna().any(), \
    "customer_id contains missing values"
assert customers_clean["customer_id"].is_unique, \
    "customer_id is not unique"

# Categorical integrity: every customer must have a state
assert not customers_clean["customer_state"].isna().any(), \
    "customer_state contains missing values"

# Format check: state codes are expected to be exactly two characters long
assert (customers_clean["customer_state"].str.len() == 2).all(), \
    "Invalid customer_state code detected"

#### 2.5 Export

**Output**
- `data_cleaned/customers_clean.csv`
- `data_cleaned/customers_clean.parquet`

In [140]:
# ------------------------------------------------------------------------------
# Export cleaned dataset
# Save the cleaned customers table as CSV and Parquet in export_clean's default
# output directory (DATA_CLEANED_DIR)
# ------------------------------------------------------------------------------
export_clean(customers_clean, "customers_clean")

# ------------------------------------------------------------------------------
# Sanity checks
# Print the shape and the first rows to validate the cleaning process
# ------------------------------------------------------------------------------
report(customers_clean, "customers_clean")

✅ Saved: ..\data_cleaned\customers_clean.csv and ..\data_cleaned\customers_clean.parquet
✅ customers_clean.csv saved!
Shape: (99441, 2)
                        customer_id customer_state
0  06b8999e2fba1a1fbc88172c00ba8bc7             SP
1  18955e83d337fd6b2def6b18a428ac77             SP
2  4e7b3e00288586ebd08712fdd0374a03             SP
3  b2b6027bc5c5109e529d4dc6358b12c3             SP
4  4f2d8ab171c80ec8364f7c12e35b23ad             SP


### 3. Order Items Table

#### 3.1 Load raw data





In [141]:
# Read the raw order items table from the downloaded dataset folder
order_items_raw = pd.read_csv(Path(path) / "olist_order_items_dataset.csv")

#### 3.2 Profiling

- Reviewed dataset structure and data types
- Analyzed price and freight_value distributions
- Identified key columns for revenue and shipping cost analysis

In [142]:

# ------------------------------------------------------------------------------
# Order items – Data profiling
# Structure and data quality overview, followed by a closer look at the two
# monetary columns
# ------------------------------------------------------------------------------

# High-level overview of the raw order_items dataset
# (shape, columns, data types, describe(), missing values, duplicates)
data_profile(order_items_raw)

# ------------------------------------------------------------------------------
# Numerical exploration: price, then freight_value
# Summary statistics help to understand revenue patterns (price) and to spot
# potential outliers in shipping costs (freight_value)
# ------------------------------------------------------------------------------
for label, column in (("Price", "price"), ("Freight", "freight_value")):
    print(f"\n{label} summary:")
    print(order_items_raw[column].describe())


Data shape: (112650, 7)

Data columns: ['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']

Data types: order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

Data description:        order_item_id          price  freight_value
count  112650.000000  112650.000000  112650.000000
mean        1.197834     120.653739      19.990320
std         0.705124     183.633928      15.806405
min         1.000000       0.850000       0.000000
25%         1.000000      39.900000      13.080000
50%         1.000000      74.990000      16.260000
75%         1.000000     134.900000      21.150000
max        21.000000    6735.000000     409.680000

Data missing values: order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_li

#### 3.3 Data Cleaning

- Selected relevant columns for analysis
- Converted price and freight_value to numeric types
- Applied data quality checks (non-null, non-negative values)


In [143]:
# ------------------------------------------------------------------------------
# Column selection + data type conversion
# 1. Keep only the columns required for revenue and shipping cost analysis
#    (.loc with a column list returns a new frame, so order_items_raw is untouched)
# 2. Force price and freight_value to numeric for reliable aggregations;
#    values that cannot be converted become NaN
# NOTE(review): order_item_id is not retained, so the cleaned rows carry no
# per-item identifier — confirm this is acceptable for downstream joins.
# ------------------------------------------------------------------------------
order_items_clean = (
    order_items_raw
    .loc[:, ["order_id", "product_id", "price", "freight_value"]]
    .assign(
        price=lambda items: pd.to_numeric(items["price"], errors="coerce"),
        freight_value=lambda items: pd.to_numeric(
            items["freight_value"], errors="coerce"
        ),
    )
)

#### 3.4 Data quality checks

In [144]:
# ------------------------------------------------------------------------------
# Data quality checks (assertions)
# Confirm business expectations before the order items table is exported
# ------------------------------------------------------------------------------

# Completeness: the numeric conversion must not have produced any NaN
assert not order_items_clean["price"].isna().any(), "Missing values detected in price"
assert not order_items_clean["freight_value"].isna().any(), "Missing values detected in freight_value"

# Validity: monetary amounts can be zero but never negative
assert order_items_clean["price"].ge(0).all(), "Negative price values detected"
assert order_items_clean["freight_value"].ge(0).all(), "Negative freight values detected"

#### 3.5 Export

**Output**
- `data_cleaned/order_items_clean.csv`
- `data_cleaned/order_items_clean.parquet`

In [145]:
# ------------------------------------------------------------------------------
# Export cleaned dataset
# Save the cleaned order items table for downstream analysis.
# Writes order_items_clean.csv and order_items_clean.parquet into
# export_clean's default output directory (DATA_CLEANED_DIR).
# ------------------------------------------------------------------------------
export_clean(order_items_clean, "order_items_clean")

# ------------------------------------------------------------------------------
# Sanity check report
# Quick validation of shape and sample rows (printed only, nothing is re-read)
# ------------------------------------------------------------------------------
report(order_items_clean, "order_items_clean")

✅ Saved: ..\data_cleaned\order_items_clean.csv and ..\data_cleaned\order_items_clean.parquet
✅ order_items_clean.csv saved!
Shape: (112650, 4)
                           order_id                        product_id   price  \
0  00010242fe8c5a6d1ba2dd792cb16214  4244733e06e7ecb4970a6e2683c13e61   58.90   
1  00018f77f2f0320c557190d7a144bdd3  e5f2d52b802189ee658865ca93d83a8f  239.90   
2  000229ec398224ef6ca0657da4fc703e  c777355d18b72b67abbeef9df44fd0fd  199.00   
3  00024acbcdf0a6daa1e931b038114c75  7634da152a4610f1595efa32f14722fc   12.99   
4  00042b26cf59d7ce69dfabb4e55b4fd9  ac6c3623068f30de03045865e4e10089  199.90   

   freight_value  
0          13.29  
1          19.93  
2          17.87  
3          12.79  
4          18.14  


### 4. Products Table

#### 4.1 Load raw data




In [146]:
# ------------------------------------------------------------------------------
# Load products dataset
# Raw products table, read from the downloaded dataset folder
# ------------------------------------------------------------------------------
products_raw = pd.read_csv(Path(path) / "olist_products_dataset.csv")

#### 4.2 Profiling

- Inspected dataset structure and category distribution
- Identified product_category_name as the main analytical dimension
- Flagged descriptive and physical attributes as out of scope


In [147]:
# ------------------------------------------------------------------------------
# Initial data profiling
# High-level overview of the raw products table (structure and data quality)
# ------------------------------------------------------------------------------
data_profile(products_raw)

# ------------------------------------------------------------------------------
# Categorical exploration
# Cardinality of product_category_name plus a small sample of its values
# ------------------------------------------------------------------------------
category_values = products_raw["product_category_name"]
print(f"\nUnique categories: {category_values.nunique()}")

# First 15 distinct values, in order of appearance, for quick inspection
print(category_values.unique()[:15])

Data shape: (32951, 9)

Data columns: ['product_id', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']

Data types: product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object

Data description:        product_name_lenght  product_description_lenght  product_photos_qty  \
count         32341.000000                32341.000000        32341.000000   
mean             48.476949                  771.495285            2.188986   
std              10.245741                  635.115225            1.736766   
min               5.000000                    4.000000           

#### 4.3 Data Cleaning

- Retained product_id and product_category_name
- Replaced missing categories with "unknown"
- Standardized category values
- Applied data quality checks (primary key uniqueness, non-empty categories)


In [148]:
# ------------------------------------------------------------------------------
# Column selection + category normalization
# 1. Keep only the fields required for product-level and category analysis
# 2. Replace missing categories with "unknown" so that grouping and joins do
#    not silently drop products, then cast to str, trim whitespace and lowercase
# ------------------------------------------------------------------------------
products_clean = (
    products_raw
    .loc[:, ["product_id", "product_category_name"]]
    .assign(
        product_category_name=lambda products: (
            products["product_category_name"]
            .fillna("unknown")
            .astype(str)
            .str.strip()
            .str.lower()
        )
    )
)

#### 4.4 Data quality checks

In [149]:
# ------------------------------------------------------------------------------
# Data quality checks (assertions)
# Confirm the expectations below before the products table is exported
# ------------------------------------------------------------------------------

# Primary key: product_id must be present on every row and must never repeat
assert not products_clean["product_id"].isna().any(), "product_id contains missing values"
assert products_clean["product_id"].is_unique, "product_id is not unique"

# Category must be populated after the "unknown" fill
assert not products_clean["product_category_name"].isna().any(), \
    "product_category_name contains missing values after fillna"

# Stripping whitespace must not have left any empty category behind
assert products_clean["product_category_name"].str.len().gt(0).all(), \
    "Empty product_category_name values detected"



#### 4.5 Export

**Output**
- `data_cleaned/products_clean.csv`
- `data_cleaned/products_clean.parquet`

In [150]:
# ------------------------------------------------------------------------------
# Export cleaned dataset
# Save the cleaned products table for downstream analysis.
# Writes products_clean.csv and products_clean.parquet into export_clean's
# default output directory (DATA_CLEANED_DIR).
# ------------------------------------------------------------------------------
export_clean(products_clean, "products_clean")

# ------------------------------------------------------------------------------
# Sanity checks
# Quick validation of shape and sample rows (printed only, nothing is re-read)
# ------------------------------------------------------------------------------
report(products_clean, "products_clean")

✅ Saved: ..\data_cleaned\products_clean.csv and ..\data_cleaned\products_clean.parquet
✅ products_clean.csv saved!
Shape: (32951, 2)
                         product_id  product_category_name
0  1e9e8ef04dbcff4541ed26657ea517e5             perfumaria
1  3aa071139cb16b67ca9e5dea641aaa2f                  artes
2  96bd76ec8810374ed1b65e291975717f          esporte_lazer
3  cef67bcfe19066a932b7673e239eb23d                  bebes
4  9dc1a7de274444849c219cff195d0b71  utilidades_domesticas


### 5. Product Category Translation Table

#### 5.1 Load raw data

In [151]:
# ------------------------------------------------------------------------------
# Load category translation table
# Lookup table mapping product category names (Portuguese → English),
# read from the downloaded dataset folder
# ------------------------------------------------------------------------------
translation_raw = pd.read_csv(Path(path) / "product_category_name_translation.csv")

#### 5.2 Profiling

- Reviewed lookup table structure
- Confirmed absence of missing values
- Identified the table as a Portuguese-to-English category mapping

In [152]:
# ------------------------------------------------------------------------------
# Initial data profiling
# Generate a high-level overview to validate structure and data quality:
# shape, columns, dtypes, describe(), missing values and duplicated rows
# ------------------------------------------------------------------------------
data_profile(translation_raw)


Data shape: (71, 2)

Data columns: ['product_category_name', 'product_category_name_english']

Data types: product_category_name            object
product_category_name_english    object
dtype: object

Data description:        product_category_name product_category_name_english
count                     71                            71
unique                    71                            71
top             beleza_saude                 health_beauty
freq                       1                             1

Data missing values: product_category_name            0
product_category_name_english    0
dtype: int64

Data duplicates: 0


#### 5.3 Data Cleaning

- Renamed columns for clarity
- Standardized text fields
- Applied data quality checks (unique mapping, non-null values)


In [153]:
# ------------------------------------------------------------------------------
# Column renaming
# Rename columns to improve clarity and semantic meaning
# ------------------------------------------------------------------------------
translation_clean = translation_raw.rename(columns={
    "product_category_name": "category_portuguese",
    "product_category_name_english": "category_english"
}).copy()

# ------------------------------------------------------------------------------
# String normalization
# Standardize text fields to reduce join mismatches (spaces/case).
# Missing values are deliberately kept as missing: a bare .astype(str) would
# turn NaN into the literal string "nan", and any later notna() check on these
# columns would then pass no matter what the source file contained.
# ------------------------------------------------------------------------------
for text_col in ("category_portuguese", "category_english"):
    raw_values = translation_clean[text_col]
    translation_clean[text_col] = (
        raw_values
        .astype(str)
        .str.strip()
        .str.lower()
        # Keep the normalized text only where the source value was present
        .where(raw_values.notna())
    )

#### 5.4  Data quality checks

In [154]:
# ------------------------------------------------------------------------------
# Data quality checks (assertions)
# Confirm the integrity of the lookup table before it is exported
# ------------------------------------------------------------------------------

# A translation lookup is expected to have no missing values on either side
assert not translation_clean["category_portuguese"].isna().any(), \
    "category_portuguese contains missing values"
assert not translation_clean["category_english"].isna().any(), \
    "category_english contains missing values"

# Stripping whitespace must not have left any empty label behind
assert translation_clean["category_portuguese"].str.len().gt(0).all(), \
    "Empty category_portuguese values detected"
assert translation_clean["category_english"].str.len().gt(0).all(), \
    "Empty category_english values detected"

# Key uniqueness: each Portuguese category may appear only once
# (uniqueness of the English side is not checked here)
assert translation_clean["category_portuguese"].is_unique, \
    "Duplicate category_portuguese detected (mapping should be 1-to-1)"

# Optional: no (PT, EN) pair may be repeated
assert not translation_clean.duplicated(
    subset=["category_portuguese", "category_english"]
).any(), "Duplicate translation pairs detected"

#### 5.5 Export

**Output**
- `data_cleaned/translation_clean.csv`
- `data_cleaned/translation_clean.parquet`

In [155]:
# ------------------------------------------------------------------------------
# Export cleaned translation table
# Save the cleaned lookup table as translation_clean.csv and
# translation_clean.parquet in export_clean's default output directory
# (DATA_CLEANED_DIR)
# ------------------------------------------------------------------------------
export_clean(translation_clean, "translation_clean")

# ------------------------------------------------------------------------------
# Sanity checks
# Quick validation of shape and sample rows (printed only, nothing is re-read)
# ------------------------------------------------------------------------------
report(translation_clean, "translation_clean")

✅ Saved: ..\data_cleaned\translation_clean.csv and ..\data_cleaned\translation_clean.parquet
✅ translation_clean.csv saved!
Shape: (71, 2)
      category_portuguese       category_english
0            beleza_saude          health_beauty
1  informatica_acessorios  computers_accessories
2              automotivo                   auto
3         cama_mesa_banho         bed_bath_table
4        moveis_decoracao        furniture_decor


### 6. Reviews Table

#### 6.1 Load raw data

In [156]:
# Read the raw order reviews table from the downloaded dataset folder
reviews_raw = pd.read_csv(Path(path) / "olist_order_reviews_dataset.csv")


#### 6.2 Profiling

- Reviewed dataset structure, columns, and data types
- Identified `order_id` as the join key with the orders table
- Analyzed the distribution of `review_score` to assess customer satisfaction patterns
- Verified that review scores follow a discrete rating scale

In [157]:
# High-level overview of the raw reviews table
# (shape, columns, data types, describe(), missing values, duplicates)
data_profile(reviews_raw)

# Number of reviews per score, most frequent score first
print("\nReview score distribution:")
score_counts = reviews_raw["review_score"].value_counts()
print(score_counts)

Data shape: (99224, 7)

Data columns: ['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message', 'review_creation_date', 'review_answer_timestamp']

Data types: review_id                  object
order_id                   object
review_score                int64
review_comment_title       object
review_comment_message     object
review_creation_date       object
review_answer_timestamp    object
dtype: object

Data description:        review_score
count  99224.000000
mean       4.086421
std        1.347579
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000

Data missing values: review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

Data duplicates: 0

Review score distribution:
review_score
5    57328
4    19142
1    1

#### 6.3 Data Cleaning

- Selected only the columns required for satisfaction analysis:
  - `order_id`
  - `review_score`
- Kept `review_score` as loaded (already numeric); its dtype and range are verified in section 6.4


In [158]:
# Keep only the join key (order_id) and the satisfaction metric (review_score);
# .copy() detaches the result from reviews_raw
reviews_clean = reviews_raw.loc[:, ["order_id", "review_score"]].copy()
    

#### 6.4 Data quality checks

- Applied data quality checks:
  - non-null `order_id`
  - non-null `review_score`
  - review scores constrained to the expected range (1–5)

In [159]:
# ------------------------------------------------------------------------------
# Data quality checks (assertions)
# Confirm the expectations below before the reviews table is exported
# ------------------------------------------------------------------------------

# order_id is the join key to the orders table and must always be present
assert not reviews_clean["order_id"].isna().any(), \
    "order_id contains missing values in reviews table"

# Every review must carry a score
assert not reviews_clean["review_score"].isna().any(), \
    "review_score contains missing values"

# The score column must have a numeric dtype
assert pd.api.types.is_numeric_dtype(reviews_clean["review_score"]), \
    "review_score is not numeric"

# Scores must fall on the 1-to-5 rating scale (both bounds included)
assert (
    reviews_clean["review_score"].ge(1) & reviews_clean["review_score"].le(5)
).all(), \
    "review_score outside expected range (1–5)"


#### 6.5 Export

**Output**
- `data_cleaned/reviews_clean.csv`
- `data_cleaned/reviews_clean.parquet`

In [160]:
# ------------------------------------------------------------------------------
# Export cleaned reviews table
# Save the cleaned reviews table as reviews_clean.csv and reviews_clean.parquet
# in export_clean's default output directory (DATA_CLEANED_DIR)
# ------------------------------------------------------------------------------
export_clean(reviews_clean, "reviews_clean")

# ------------------------------------------------------------------------------
# Sanity checks
# Quick validation of shape and sample rows (printed only, nothing is re-read)
# ------------------------------------------------------------------------------
report(reviews_clean, "reviews_clean")

✅ Saved: ..\data_cleaned\reviews_clean.csv and ..\data_cleaned\reviews_clean.parquet
✅ reviews_clean.csv saved!
Shape: (99224, 2)
                           order_id  review_score
0  73fc7af87114b39712e6da79b0a377eb             4
1  a548910a1c6147796b98fdf73dbeba33             5
2  f9e4b658b201a9f2ecdecbb34bed034b             5
3  658677c97b385a9be170737859d3511b             5
4  8e6bfb81e283fa7e4f11123a3fb894f1             5
