In [1]:
pip install scikit-learn

Active code page: 1252Note: you may need to restart the kernel to use updated packages.



In [4]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score
)

In [5]:
DATA_PATH = "../data/raw/"

orders = pd.read_csv(DATA_PATH + "olist_orders_dataset.csv")
reviews = pd.read_csv(DATA_PATH + "olist_order_reviews_dataset.csv")
order_items = pd.read_csv(DATA_PATH + "olist_order_items_dataset.csv")

df = orders.merge(reviews, on="order_id", how="inner")
df["low_rating"] = (df["review_score"] <= 2).astype(int)

# Delivery delay
orders["order_purchase_timestamp"] = pd.to_datetime(orders["order_purchase_timestamp"])
orders["order_delivered_customer_date"] = pd.to_datetime(orders["order_delivered_customer_date"])
orders["order_estimated_delivery_date"] = pd.to_datetime(orders["order_estimated_delivery_date"])

orders["delivery_delay_days"] = (
    orders["order_delivered_customer_date"] -
    orders["order_estimated_delivery_date"]
).dt.days

df = df.merge(
    orders[["order_id", "delivery_delay_days"]],
    on="order_id",
    how="left"
)

df["delivery_delay_days"] = df["delivery_delay_days"].clip(-60, 30)

# Monetary
price_df = (
    order_items.groupby("order_id")[["price", "freight_value"]]
    .sum()
    .reset_index()
)

df = df.merge(price_df, on="order_id", how="left")

# Chronological split
df["order_purchase_timestamp"] = pd.to_datetime(df["order_purchase_timestamp"])
df = df.sort_values("order_purchase_timestamp")

split_index = int(len(df) * 0.8)

train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

# Seller risk
order_seller = order_items[["order_id", "seller_id"]].drop_duplicates()
train_df = train_df.merge(order_seller, on="order_id", how="left")
test_df = test_df.merge(order_seller, on="order_id", how="left")

seller_risk = (
    train_df.groupby("seller_id")["low_rating"]
    .mean()
    .reset_index()
)

seller_risk.columns = ["seller_id", "seller_historical_risk"]

train_df = train_df.merge(seller_risk, on="seller_id", how="left")
test_df = test_df.merge(seller_risk, on="seller_id", how="left")

global_mean = train_df["low_rating"].mean()

train_df["seller_historical_risk"] = train_df["seller_historical_risk"].fillna(global_mean)
test_df["seller_historical_risk"] = test_df["seller_historical_risk"].fillna(global_mean)

# Final features
feature_cols = [
    "delivery_delay_days",
    "price",
    "freight_value",
    "seller_historical_risk"
]

X_train = train_df[feature_cols]
y_train = train_df["low_rating"]

X_test = test_df[feature_cols]
y_test = test_df["low_rating"]

print("Data ready:", X_train.shape, X_test.shape)

Data ready: (80372, 4) (20196, 4)


In [7]:
print(X_train.isna().sum())
print(X_test.isna().sum())

delivery_delay_days       2490
price                      656
freight_value              656
seller_historical_risk       0
dtype: int64
delivery_delay_days       378
price                     103
freight_value             103
seller_historical_risk      0
dtype: int64


In [9]:
# -----------------------------------
# Final Feature Matrix (SAFE COPY)
# -----------------------------------

feature_cols = [
    "delivery_delay_days",
    "price",
    "freight_value",
    "seller_historical_risk"
]

X_train = train_df[feature_cols].copy()
X_test = test_df[feature_cols].copy()

y_train = train_df["low_rating"]
y_test = test_df["low_rating"]

# -----------------------------------
# Impute Missing Values (TRAIN stats only)
# -----------------------------------

# Delivery delay
delay_median = X_train["delivery_delay_days"].median()
X_train["delivery_delay_days"] = X_train["delivery_delay_days"].fillna(delay_median)
X_test["delivery_delay_days"] = X_test["delivery_delay_days"].fillna(delay_median)

# Price
price_median = X_train["price"].median()
X_train["price"] = X_train["price"].fillna(price_median)
X_test["price"] = X_test["price"].fillna(price_median)

# Freight
freight_median = X_train["freight_value"].median()
X_train["freight_value"] = X_train["freight_value"].fillna(freight_median)
X_test["freight_value"] = X_test["freight_value"].fillna(freight_median)

# -----------------------------------
# Final Check
# -----------------------------------

print("Train missing total:", X_train.isna().sum().sum())
print("Test missing total:", X_test.isna().sum().sum())
print("Shapes:", X_train.shape, X_test.shape)

Train missing total: 0
Test missing total: 0
Shapes: (80372, 4) (20196, 4)


In [10]:
log_model = LogisticRegression(max_iter=1000)

log_model.fit(X_train, y_train)

log_pred = log_model.predict(X_test)
log_prob = log_model.predict_proba(X_test)[:, 1]

print("=== Logistic Regression ===")
print("ROC-AUC:", roc_auc_score(y_test, log_prob))
print("Recall:", recall_score(y_test, log_pred))
print("Precision:", precision_score(y_test, log_pred))
print("F1:", f1_score(y_test, log_pred))

=== Logistic Regression ===
ROC-AUC: 0.5846200913161361
Recall: 0.06577239290350498
Precision: 0.3392857142857143
F1: 0.11018484958318231


In [11]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    class_weight="balanced",
    random_state=42
)

rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
rf_prob = rf_model.predict_proba(X_test)[:, 1]

print("=== Random Forest ===")
print("ROC-AUC:", roc_auc_score(y_test, rf_prob))
print("Recall:", recall_score(y_test, rf_pred))
print("Precision:", precision_score(y_test, rf_pred))
print("F1:", f1_score(y_test, rf_pred))

=== Random Forest ===
ROC-AUC: 0.6471283249018593
Recall: 0.4565123323236694
Precision: 0.21057884231536927
F1: 0.2882119928971452


In [12]:
import numpy as np

threshold = 0.35
rf_custom_pred = (rf_prob >= threshold).astype(int)

print("=== Random Forest (Threshold 0.35) ===")
print("Recall:", recall_score(y_test, rf_custom_pred))
print("Precision:", precision_score(y_test, rf_custom_pred))
print("F1:", f1_score(y_test, rf_custom_pred))

=== Random Forest (Threshold 0.35) ===
Recall: 0.694937256598875
Precision: 0.14786852039407053
F1: 0.24385059216519892


Model Evaluation Summary

The Logistic Regression baseline showed limited performance (ROC-AUC ≈ 0.58) and very low recall (~6%), making it unsuitable for business deployment.
The Random Forest model significantly improved performance:
ROC-AUC ≈ 0.65
Recall ≈ 46%
Precision ≈ 21%
F1 ≈ 0.29

Lowering the classification threshold to 0.35 increased recall to ~69% but reduced precision substantially (~15%), leading to a lower overall F1 score.
Given the trade-off between recall and operational noise, the default threshold (0.5) provides a more balanced and practical deployment setting.
The Random Forest model was selected as the final model due to its improved discrimination and better recall of low-rating orders.