<a href="https://colab.research.google.com/github/SohailVibeCoder/Olist-ML-project/blob/main/model_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data cleaning, aggregation and feature engineering

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from google.colab import drive
import os
import warnings

warnings.filterwarnings('ignore')

# ================================================================
# 1. DATA LOADING — Mount Drive and Import All Olist Raw Tables
#    (No transformation here, just getting all source data in.)
# ================================================================
drive.mount('/content/drive')
# Assuming your Olist files are in your Drive root folder
os.chdir("/content/drive/MyDrive/")

# Load all required original datasets (using variables defined by the user previously)
orders_df = pd.read_csv("olist_orders_dataset.csv")
order_reviews_df = pd.read_csv("olist_order_reviews_dataset.csv")
order_payments_df = pd.read_csv("olist_order_payments_dataset.csv")
customers_df = pd.read_csv("olist_customers_dataset.csv")
order_items_df = pd.read_csv("olist_order_items_dataset.csv")
products_df = pd.read_csv("olist_products_dataset.csv")
sellers_df = pd.read_csv("olist_sellers_dataset.csv")
translation_df = pd.read_csv("product_category_name_translation.csv")
geolocation_df = pd.read_csv("olist_geolocation_dataset.csv")

# =================================================================
# 2. ORDER-LEVEL AGGREGATION TO PREVENT DUPLICATION
# -----------------------------------------------------------------
# The raw order_items table has one row per (order, product).
# For modelling we want ONE ROW PER ORDER, so we:
#   - enrich items with product, translation and seller metadata,
#   - aggregate item-level metrics (counts, sums, medians, modes)
#     back to order_id level,
#   - aggregate payments to order_id level as well.
# This gives us clean order-level features like total_items,
# total_product_price, seller_count, etc., without duplicate labels.
# =================================================================

# 1. Prepare Item Data by joining with Product, Translation, AND SELLERS tables
items_enriched = order_items_df.merge(products_df, on='product_id', how='left')
items_enriched = items_enriched.merge(translation_df, on='product_category_name', how='left')
# Merge sellers_df to get seller_zip_code_prefix and other seller attributes
items_enriched = items_enriched.merge(sellers_df, on='seller_id', how='left')

# Ensure shipping_limit_date is datetime before aggregation
items_enriched['shipping_limit_date'] = pd.to_datetime(items_enriched['shipping_limit_date'], errors='coerce')

# 2. Aggregate the item details back to a single row per Order ID
items_agg = items_enriched.groupby('order_id').agg(
    total_items=('order_item_id', 'count'),
    total_freight=('freight_value', 'sum'),
    total_product_price=('price', 'sum'),
    seller_count=('seller_id', 'nunique'),
    main_product_category=('product_category_name_english', lambda x: x.mode()[0] if not x.mode().empty else 'unknown'),
    # Use median for product physical features
    product_name_lenght=('product_name_lenght', 'median'),
    product_description_lenght=('product_description_lenght', 'median'),
    product_photos_qty=('product_photos_qty', 'median'),
    product_weight_g=('product_weight_g', 'median'),
    product_length_cm=('product_length_cm', 'median'),
    product_height_cm=('product_height_cm', 'median'),
    product_width_cm=('product_width_cm', 'median'),
    # Add shipping_limit_date
    shipping_limit_date=('shipping_limit_date', 'max'),
    # This column now exists due to the merge above:
    seller_zip_code_prefix=('seller_zip_code_prefix', lambda x: x.mode()[0] if not x.mode().empty else np.nan)
).reset_index()

# Aggregate payments to order-level financial features
pay_agg = order_payments_df.groupby('order_id').agg(
    payment_sequential=('payment_sequential', 'count'), # Count of payment types used
    payment_installments=('payment_installments', 'max'), # Max installments used
    payment_value=('payment_value', 'sum') # Total payment value
).reset_index()

# --- 4. PERFORM NON-DUPLICATING MERGE and CLEANING ---

full_df = (
    orders_df
    .merge(order_reviews_df, on='order_id', how='left')
    .merge(pay_agg, on='order_id', how='left') # merge aggregated payments
    .merge(customers_df, on='customer_id', how='left')
    .merge(items_agg, on='order_id', how='left')
)

print(f"Initial raw DataFrame row count (No Duplication): {len(full_df)}")

# Filter Delivered, Convert Dates, Drop Core Nulls
full_df = full_df[full_df['order_status'] == 'delivered'].copy()
date_cols = [
    'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
    'order_delivered_customer_date', 'order_estimated_delivery_date', 'review_creation_date',
    'review_answer_timestamp' # Note: 'shipping_limit_date' is already a datetime from the aggregation step above
]
for col in date_cols:
    full_df[col] = pd.to_datetime(full_df[col], errors='coerce')

# Now we work with aggregated features instead of item-level ones
full_df.dropna(subset=['total_product_price', 'total_freight', 'review_score', 'main_product_category'], inplace=True)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    full_df['review_comment_title'].fillna('', inplace=True)
    full_df['review_comment_message'].fillna('', inplace=True)


# =================================================================
# 4. GEOLOCATION AGGREGATION AND MERGE
# -----------------------------------------------------------------
# The geolocation table is at ZIP prefix level. Here we:
#   - compute median latitude/longitude for each ZIP prefix,
#   - map those coords onto customers and sellers via their ZIP prefixes,
#   - drop orders where geo info is missing.
# This creates numeric spatial features that we will later turn into
# distance_km and geo-clusters.
# =================================================================

# Get median lat/lng for each zip code prefix
geo_median = geolocation_df.groupby('geolocation_zip_code_prefix').agg(
    geolocation_lat=('geolocation_lat', 'median'),
    geolocation_lng=('geolocation_lng', 'median')
).reset_index()

df = full_df.copy()

# Merge customer lat/lng
geo_customer = geo_median.rename(columns={'geolocation_zip_code_prefix': 'customer_zip_code_prefix', 'geolocation_lat': 'customer_lat', 'geolocation_lng': 'customer_lng'})
df = df.merge(geo_customer, on='customer_zip_code_prefix', how='left')

# Merge seller lat/lng
geo_seller = geo_median.rename(columns={'geolocation_zip_code_prefix': 'seller_zip_code_prefix', 'geolocation_lat': 'seller_lat', 'geolocation_lng': 'seller_lng'})
df = df.merge(geo_seller, on='seller_zip_code_prefix', how='left')

# Drop rows missing necessary geo-coordinates (necessary for downstream features)
df.dropna(subset=['customer_lat', 'customer_lng', 'seller_lat', 'seller_lng'], inplace=True)


# =================================================================
# 5. ADVANCED FEATURE ENGINEERING (Geo + RFM + Delivery + Volume)
# -----------------------------------------------------------------
# This section builds the main predictive signals:
#   - distance_km: great-circle distance between seller and customer
#   - customer_geo_cluster: K-Means cluster ID based on customer coords
#   - RFM metrics: Recency, Frequency, Monetary value per customer
#   - delivery_performance_days & actual_delivery_days:
#         how fast and how early/late an order was delivered
#   - product_volume_cm3: rough size proxy for logistics complexity
# These are domain-informed features that encode behaviour,
# geography, and logistics performance into numeric variables.
# =================================================================

# --- 5. Haversine Distance & K-Means Clustering ---
# Haversine Distance Function
def haversine(lat1, lon1, lat2, lon2):
    # Great-circle distance between two coordinate pairs (in km)
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c

df['distance_km'] = haversine(
    df['customer_lat'], df['customer_lng'],
    df['seller_lat'], df['seller_lng']
)

# K-Means Clustering (K=10) on customer geo-coordinates
geo_coords = df[['customer_lat', 'customer_lng']].values
scaler = StandardScaler()
geo_coords_scaled = scaler.fit_transform(geo_coords)
K = 10
# Ignore the KMeans warnings about memory management
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    kmeans = KMeans(n_clusters=K, random_state=42, n_init='auto', max_iter=300, verbose=0)
    df['customer_geo_cluster'] = kmeans.fit_predict(geo_coords_scaled)
df['customer_geo_cluster'] = df['customer_geo_cluster'].astype(object)

# --- 6. Create RFM, Delivery, and Volume Features (as in your original code) ---
# A. RFM Features: Recency, Frequency, Monetary at customer level
snapshot_date = df['order_purchase_timestamp'].max() + pd.Timedelta(days=1)
rfm_df = df.groupby('customer_unique_id').agg(
    Recency=('order_purchase_timestamp', lambda x: (snapshot_date - x.max()).days),
    Frequency=('order_id', 'nunique'),
    Monetary=('payment_value', 'sum')
).reset_index()
df = df.merge(rfm_df, on='customer_unique_id', how='left')

# B. Delivery Metrics: actual vs estimated timing
df['delivery_time_delta'] = df['order_delivered_customer_date'] - df['order_purchase_timestamp']
df['actual_delivery_days'] = df['delivery_time_delta'].dt.days.fillna(df['delivery_time_delta'].dt.days.median())
df['delivery_performance_days'] = (
    df['order_estimated_delivery_date'] - df['order_delivered_customer_date']
).dt.days.fillna(0)

# C. Product Volume (Uses aggregated product physical metrics)
df['product_volume_cm3'] = (df['product_length_cm'] * df['product_height_cm'] * df['product_width_cm'])
volume_median = df['product_volume_cm3'].median()
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df['product_volume_cm3'].fillna(volume_median, inplace=True)


# =================================================================
# 6. ONE-HOT ENCODING AND PRODUCT CATEGORY AGGREGATION
# -----------------------------------------------------------------
# main_product_category is high-cardinality and very granular.
# Here we:
#   - one-hot encode each detailed category,
#   - group them into broader thematic buckets (e.g. "Home_Kitchen_Comfort"),
#   - drop the original detailed dummies.
# This reduces dimensionality while preserving useful category signal.
# =================================================================

# Note: Since we are using the aggregated 'main_product_category', we don't need the full loop.
# We will use the same product category map for aggregation as before.
product_prefix = 'main_product_category_'
df_encoded = pd.get_dummies(df, columns=['main_product_category'], prefix='main_product_category', drop_first=False)

columns_to_drop_after_agg = []
category_map = {
    'Home_Kitchen_Comfort': ['air_conditioning', 'home_appliances', 'home_appliances_2', 'home_comfort_2', 'home_confort', 'housewares', 'kitchen_dining_laundry_garden_furniture', 'la_cuisine', 'small_appliances', 'small_appliances_home_oven_and_coffee'],
    'Fashion_Apparel': ['fashio_female_clothing', 'fashion_male_clothing', 'fashion_childrens_clothes', 'fashion_underwear_beach', 'fashion_shoes', 'fashion_sport', 'fashion_bags_accessories', 'luggage_accessories', 'watches_gifts'],
    'Electronics_Technology': ['audio', 'cine_photo', 'computers', 'computers_accessories', 'consoles_games', 'electronics', 'fixed_telephony', 'music', 'musical_instruments', 'tablets_printing_image', 'telephony'],
    'Tools_Construction': ['construction_tools_construction', 'construction_tools_lights', 'construction_tools_safety', 'costruction_tools_garden', 'costruction_tools_tools', 'garden_tools', 'home_construction'],
    'Furniture_Decor': ['bed_bath_table', 'furniture_bedroom', 'furniture_decor', 'furniture_living_room', 'furniture_mattress_and_upholstery', 'office_furniture'],
    'Books_Media_Toys': ['books_general_interest', 'books_imported', 'books_technical', 'cds_dvds_musicals', 'dvds_blu_ray', 'toys', 'cool_stuff'],
    'Health_Personal_Care': ['health_beauty', 'perfumery', 'baby', 'diapers_and_hygiene'],
    'Art_Hobbies': ['art', 'arts_and_craftmanship', 'stationery', 'party_supplies', 'christmas_supplies'],
    'Food_Drink': ['drinks', 'food', 'food_drink'],
    'Automotive': ['auto'],
    'Other_Services': ['flowers', 'industry_commerce_and_business', 'market_place', 'pet_shop', 'security_and_services', 'signaling_and_security']
}

for new_category, detailed_categories in category_map.items():
    original_cols = [product_prefix + cat for cat in detailed_categories]
    existing_cols = [col for col in original_cols if col in df_encoded.columns]
    if existing_cols:
        df_encoded[new_category] = df_encoded[existing_cols].any(axis=1).astype(int)
        columns_to_drop_after_agg.extend(existing_cols)

sports_leisure_col = product_prefix + 'sports_leisure'
if sports_leisure_col in df_encoded.columns:
    df_encoded.rename(columns={sports_leisure_col: 'Sports_Leisure'}, inplace=True)

df_encoded = df_encoded.drop(columns=columns_to_drop_after_agg, errors='ignore')
df_encoded = df_encoded.drop(columns=[col for col in df_encoded.columns if col.startswith('main_product_category_') and col not in category_map], errors='ignore')


# =================================================================
# 7. FINAL CLEAN-UP AND TARGET VARIABLE CREATION
# -----------------------------------------------------------------
# At this stage we:
#   - drop identifiers, raw text, and low-value intermediate columns,
#   - keep only engineered, numeric/categorical model features,
#   - create review_score_binary (1 = rating ≥4, 0 = rating ≤3),
#   - save the final ML-ready table to CSV.
# This dataset (V3) is the single source used by the XGBoost model.
# =================================================================

# --- 8. Final Drop of Irrelevant or Redundant Columns ---
columns_to_drop_final = [
    'order_id', 'customer_id', 'review_id', 'customer_unique_id',
    'order_status', 'review_comment_title', 'review_comment_message', 'customer_city', 'seller_city',
    'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
    'order_delivered_customer_date', 'order_estimated_delivery_date', 'review_creation_date',
    'review_answer_timestamp',
    'shipping_limit_date', # this is now present and can be dropped
    'product_length_cm', 'product_height_cm', 'product_width_cm', 'customer_zip_code_prefix', 'seller_zip_code_prefix',
    'delivery_time_delta',
    # Drop raw coordinates (replaced by distance_km and clusters)
    'customer_lat', 'customer_lng', 'seller_lat', 'seller_lng',
    # Drop state columns (replaced by geo clustering)
    'customer_state', 'seller_state',
    'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g'
]

# FINAL step: create target variable and drop the raw score
final_ml_df = df_encoded.copy()
final_ml_df['review_score_binary'] = (final_ml_df['review_score'] >= 4).astype(int)

final_ml_df = final_ml_df.drop(columns=['review_score'] + columns_to_drop_final, errors='ignore')

final_ml_df.to_csv("ml_ready_feature_table_V3.csv", index=False)

print(f"\n✅ Feature engineering complete. Final corrected dataset saved as 'ml_ready_feature_table_V3.csv'. Shape: {final_ml_df.shape}")


Mounted at /content/drive
Initial raw DataFrame row count (No Duplication): 99992

✅ Feature engineering complete. Final corrected dataset saved as 'ml_ready_feature_table_V3.csv'. Shape: (95879, 28)


XGBoost Classifier

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# =================================================================
# SECTION 1: DATA LOADING AND SETUP
# -----------------------------------------------------------------
# Goal:
#   • Load the final feature-engineered dataset (V3) where
#       - one row = one order
#       - all features (RFM, delivery, geo, categories, etc.) are ready.
#   • Convert object columns to 'category' so XGBoost can handle them.
#   • Split into train/test sets with the same class balance.
#   • Compute class weights (scale_pos_weight) to handle imbalance
#     between positive and negative reviews during training.
# =================================================================

# Load the final, advanced feature dataset
df_ml = pd.read_csv("ml_ready_feature_table_V3.csv")
print("dataset loaded. Shape:", df_ml.shape)

# Convert object columns to 'category' for XGBoost's categorical support
for col in df_ml.select_dtypes(include='object').columns:
    df_ml[col] = df_ml[col].astype('category')

# Target variable: 1 = Good (≥4), 0 = Bad (<4)
y_class = df_ml['review_score_binary']
X = df_ml.drop(columns=['review_score_binary'])

# Train-test split with stratification to preserve class proportions
X_train, X_test, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Evaluation set for XGBoost's internal tracking (train + test)
eval_set = [(X_train, y_train_class), (X_test, y_test_class)]

# Compute class weight: how many good vs bad reviews in training
count_class_1 = y_train_class.value_counts()[1]
count_class_0 = y_train_class.value_counts()[0]
scale_pos_weight = count_class_1 / count_class_0
print(f"Calculated scale_pos_weight: {scale_pos_weight:.2f}")


# =================================================================
# SECTION 2: MODEL TRAINING (XGBoost Classifier)
# -----------------------------------------------------------------
# Goal:
#   • Train a cost-sensitive XGBoost model that:
#       - handles class imbalance via scale_pos_weight
#       - uses logloss to optimise probability predictions
#       - works with categorical features directly.
#   • The model learns P(review is good) and P(review is bad) for
#     each order, which we will later convert into class labels using
#     a custom decision threshold.
# =================================================================

xgb_final_model = XGBClassifier(
    n_estimators=300,         # number of trees
    max_depth=5,             # tree depth (controls complexity)
    learning_rate=0.1,       # step size shrinkage
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,  # handle imbalance
    eval_metric="logloss",   # optimise log-loss (probability quality)
    enable_categorical=True  # allow categorical dtype in features
)

print("\nTraining XGBoost Model...")
xgb_final_model.fit(
    X_train,
    y_train_class,
    eval_set=eval_set,
    verbose=False
)
print("Model training complete.")


# =================================================================
# SECTION 3: THRESHOLD OPTIMISATION (F1 for BAD class)
# -----------------------------------------------------------------
# XGBoost gives probabilities, not just hard 0/1 labels.
# By default, you would predict:
#    class 1 if P(class 1) >= 0.5, else class 0.
#
# But here:
#   • We care specifically about detecting BAD reviews (class 0),
#     because they are more damaging.
#   • The data is imbalanced, so 0.5 is not necessarily the best cut-off.
#
# Strategy:
#   1. Use predict_proba() to get P(class=0) for each test sample.
#   2. Sweep through thresholds from 0.50 down to 0.01.
#   3. For each threshold:
#        - If P(class 0) > threshold → predict 0 (bad)
#          else → predict 1 (good)
#        - Compute F1-score for the BAD class only (pos_label=0).
#   4. Pick the threshold that maximises F1 for bad reviews.
#
# Business meaning:
#   We tune the decision boundary so the model is as effective as
#   possible at catching likely negative reviewers, balancing precision
#   and recall for that group.
# =================================================================

y_proba_test = xgb_final_model.predict_proba(X_test)
# Column 0 = probability of class 0 (bad review)
y_proba_minority_test = y_proba_test[:, 0]

thresholds = np.linspace(0.50, 0.01, 50)
best_f1, best_threshold = 0, 0

for threshold in thresholds:
    # Predict class 0 if P(bad) > threshold, else class 1 (good)
    y_pred_temp = np.where(y_proba_minority_test > threshold, 0, 1)

    # F1 for the BAD class (treat 0 as the "positive" class here)
    f1 = f1_score(y_test_class, y_pred_temp, pos_label=0)

    # Keep track of the best F1 and its threshold
    if f1 > best_f1:
        best_f1, best_threshold = f1, threshold

print(f"\nOptimal threshold (MAX F1-SCORE for bad class): {best_threshold:.3f} "
      f"(F1 for bad class = {best_f1:.3f})")


# =================================================================
# SECTION 4: FINAL EVAL – TEST + TRAIN (same threshold)
# -----------------------------------------------------------------
# Goal:
#   • Evaluate how the chosen threshold performs on:
#       - the TEST set (generalisation)
#       - the TRAINING set (fit/overfit comparison)
#   • We:
#       - generate predictions using the optimised threshold,
#       - print precision/recall/F1 for both classes,
#       - show the confusion matrices,
#       - compare macro F1 between train and test.
#
# If train >> test → likely overfitting.
# If train ≈ test → model generalises reasonably well.
# =================================================================

# ---------- TEST SET ----------
y_pred_test = np.where(y_proba_minority_test > best_threshold, 0, 1)

print("\n--- TEST SET PERFORMANCE (F1-OPTIMIZED) ---")
print(classification_report(y_test_class, y_pred_test, target_names=['0 (Bad)', '1 (Good)']))

cm_test = confusion_matrix(y_test_class, y_pred_test)
cm_test_df = pd.DataFrame(
    cm_test,
    index=['Actual Bad (0)', 'Actual Good (1)'],
    columns=['Predicted Bad (0)', 'Predicted Good (1)']
)

print("\nConfusion Matrix (Test Set):")
print(cm_test_df)

macro_f1_test = f1_score(y_test_class, y_pred_test, average='macro')
print(f"\nMacro F1 (Test): {macro_f1_test:.3f}")

# ---------- TRAIN SET ----------
y_proba_train = xgb_final_model.predict_proba(X_train)
y_proba_minority_train = y_proba_train[:, 0]
y_pred_train = np.where(y_proba_minority_train > best_threshold, 0, 1)

print("\n--- TRAINING SET PERFORMANCE (F1-OPTIMIZED, SAME THRESHOLD) ---")
print(classification_report(y_train_class, y_pred_train, target_names=['0 (Bad)', '1 (Good)']))

cm_train = confusion_matrix(y_train_class, y_pred_train)
cm_train_df = pd.DataFrame(
    cm_train,
    index=['Actual Bad (0)', 'Actual Good (1)'],
    columns=['Predicted Bad (0)', 'Predicted Good (1)']
)

print("\nConfusion Matrix (Train Set):")
print(cm_train_df)

macro_f1_train = f1_score(y_train_class, y_pred_train, average='macro')
print(f"\nMacro F1 (Train): {macro_f1_train:.3f}")


dataset loaded. Shape: (95879, 28)
Calculated scale_pos_weight: 3.74

Training XGBoost Model...
Model training complete.

Optimal threshold (MAX F1-SCORE for bad class): 0.080 (F1 for bad class = 0.453)

--- TEST SET PERFORMANCE (F1-OPTIMIZED) ---
              precision    recall  f1-score   support

     0 (Bad)       0.46      0.45      0.45      4042
    1 (Good)       0.85      0.86      0.86     15134

    accuracy                           0.77     19176
   macro avg       0.65      0.65      0.65     19176
weighted avg       0.77      0.77      0.77     19176


Confusion Matrix (Test Set):
                 Predicted Bad (0)  Predicted Good (1)
Actual Bad (0)                1816                2226
Actual Good (1)               2166               12968

Macro F1 (Test): 0.654

--- TRAINING SET PERFORMANCE (F1-OPTIMIZED, SAME THRESHOLD) ---
              precision    recall  f1-score   support

     0 (Bad)       0.53      0.53      0.53     16166
    1 (Good)       0.87      0.8

Pearsons coefficient correlation (absolute correlation for dissatisfaction)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- 1. Load the dataset ---
df_original = pd.read_csv("ml_ready_feature_table_V3.csv")

# --- 2. Define the binary satisfaction target ---
TARGET_COLUMN = 'review_score_binary'

# Remove the target score column from the feature correlation calculation.
df_corr = df_original.drop(columns=[TARGET_COLUMN], errors='ignore')

# Clean non-numeric columns for correlation calculation
for col in df_corr.columns:
    if df_corr[col].dtype == 'object' or df_corr[col].dtype.name == 'category':
        df_corr = df_corr.drop(columns=[col])
    elif df_corr[col].dtype == 'bool':
        df_corr[col] = df_corr[col].astype(int)

df_corr.fillna(0, inplace=True)
df_corr[TARGET_COLUMN] = df_original[TARGET_COLUMN]


# --- 3. Pearson correlations ---
corr_matrix = df_corr.corr()
target_corr = corr_matrix[TARGET_COLUMN].drop(TARGET_COLUMN)

# --- 4. Select only negative features (dissatisfaction drivers) ---
negative_corr = target_corr[target_corr < 0]
top5_negative = negative_corr.sort_values(ascending=True).head(5)


# --- 5. Apply Business-Friendly Naming and Prepare for Plotting ---
name_map = {
    'actual_delivery_days': 'Delivery: Total Shipping Days',
    'total_freight': 'Logistics: Total Freight Value',
    'seller_count': 'Seller Count',
    'total_items': 'Total Items',
    'Furniture_Decor': 'Product: Furniture/Decor',
    'distance_km': 'Logistics: Customer-Seller Distance (KM)',
    'payment_value': 'Total Payment Value',
    'payment_installments': 'Payment Installments',
}

# Apply mapping and keep the data sorted by magnitude (strongest influence first)
top5_plot = top5_negative.abs().sort_values(ascending=False)
top5_plot.index = top5_plot.index.map(lambda x: name_map.get(x, x))


# --- 6. Plot: clean, blue gradient, horizontal bar chart for readability ---
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 6))

colors = sns.color_palette("Blues_d", n_colors=len(top5_plot))

# Generate horizontal bar plot (better for long labels)
plt.barh(
    y=top5_plot.index,
    width=top5_plot.values,
    color=colors
)

# Reverse the order so the largest bar is at the top (standard visualization practice)
plt.gca().invert_yaxis()

plt.title(
    "Top 5 Drivers of Customer Dissatisfaction",
    fontsize=16,
    fontweight="bold",
    pad=15
)

plt.xlabel("Absolute Pearson Correlation Magnitude", fontsize=12)
plt.ylabel("Feature", fontsize=12)

plt.grid(axis='x', linestyle='--', alpha=0.4)
sns.despine(left=True, bottom=True)

plt.tight_layout()
plt.savefig("top_5_negative_pearson_V3_business_friendly.png", dpi=300)
plt.close()

print("\nSaved as 'top_5_negative_pearson_V3_business_friendly.png'")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the dataset ---
df_original = pd.read_csv("ml_ready_feature_table_V3.csv")

# --- 2. Define the binary satisfaction target ---
TARGET_COLUMN = 'review_score_binary'

# Remove the target score column from the feature correlation calculation.
df_corr = df_original.drop(columns=[TARGET_COLUMN], errors='ignore')

# Clean non-numeric columns for correlation calculation
for col in df_corr.columns:
    if df_corr[col].dtype == 'object' or df_corr[col].dtype.name == 'category':
        # Drop categorical columns entirely for Pearson correlation
        df_corr = df_corr.drop(columns=[col])
    elif df_corr[col].dtype == 'bool':
        df_corr[col] = df_corr[col].astype(int)

df_corr.fillna(0, inplace=True)

# Temporarily merge the target back for correlation calculation
df_corr[TARGET_COLUMN] = df_original[TARGET_COLUMN]


# --- 3. Pearson correlations ---
corr_matrix = df_corr.corr()
target_corr = corr_matrix[TARGET_COLUMN].drop(TARGET_COLUMN) # Correlation with all features

# --- 4. Select the Top 10 Negative Features (Dissatisfaction Drivers) ---
# MODIFICATION: Select ONLY negative values, and take the 10 most negative (lowest values)
negative_corr = target_corr[target_corr < 0]

# Top 10 most negative correlations (ascending order means most negative first)
top10_negative = negative_corr.sort_values(ascending=True).head(10)


# --- 5. Prepare for Plotting ---
# Convert to positive magnitude for plotting (to see bar lengths easily)
top10_plot = top10_negative.abs().sort_values(ascending=False)
top10_plot = top10_plot.rename(
    {name: f"({top10_negative[name]:.3f}) {name}" for name in top10_plot.index}
)


# --- 6. Plot: clean, blue gradient, no outlines, no numbers ---
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 7))

# Blue gradient
colors = sns.color_palette("Blues_d", n_colors=len(top10_plot))

plt.barh(
    y=top10_plot.index,
    width=top10_plot.values,
    color=colors
)

plt.title(
    "Top 10 Drivers of Dissatisfaction (Most Negative Pearson Correlation)",
    fontsize=16,
    fontweight="bold",
    pad=15
)

plt.xlabel("Correlation with dissatisfaction", fontsize=12)
plt.ylabel("Feature (with Correlation Value)", fontsize=12)

plt.grid(axis='x', linestyle='--', alpha=0.4)
sns.despine(left=True, bottom=True)

plt.tight_layout()
plt.savefig("top_10_negative_pearson_V3.png", dpi=300)
plt.close()

print("\nSaved as 'top_10_negative_pearson_V3.png'")