In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.inspection import permutation_importance
# Use seaborn's 'darkgrid' style for the plots drawn after this point.
sns.set(style='darkgrid')

In [0]:
# Load the four Olist tables used below; each query reads every column of one
# table under the `ecommerce` qualifier.
orders, items, reviews, products = (
    spark.sql(f"""select * from ecommerce.{table_name}""")
    for table_name in (
        'olist_orders_dataset',
        'olist_order_items_dataset',
        'olist_order_reviews_dataset',
        'olist_products_dataset',
    )
)

In [0]:
# Left join keeps every order; orders without a matching review row end up
# with a null review_score.
review_scores = reviews.select('order_id', 'review_score')
order_reviews = orders.join(review_scores, on='order_id', how='left')
display(order_reviews)

In [0]:
# Per-order totals: summed item price, summed freight and the number of
# non-null product_id rows.
item_summary = items.groupby('order_id').agg({
    'price': 'sum',
    'freight_value': 'sum',
    'product_id': 'count',
})

# Replace Spark's generated aggregate column names with readable ones.
for generated_name, readable_name in (
    ('sum(price)', 'total_price'),
    ('sum(freight_value)', 'freight'),
    ('count(product_id)', 'n_items'),
):
    item_summary = item_summary.withColumnRenamed(generated_name, readable_name)

display(item_summary)

In [0]:
# Attach the per-order item totals; orders with no item rows keep nulls in
# total_price / freight / n_items because this is a left join.
item_features = item_summary.select('order_id', 'total_price', 'freight', 'n_items')
order_reviews = order_reviews.join(item_features, on='order_id', how='left')
display(order_reviews)

In [0]:
# Attach the product category to every order item. The result still has one
# row per item; nothing is aggregated per order in this cell.
product_categories = products.select('product_id', 'product_category_name')
prod_cat = items.join(product_categories, on='product_id', how='left')
display(prod_cat)

In [0]:
%python
from pyspark.sql import Window
from pyspark.sql import functions as F

# Main category of an order = the product category that appears on the most
# items of that order. This is computed with Spark aggregations instead of a
# Python UDF over a lookup dict: the dict (`main_cat`) was never built, so the
# UDF raised NameError as soon as the column was evaluated.
category_counts = (
    prod_cat
    .where(F.col('product_category_name').isNotNull())
    .groupBy('order_id', 'product_category_name')
    .count()
)

# Rank the categories inside each order by item count. Ties are broken
# alphabetically so the chosen category is the same on every run.
by_frequency = Window.partitionBy('order_id').orderBy(
    F.col('count').desc(),
    F.col('product_category_name').asc(),
)
main_cat = (
    category_counts
    .withColumn('category_rank', F.row_number().over(by_frequency))
    .where(F.col('category_rank') == 1)
    .select('order_id', F.col('product_category_name').alias('main_category'))
)

# Drop any existing main_category first so re-running this cell does not add a
# duplicate column. Orders without a known category get a null main_category.
order_reviews = (
    order_reviews
    .drop('main_category')
    .join(main_cat, on='order_id', how='left')
)

In [0]:
%python
# Binary dataset: one row per (order, review) with complete features and a
# 0/1 target that is 1 only for 5-star reviews.
binary_rows = order_reviews.select(
    'order_id', 'review_score', 'total_price', 'freight', 'n_items', 'main_category'
).dropna()

binary_rows = binary_rows.withColumn(
    'positive_review', (binary_rows['review_score'] == 5).cast("integer")
).drop('review_score', 'order_id')

# Collect to pandas and one-hot encode the category (first level dropped).
df_bin_pd = pd.get_dummies(
    binary_rows.toPandas(), columns=['main_category'], drop_first=True
)

# Keep df_bin as a pandas DataFrame. The following cells call pandas-only APIs
# on it (value_counts, corr(numeric_only=...), sample(random_state=...),
# drop(columns=...)) and pass it to scikit-learn, all of which fail on a Spark
# DataFrame.
df_bin = df_bin_pd

display(df_bin.head())

In [0]:
# Multi-class classification dataset
def classify_score(score):
    """Map a review score to a sentiment label.

    Returns 'positive' when the score equals 5, 'neutral' when it equals 3 and
    'negative' for every other value (this includes 1, 2 and 4 as well as
    missing values such as None or NaN).
    """
    for matched_score, label in ((5, 'positive'), (3, 'neutral')):
        if score == matched_score:
            return label
    return 'negative'

# order_reviews is a Spark DataFrame, which supports neither item assignment
# nor Series.apply, so collect only the columns needed here into pandas first.
multi_pd = order_reviews.select(
    'order_id', 'review_score', 'total_price', 'freight', 'main_category'
).toPandas()

# Orders without a review have no label. Drop them rather than letting
# classify_score turn the missing score into 'negative'.
multi_pd = (
    multi_pd
    .assign(review_score=pd.to_numeric(multi_pd['review_score'], errors='coerce'))
    .dropna(subset=['review_score'])
)
multi_pd = multi_pd.assign(
    review_score_class=multi_pd['review_score'].apply(classify_score)
)


def _most_common_category(categories):
    """Return the most frequent non-null category, or 'unknown' if there is none."""
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else 'unknown'


# total_price and freight are already per-order totals that repeat on every
# review row of an order, so take them once ('first'); summing would inflate
# them for orders that have more than one review.
agg_df = multi_pd.groupby('order_id').agg({
    'total_price': 'first',
    'freight': 'first',
    'main_category': _most_common_category,
    'review_score_class': 'first'
}).reset_index()

# Orders with no item rows have no totals; keep them at 0, which is what the
# previous sum-based aggregation produced for them.
agg_df = agg_df.fillna({'total_price': 0, 'freight': 0})

agg_df = pd.get_dummies(agg_df, columns=['main_category'], drop_first=True)
X_multi = agg_df.drop(columns=['order_id', 'review_score_class'])
y_multi = agg_df['review_score_class'].map({'positive': 2, 'neutral': 1, 'negative': 0})

In [0]:
# Relative frequency of each value of the binary target.
print("===== Binary Class Distribution =====")
class_shares = df_bin['positive_review'].value_counts(normalize=True)
print(class_shares)

In [0]:
# Pairwise correlations between the numeric columns of the binary dataset,
# drawn as an annotated heatmap centred on zero.
fig, ax = plt.subplots(figsize=(12, 6))
corr = df_bin.corr(numeric_only=True)
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap of Binary Dataset Features")
plt.show()

In [0]:
from pandas.plotting import scatter_matrix

# Plot at most 500 rows. Capping at len(df_bin) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist without
# replacement.
sample_df = df_bin.sample(min(500, len(df_bin)), random_state=42)

# Only numeric columns can be drawn in the scatter matrix.
numerical_cols = sample_df.select_dtypes(include='number').columns

scatter_matrix(sample_df[numerical_cols], figsize=(10, 10), diagonal='kde')

plt.suptitle('Scatter Matrix of Sample Features (Binary)', y=1.02)
plt.show()

In [0]:
# One boxplot per numeric feature, split by the binary target.
features_to_plot = ['total_price', 'freight', 'n_items']

for feature_name in features_to_plot:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df_bin, x='positive_review', y=feature_name, ax=ax)
    ax.set_title(f'{feature_name} vs Positive Review')
    plt.show()

In [0]:
# Binary classification: split features from the target, make a stratified
# train/test split and fit a 100-tree random forest with a fixed seed.
target_column = 'positive_review'
y_bin = df_bin[target_column]
X_bin = df_bin.drop(columns=[target_column])

bin_split = train_test_split(X_bin, y_bin, stratify=y_bin, random_state=42)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = bin_split

clf_bin = RandomForestClassifier(n_estimators=100, random_state=42)
clf_bin.fit(X_train_bin, y_train_bin)

In [0]:
# Multi-class classification: stratified train/test split on the three-level
# target, then a 100-tree random forest with a fixed seed.
multi_split = train_test_split(X_multi, y_multi, stratify=y_multi, random_state=42)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = multi_split

clf_multi = RandomForestClassifier(n_estimators=100, random_state=42)
clf_multi.fit(X_train_multi, y_train_multi)

In [0]:
# Per-class precision / recall / F1 of the binary model on the test split.
print("===== Binary Classification Report =====")
bin_report_preds = clf_bin.predict(X_test_bin)
print(classification_report(y_test_bin, bin_report_preds))

In [0]:
# Confusion matrix of the binary model on the test split.
print("===== Binary Confusion Matrix =====")
bin_matrix_preds = clf_bin.predict(X_test_bin)
print(confusion_matrix(y_test_bin, bin_matrix_preds))

In [0]:
# Per-class precision / recall / F1 of the multi-class model on the test split.
print("===== Multi-class Classification Report =====")
multi_report_preds = clf_multi.predict(X_test_multi)
print(classification_report(y_test_multi, multi_report_preds))

In [0]:
# Confusion matrix of the multi-class model on the test split.
print("===== Multi-class Confusion Matrix =====")
multi_matrix_preds = clf_multi.predict(X_test_multi)
print(confusion_matrix(y_test_multi, multi_matrix_preds))

In [0]:
# Permutation importance of the binary model, measured on the test split with
# 10 shuffles per feature; the ten largest mean importances are plotted.
perm_bin = permutation_importance(clf_bin, X_test_bin, y_test_bin, n_repeats=10, random_state=42)
importance_table = pd.DataFrame({
    'feature': X_bin.columns,
    'importance': perm_bin.importances_mean
})
importances_bin = importance_table.sort_values('importance', ascending=False)

top_features = importances_bin.head(10)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=top_features, x='importance', y='feature', palette='viridis', ax=ax)
ax.set_title('Top 10 Feature Importances (Binary)')
plt.show()

Key Findings & Conclusions
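The random forest models can reasonably predict customer satisfaction (a 5-star review counts as positive) from order characteristics; the classification reports above give the per-class precision and recall.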

Features like freight value, total price, and number of items are often influential.

Certain product categories might have a higher influence on the likelihood of receiving a positive or negative review.

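The dataset is imbalanced, with more positive reviews than neutral or negative ones, which affects model performance: the per-class precision and recall in the classification reports are more informative than overall accuracy. This kind of imbalance is common in real-world customer feedback datasets.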