# Audit Anomaly Detection for E-commerce Orders

In [4]:
import pandas as pd

# Read the six Olist CSV extracts used by the anomaly checks below.
# The files are read in the same order as the names on the left-hand side.
(orders, order_items, products, customers, payments, reviews) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'olist_orders_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_products_dataset.csv',
        'olist_customers_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_order_reviews_dataset.csv',
    )
)

## Anomaly 1 - High Quantity Orders

In [5]:
# One row per order_id with the number of order_item_id rows it contains.
order_item_counts = (
    order_items.groupby('order_id', as_index=False)['order_item_id']
    .count()
    .rename(columns={'order_item_id': 'item_count'})
)

# Keep only the orders whose item count exceeds the threshold of 5.
high_quantity_orders = order_item_counts.loc[order_item_counts['item_count'].gt(5)]

## Anomaly 2 - Suspicious Product Pricing

In [6]:
# Attach product attributes (including the category) to every order item.
item_prices = pd.merge(order_items, products, on='product_id', how='left')

# Median item price within each product category.
category_median = (
    item_prices.groupby('product_category_name', as_index=False)['price']
    .median()
    .rename(columns={'price': 'category_median_price'})
)

# Bring the category median back onto each item and express the item's
# price as a multiple of that median.
price_check = pd.merge(item_prices, category_median, on='product_category_name', how='left')
price_check['price_ratio'] = price_check['price'].div(price_check['category_median_price'])

# Flag items priced at more than 3x their category median.
suspicious_prices = price_check.loc[price_check['price_ratio'].gt(3)]

## Anomaly 3 - Filtering bad reviews & Refund Triggers

In [7]:
# Keep only the reviews that gave a score of exactly 1.
bad_reviews = reviews.loc[reviews['review_score'].eq(1)]

# Resolve each reviewed order to its customer record so the reviews can be
# grouped by customer_unique_id.
orders_with_customers = pd.merge(orders, customers, on='customer_id', how='left')
bad_review_info = pd.merge(bad_reviews, orders_with_customers, on='order_id', how='left')

# Number of 1-star reviews per unique customer.
bad_review_counts = (
    bad_review_info.groupby('customer_unique_id')
    .size()
    .rename('one_star_reviews')
    .reset_index()
)

# Customers with at least 3 one-star reviews.
frequent_complainers = bad_review_counts.loc[bad_review_counts['one_star_reviews'].ge(3)]

## Anomaly 4 - Over Refunding

In [8]:
# Compare the total item price of each order with the total amount paid
# across all of that order's payment records.
order_prices = order_items.groupby('order_id')['price'].sum().reset_index()
payments_total = payments.groupby('order_id')['payment_value'].sum().reset_index()

# Left join: every order that has items is kept, even without payment rows.
payment_check = order_prices.merge(payments_total, on='order_id', how='left')

# An order with no payment record comes out of the left join with a NaN
# payment_value. NaN == 0 and NaN > 2 are both False, so such an order would
# silently escape the "zero payment" flag. Treat a missing payment as 0 so
# completely unpaid orders are flagged as well.
payment_check['payment_value'] = payment_check['payment_value'].fillna(0)

# Flag orders with zero payment or payment > 2x total price.
# The column name is kept unchanged for the exported CSV; its value is
# payment_value / price.
# NOTE(review): 'price' here is the item price only; if payment_value also
# covers shipping, the ratio is inflated for high-freight orders - confirm.
payment_check['price_to_payment_ratio'] = payment_check['payment_value'] / payment_check['price']
suspicious_payments = payment_check[
    (payment_check['payment_value'] == 0)
    | (payment_check['price_to_payment_ratio'] > 2)
]

## Anomaly 5 - Late Deliveries

In [10]:
# Reviews with a score of 2 or lower.
bad_reviews = reviews.loc[reviews['review_score'].le(2)]

# Inner join: keep only the orders that received one of those reviews.
delivery_check = pd.merge(orders, bad_reviews, on='order_id', how='inner')

# Parse both delivery timestamps so they can be compared as datetimes.
for date_column in ('order_delivered_customer_date', 'order_estimated_delivery_date'):
    delivery_check[date_column] = pd.to_datetime(delivery_check[date_column])

# An order is late when the actual delivery is after the estimated date.
delivery_check['delivery_late'] = delivery_check['order_delivered_customer_date'].gt(
    delivery_check['order_estimated_delivery_date']
)
late_delivery_complaints = delivery_check.loc[delivery_check['delivery_late']]

In [11]:
# Write each anomaly table to its own CSV file (without the index column)
# so the results can be loaded into a Tableau dashboard.
for anomaly_frame, csv_name in (
    (high_quantity_orders, 'anomaly_high_quantity_orders.csv'),
    (suspicious_prices, 'anomaly_suspicious_prices.csv'),
    (frequent_complainers, 'anomaly_frequent_reviewers.csv'),
    (suspicious_payments, 'anomaly_suspicious_payments.csv'),
    (late_delivery_complaints, 'anomaly_late_deliveries.csv'),
):
    anomaly_frame.to_csv(csv_name, index=False)