In [9]:
import pandas as pd

# 1. Load the dataset
df = pd.read_csv("customer_purchases.csv")

# 2. Apply category discounts: 10% off Electronics, 5% off Groceries.
# Vectorized lookup instead of a row-wise apply(): each row's Category is
# mapped to its price multiplier, and any category without an entry
# (including missing values) falls back to 1.0, i.e. the full price.
discount_multipliers = {"Electronics": 0.9, "Groceries": 0.95}
df["Discounted_Price"] = df["Price"] * df["Category"].map(discount_multipliers).fillna(1.0)

# 3. Categorize regions into Urban/Rural using map().
# Regions that are not keys of region_map come out as NaN.
region_map = {"New York": "Urban", "Los Angeles": "Urban", "Chicago": "Urban",
              "Texas": "Rural", "Iowa": "Rural", "Nebraska": "Rural"}
df["Region_Type"] = df["Region"].map(region_map)

# 4. Convert Date_of_Purchase to datetime
df["Date_of_Purchase"] = pd.to_datetime(df["Date_of_Purchase"])

# 5. Calculate total revenue per transaction.
# Plain column arithmetic; no per-row lambda is needed for an element-wise product.
df["Total_Revenue"] = df["Discounted_Price"] * df["Quantity"]

# 6. Per-category roll-up: summed revenue, mean discounted price, units sold
category_summary = df.groupby("Category").agg(
    Total_Revenue=pd.NamedAgg(column="Total_Revenue", aggfunc="sum"),
    Avg_Price=pd.NamedAgg(column="Discounted_Price", aggfunc="mean"),
    Total_Quantity=pd.NamedAgg(column="Quantity", aggfunc="sum"),
)

# 7. Per-customer roll-up: total spend and number of (non-null) Product entries
customer_summary = df.groupby("Customer_ID").agg(
    Total_Spent=pd.NamedAgg(column="Total_Revenue", aggfunc="sum"),
    Total_Purchases=pd.NamedAgg(column="Product", aggfunc="count"),
)

# 8. Revenue cross-tab: one row per Region, one column per Category
pivot_table = df.pivot_table(
    values="Total_Revenue",
    index="Region",
    columns="Category",
    aggfunc="sum",
)

# 9. Customers ranked from highest to lowest total spend
sorted_customers = customer_summary.sort_values("Total_Spent", ascending=False)




In [None]:
# 10. Quantity and revenue totals per product, ordered alphabetically by product name
sorted_products = (
    df.groupby("Product")[["Quantity", "Total_Revenue"]]
    .sum()
    .sort_index()
)

# 11. Product with the largest summed quantity (taken from the per-product totals above)
top_product = sorted_products["Quantity"].idxmax()

# 12. Customers whose total spend is strictly above the mean customer spend
avg_spent = customer_summary["Total_Spent"].mean()
high_spenders = customer_summary.loc[customer_summary["Total_Spent"].gt(avg_spent)]

# 13. Export the transaction-level frame (including the derived columns) to CSV,
# leaving out the row index
df.to_csv("final_aggregated_dataset.csv", index=False)

# Display results.
# The title and the frame are joined with sep="\n" rather than embedding "\n"
# in the title: print()'s default separator is a single space, which would be
# inserted before the first line of the frame and shift its header row one
# column to the right of the data rows.
print("Category Summary:", category_summary, sep="\n")
print("\nCustomer Summary:", customer_summary.head(), sep="\n")
print("\nPivot Table (Region vs Category):", pivot_table, sep="\n")
print("\nTop-Selling Product:", top_product)
print("\nHigh Spenders:", high_spenders.head(), sep="\n")

Category Summary:
              Total_Revenue   Avg_Price  Total_Quantity
Category                                              
Electronics       52920.00  462.788372             114
Groceries         62803.55  483.850000             132

Customer Summary:
              Total_Spent  Total_Purchases
Customer_ID                              
CUST001          4539.00                5
CUST002          8881.70                6
CUST003          4589.45                5
CUST004          5570.70                6
CUST005          4221.60                3

Pivot Table (Region vs Category):
 Category     Electronics  Groceries
Region                             
Chicago           9975.6   13098.60
Iowa              3319.2    8410.35
Los Angeles       2881.8    4910.55
Nebraska          8661.6   12596.05
New York         13536.0   14099.90
Texas            14545.8    9688.10

Top-Selling Product: Laptop

High Spenders:
              Total_Spent  Total_Purchases
Customer_ID                        