In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw transaction export; the file is decoded as ISO-8859-1.
df = pd.read_csv(filepath_or_buffer="online_retail.csv", encoding="ISO-8859-1")

# Preview the first five rows as the cell output.
df.head(n=5)


TypeError: 'tuple' object is not callable

2. Data Cleaning
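Drop rows with missing CustomerID, parse InvoiceDate as datetimes, compute TotalAmountSpent for each line, and aggregate to one row per customer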

In [2]:
# Keep only rows that carry a CustomerID, then derive the two columns the
# aggregation needs in a single chained expression:
#   * InvoiceDate parsed to datetimes,
#   * TotalAmountSpent = Quantity * UnitPrice for each row.
df = df.dropna(subset=["CustomerID"]).assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame["InvoiceDate"]),
    TotalAmountSpent=lambda frame: frame["Quantity"] * frame["UnitPrice"],
)

# Collapse to one row per customer: summed spend, latest invoice date,
# and summed quantity. CustomerID is restored as an ordinary column.
customer_df = (
    df.groupby("CustomerID")
    .agg(
        TotalAmountSpent=("TotalAmountSpent", "sum"),
        InvoiceDate=("InvoiceDate", "max"),
        Quantity=("Quantity", "sum"),
    )
    .reset_index()
)


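Calculate AveragePurchaseValue (total amount spent divided by total quantity for each customer), setting it to 0 where Quantity is zero or negative, and derive the number of days since each customer's last purchase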

In [3]:
# AveragePurchaseValue: summed spend divided by summed quantity per customer.
# Customers whose summed Quantity is not positive get 0 instead of the
# division result.
customer_df["AveragePurchaseValue"] = np.where(
    customer_df["Quantity"] > 0,
    customer_df["TotalAmountSpent"] / customer_df["Quantity"],
    0,
)

# DaysSinceLastPurchase: whole days between the reference date and each
# customer's most recent invoice.
# NOTE(review): the reference date is the wall-clock time at execution, so
# this column shifts every time the notebook is re-run. Consider a fixed
# snapshot date if reproducible output is required.
current_date = pd.to_datetime("today")
customer_df["DaysSinceLastPurchase"] = (
    current_date - customer_df["InvoiceDate"]
).dt.days

# Impute missing AveragePurchaseValue entries with the column mean (the median
# is an alternative). The result is assigned back to the column rather than
# calling fillna(inplace=True) on the column selection: an in-place method on
# a selected column is chained assignment, which does not update the parent
# frame when pandas Copy-on-Write is active.
customer_df["AveragePurchaseValue"] = customer_df["AveragePurchaseValue"].fillna(
    customer_df["AveragePurchaseValue"].mean()
)

# Turn any infinite values into NaN, then drop the rows whose
# AveragePurchaseValue is missing. The row index is left untouched.
customer_df = customer_df.replace([np.inf, -np.inf], np.nan).dropna(
    subset=["AveragePurchaseValue"]
)


3. Descriptive Statistics
Calculate the mean, median, and standard deviation of the total amount spent and of the quantity purchased per customer

In [4]:
# Total spend per customer: mean, median, and standard deviation.
spend_per_customer = customer_df["TotalAmountSpent"]
mean_spent, median_spent, std_spent = (
    spend_per_customer.mean(),
    spend_per_customer.median(),
    spend_per_customer.std(),
)

# Total quantity per customer: the same three statistics.
items_per_customer = customer_df["Quantity"]
mean_items, median_items, std_items = (
    items_per_customer.mean(),
    items_per_customer.median(),
    items_per_customer.std(),
)