# Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



# Data Preparation

In [None]:
# Load the dataset into a DataFrame. The path is relative, so
# "onlineretail.csv" must be in the notebook's working directory.
df = pd.read_csv("onlineretail.csv")

In [None]:
# Preview the first rows (head() returns 5 by default) to check the columns.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Print each column's dtype and non-null count; the non-null counts show
# which columns contain missing values.
df.info()

In [None]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept)
df = df.drop_duplicates()

In [None]:
# Keep only the rows that have a CustomerID
df = df[df["CustomerID"].notna()]

In [None]:
# Keep only records where both Quantity and UnitPrice are strictly positive
# (this drops zero AND negative values, not just zeros).
# .copy() makes df an independent frame: new columns are assigned to it in
# later cells, and assigning into a filtered slice raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)].copy()

In [None]:
# Add a TotalPrice column: Quantity multiplied by UnitPrice, row by row
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# Exploratory Data Analysis

In [None]:
# Top 10 countries ranked by their number of distinct CustomerIDs
customers_per_country = df.groupby('Country')['CustomerID'].nunique()
top_countries = customers_per_country.sort_values(ascending=False).head(10)
top_countries

In [None]:
# Visualise Top Countries by Number of Customers using horizontal barplot
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=top_countries.values, y=top_countries.index, ax=ax)
ax.set_title("Top 10 Countries by Unique Customers")
ax.set_xlabel("Number of Customers")
# Write each count just past the end of its bar. Bar i of a horizontal
# categorical barplot is centred at y == i, so (v, i) is the tip of the bar.
for i, v in enumerate(top_countries.values):
    ax.text(v, i, f" {v}", va="center")
plt.show()

In [None]:
# Monthly Sales Trend
# Parse the invoice timestamps, then bucket every row into its calendar month.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['InvoiceMonth'] = df['InvoiceDate'].dt.to_period('M')

# Revenue per month = sum of TotalPrice over all rows in that month.
monthly_sales = df.groupby('InvoiceMonth')['TotalPrice'].sum()

fig, ax = plt.subplots(figsize=(15, 6))
monthly_sales.plot(kind='line', title='Monthly Revenue', ax=ax)
ax.set_ylabel('Revenue')
ax.set_xlabel('Month')
plt.show()

In [None]:
# Top 10 Best-Selling Products (ranked by total Quantity per Description)
quantity_by_product = df.groupby("Description")["Quantity"].sum()
top_products = quantity_by_product.sort_values(ascending=False).head(10)
top_products

In [None]:
# Horizontal bar chart of the ten products with the highest total quantity
plt.figure(figsize=(10, 5))
ax = sns.barplot(x=top_products.values, y=top_products.index)
ax.set_title("Top 10 Best-Selling Products")
ax.set_xlabel("Quantity Sold")
plt.show()

In [None]:
# Summary statistics (count, mean, min/max, quartiles, ...) for the columns
# that describe() includes by default; useful for spotting extreme values.
df.describe()

In [None]:
# Boxplot of TotalPrice to see its spread and any extreme values
plt.figure(figsize=(8, 4))
ax = sns.boxplot(x=df["TotalPrice"])
ax.set_title("Boxplot of Total Price")
ax.set_xlabel("TotalPrice")
plt.show()

In [None]:
# Remove outliers: keep only rows whose TotalPrice lies between the 10th and
# 90th percentiles (both bounds inclusive).
lower_bound = df['TotalPrice'].quantile(0.10)  # 10th percentile
upper_bound = df['TotalPrice'].quantile(0.90)  # 90th percentile
# Apply the bounds; without this step they are computed but nothing is
# removed. .copy() keeps the result independent of the unfiltered frame.
df = df[df['TotalPrice'].between(lower_bound, upper_bound)].copy()


In [None]:
# Check the distribution of UnitPrice
plt.figure(figsize=(8, 4))
sns.boxplot(x=df["UnitPrice"])
plt.title("Boxplot of Unit Price")
plt.xlabel("UnitPrice")
plt.show()