In [2]:
# 1. Import libraries
import os
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [None]:
# 2. Load dataset
#
# The CSV location is resolved at run time so the notebook is not tied to a
# single machine. Lookup order:
#   1. the ONLINE_RETAIL_CSV environment variable, when set (used as-is);
#   2. data/clean_data/online_retail_cleaned.csv under the current working
#      directory or any of its parent directories;
#   3. the absolute path originally hardcoded in this notebook (fallback).
DATASET_ENV_VAR = "ONLINE_RETAIL_CSV"
DATASET_RELATIVE_PATH = Path("data") / "clean_data" / "online_retail_cleaned.csv"
LEGACY_DATASET_PATH = Path(
    r"C:\Users\slych\OneDrive\Desktop\Hackathorn2\Online_Business_Analysis\data\clean_data\online_retail_cleaned.csv"
)


def find_dataset_path() -> Path:
    """Return the location of the cleaned online-retail CSV.

    Returns:
        The path given by the ONLINE_RETAIL_CSV environment variable when it
        is set; otherwise the first existing candidate among the project
        relative path (searched upwards from the working directory) and the
        legacy absolute path.

    Raises:
        FileNotFoundError: if no override is set and no candidate file exists.
    """
    override = os.environ.get(DATASET_ENV_VAR)
    if override:
        # An explicit override is trusted; pd.read_csv reports a bad path itself.
        return Path(override)

    cwd = Path.cwd()
    candidates = [base / DATASET_RELATIVE_PATH for base in (cwd, *cwd.parents)]
    candidates.append(LEGACY_DATASET_PATH)

    for candidate in candidates:
        if candidate.is_file():
            return candidate

    searched = "\n  ".join(str(candidate) for candidate in candidates)
    raise FileNotFoundError(
        f"Could not find {DATASET_RELATIVE_PATH.name}. Set {DATASET_ENV_VAR} "
        f"or place the file at one of:\n  {searched}"
    )


# Check current working directory
print("Current working directory:", os.getcwd())

dataset_path = find_dataset_path()
print("Loading dataset from:", dataset_path)
df = pd.read_csv(dataset_path)

In [15]:
# 3. Map countries to regions
# Countries are listed once per region and then inverted into the flat
# country -> region lookup used when the Region column is built.
_countries_by_region = {
    'Europe': [
        'United Kingdom', 'Germany', 'France', 'EIRE', 'Spain',
        'Portugal', 'Italy', 'Belgium', 'Switzerland', 'Austria',
        'Netherlands', 'Norway', 'Sweden', 'Denmark', 'Finland',
    ],
    'North America': ['USA', 'Canada'],
    'Oceania': ['Australia', 'New Zealand'],
    'Asia': ['Japan', 'Hong Kong', 'Singapore'],
    'Middle East': ['United Arab Emirates', 'Israel'],
    # Default/fallback group
    'Other': ['Unspecified'],
}

country_region_map = {
    country: region
    for region, countries in _countries_by_region.items()
    for country in countries
}

In [16]:
# 4. Add Region column
# .map() yields NaN for any country missing from country_region_map;
# those rows are labelled "Other" in the same chained step.
df["Region"] = df["Country"].map(country_region_map).fillna("Other")


In [17]:
# 5. Group by Region
# One row per region: sum of TotalPrice, number of distinct InvoiceNo values,
# and sum of Quantity.
region_metrics = {
    "TotalSales": pd.NamedAgg(column="TotalPrice", aggfunc="sum"),
    "NumTransactions": pd.NamedAgg(column="InvoiceNo", aggfunc="nunique"),
    "TotalQuantity": pd.NamedAgg(column="Quantity", aggfunc="sum"),
}
region_sales = df.groupby("Region").agg(**region_metrics).reset_index()


In [18]:
# 6. Plotly: Bar chart of Total Sales by Region
# Rank the regions by revenue first, then hand the ordered frame to Plotly.
region_sales_ranked = region_sales.sort_values("TotalSales", ascending=False)

bar_options = {
    "x": "TotalSales",
    "y": "Region",
    "orientation": "h",  # horizontal bars: regions on the y-axis
    "color": "TotalSales",
    "color_continuous_scale": "Cividis",
    "title": "Total Sales by Region",
}
fig_reg_sales = px.bar(region_sales_ranked, **bar_options)
fig_reg_sales.show()

### 📊 Total Sales by Region – Bar Chart

Above is a horizontal bar chart showing the **total revenue** generated by each geographic region.

**Key Insights**
- **Europe dominates** overall sales, likely driven by high-volume purchases in the United Kingdom.  
- **Oceania and Asia** contribute moderate revenue, reflecting active but smaller customer bases.  
- Regions like the Middle East and North America show **lower sales**, presenting potential for market development.


In [23]:
# 7. Pie chart of regional share
# A non-zero `hole` turns the pie into a donut.
pie_options = {
    "names": "Region",
    "values": "TotalSales",
    "hole": 0.4,
    "title": "Regional Distribution of Sales",
}
fig_reg_pie = px.pie(region_sales, **pie_options)
fig_reg_pie.show()

### 🥧 Regional Sales Distribution – Pie Chart

The donut chart above shows each region’s **proportional share of total sales**.

**Key Insights**  
- Over **70% of total sales** originate from **Europe**, indicating a heavy market concentration.  
- **Oceania, Asia, and North America** contribute smaller slices, suggesting under-tapped potential.  
- Expansion into these less-represented regions could help **diversify revenue sources**.


In [None]:
# 8. Monthly trend for Top 3 Regions
# Pick the three regions with the highest TotalSales and keep only their rows.
ranked_regions = region_sales.sort_values("TotalSales", ascending=False)
top3_regions = ranked_regions["Region"].head(3).tolist()
in_top3 = df["Region"].isin(top3_regions)
df_top3 = df[in_top3]

# Sum TotalPrice per (Year, Month, Region).
monthly_reg_sales = (
    df_top3
    .groupby(["Year", "Month", "Region"])["TotalPrice"]
    .sum()
    .reset_index()
)

# Build a real datetime for the x-axis, pinned to the first day of each month.
first_of_month = monthly_reg_sales[["Year", "Month"]].assign(DAY=1)
monthly_reg_sales["YearMonth"] = pd.to_datetime(first_of_month)

trend_options = {
    "x": "YearMonth",
    "y": "TotalPrice",
    "color": "Region",
    "markers": True,
    "title": "Monthly Sales Trend by Region (Top 3)",
}
fig_reg_trend = px.line(monthly_reg_sales, **trend_options)
fig_reg_trend.show()


### 📈 Monthly Sales Trend by Region – Line Chart

The line chart above shows **monthly total sales trends** for the top 3 regions over time.

**Key Insights**  
- **Europe shows steady sales**, with spikes around **November/December**, likely due to holiday shopping.  
- **Asia and Oceania** have more **volatile or seasonal patterns**, possibly reflecting smaller or niche markets.  
- Recognising these cycles may support **better inventory and marketing alignment**.
