In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime

# Initialize Faker
fake = Faker()

# Generate dataset
np.random.seed(42)
num_records = 1000

data = {
    "OrderID": np.arange(1, num_records + 1),
    "CustomerName": [fake.name() for _ in range(num_records)],
    "CustomerRegion": np.random.choice(["North", "South", "East", "West"], size=num_records),
    "Category": np.random.choice(["Electronics", "Clothing", "Home Appliances", "Accessories"], size=num_records),
    "Product": np.random.choice(
        ["Laptop", "Phone", "Monitor", "TV", "Blender", "Sofa", "T-Shirt", "Watch"], size=num_records
    ),
    "Price": np.random.randint(100, 1000, size=num_records),
    "Quantity": np.random.randint(1, 10, size=num_records),
    "OrderDate": pd.date_range(start="2022-01-01", periods=num_records).to_list(),
    "CustomerSatisfaction": np.random.randint(1, 6, size=num_records),  # Satisfaction scores between 1-5
}

df = pd.DataFrame(data)
df["TotalRevenue"] = df["Price"] * df["Quantity"]
df.head()



  from pandas.core import (


Unnamed: 0,OrderID,CustomerName,CustomerRegion,Category,Product,Price,Quantity,OrderDate,CustomerSatisfaction,TotalRevenue
0,1,Michael Lang,East,Clothing,T-Shirt,915,5,2022-01-01,4,4575
1,2,Tracy Lee,West,Home Appliances,TV,549,6,2022-01-02,4,3294
2,3,Shawn Lutz,North,Electronics,Sofa,553,4,2022-01-03,1,2212
3,4,Tonya Price,East,Electronics,Watch,728,9,2022-01-04,3,6552
4,5,Cynthia Montgomery DDS,East,Electronics,Watch,320,8,2022-01-05,5,2560


In [2]:
# Calculate the total revenue generated by each product.
ProductRevenue=df.groupby("Product")["TotalRevenue"].sum()
ProductRevenue

Product
Blender    360064
Laptop     421359
Monitor    347366
Phone      332127
Sofa       323841
T-Shirt    413249
TV         312907
Watch      280512
Name: TotalRevenue, dtype: int32

In [3]:
# Identify the region with the highest total revenue for "Clothing."
clothings=df[df["Category"]=="Clothing"]
highestregion=clothings.groupby("CustomerRegion")["TotalRevenue"].sum()
highestregion.sort_values(ascending=False).head(1)

CustomerRegion
North    203861
Name: TotalRevenue, dtype: int32

In [4]:
# Find the top 5 products with the highest total revenue.
# Calculate the total revenue generated by each product.
ProductRevenue=df.groupby("Product")["TotalRevenue"].sum()
ProductRevenue.sort_values(ascending=False).head(5)

Product
Laptop     421359
T-Shirt    413249
Blender    360064
Monitor    347366
Phone      332127
Name: TotalRevenue, dtype: int32

In [5]:
# List all orders for customers in the "North" region who purchased "Electronics."
NorthElectronics=df[(df["CustomerRegion"]=="North")&(df["Category"]=="Electronics")]
NorthElectronics

Unnamed: 0,OrderID,CustomerName,CustomerRegion,Category,Product,Price,Quantity,OrderDate,CustomerSatisfaction,TotalRevenue
2,3,Shawn Lutz,North,Electronics,Sofa,553,4,2022-01-03,1,2212
7,8,Alexander Haynes,North,Electronics,T-Shirt,111,9,2022-01-08,1,999
21,22,Austin Hunter,North,Electronics,Monitor,526,5,2022-01-22,3,2630
38,39,Jesse Matthews,North,Electronics,Sofa,569,7,2022-02-08,4,3983
63,64,John Palmer,North,Electronics,Watch,558,7,2022-03-05,3,3906
74,75,Cole Murphy,North,Electronics,T-Shirt,397,9,2022-03-16,3,3573
82,83,Adam Green,North,Electronics,Blender,271,9,2022-03-24,4,2439
139,140,Denise Wood,North,Electronics,Phone,749,2,2022-05-20,2,1498
141,142,Sherri Murphy,North,Electronics,Phone,176,6,2022-05-22,3,1056
161,162,Andrea Martinez,North,Electronics,Blender,630,4,2022-06-11,3,2520


In [6]:
# Calculate the average CustomerSatisfaction score for each CustomerRegion and sort the results.
Cust_Sat_Region=df.groupby("CustomerRegion")["CustomerSatisfaction"].mean()
Cust_Sat_Region.sort_values(ascending=False)

CustomerRegion
East     3.012931
South    3.008696
North    2.953488
West     2.917857
Name: CustomerSatisfaction, dtype: float64

In [7]:
# Determine the product in each Category with the highest total revenue.
Catrevenue=df.groupby(["Product","Category"])["TotalRevenue"].sum().reset_index()
Catrevenue1=Catrevenue.loc[Catrevenue.groupby("Category")["TotalRevenue"].idxmax()]
Catrevenue1

Unnamed: 0,Product,Category,TotalRevenue
0,Blender,Accessories,93174
21,T-Shirt,Clothing,120533
6,Laptop,Electronics,120588
7,Laptop,Home Appliances,105949


In [8]:
# Find the percentage contribution of each region to the total revenue.
regionsale=df.groupby("CustomerRegion")["TotalRevenue"].sum()
Total=df["TotalRevenue"].sum()
# print(Total)
regionsaleper=(regionsale/Total)*100
regionsaleper

CustomerRegion
East     24.009099
North    25.621609
South    24.304647
West     26.064644
Name: TotalRevenue, dtype: float64

In [9]:
# For each Category, calculate the median revenue per order.
cat_median= df.groupby("Category")["TotalRevenue"].median()
cat_median

Category
Accessories        2520.0
Clothing           2172.0
Electronics        2184.0
Home Appliances    2392.0
Name: TotalRevenue, dtype: float64

In [10]:
# Create a pivot table to show the total revenue for each product across regions.
newpivot=df.pivot_table(values="TotalRevenue",
                        index="Product",
                       columns="CustomerRegion",
                       aggfunc="sum")
newpivot

CustomerRegion,East,North,South,West
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blender,96455,80372,82401,100836
Laptop,79703,117787,96388,127481
Monitor,82071,85719,107607,71969
Phone,66214,102979,53906,109028
Sofa,86280,58442,95603,83516
T-Shirt,94352,133609,99817,85471
TV,102352,56954,59879,93722
Watch,62769,79346,82845,55552


In [11]:
# Create a pivot table showing the average CustomerSatisfaction for each category and region. Date-Based Analysis

pivot2=df.pivot_table(values="CustomerSatisfaction",
                      index="Category",
                     columns="CustomerRegion",
                     aggfunc="mean")

pivot2

CustomerRegion,East,North,South,West
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accessories,2.833333,3.108696,2.93617,2.842857
Clothing,3.080645,2.813333,2.916667,3.075758
Electronics,3.014706,2.839286,3.164384,2.928571
Home Appliances,3.092593,3.074074,2.96,2.837838


In [12]:
# Calculate the total revenue generated in each month of 2022.
df["Month"]=df["OrderDate"].dt.month
df["Year"]=df["OrderDate"].dt.year
df.head(5)

Unnamed: 0,OrderID,CustomerName,CustomerRegion,Category,Product,Price,Quantity,OrderDate,CustomerSatisfaction,TotalRevenue,Month,Year
0,1,Michael Lang,East,Clothing,T-Shirt,915,5,2022-01-01,4,4575,1,2022
1,2,Tracy Lee,West,Home Appliances,TV,549,6,2022-01-02,4,3294,1,2022
2,3,Shawn Lutz,North,Electronics,Sofa,553,4,2022-01-03,1,2212,1,2022
3,4,Tonya Price,East,Electronics,Watch,728,9,2022-01-04,3,6552,1,2022
4,5,Cynthia Montgomery DDS,East,Electronics,Watch,320,8,2022-01-05,5,2560,1,2022


In [13]:
Year2022=df[df["Year"]==2022]
monthrevenue2022=Year2022.groupby("Month")["TotalRevenue"].sum()
monthrevenue2022

Month
1     91974
2     85553
3     81507
4     74519
5     88395
6     99320
7     94364
8     94539
9     88290
10    80525
11    81367
12    90036
Name: TotalRevenue, dtype: int32

In [14]:
# Identify the product with the highest total revenue in the second quarter of 2022.
Quater2022=Year2022[Year2022["Month"].isin([4,5,6])]
highesttotalrevenueQ2=Quater2022.groupby("Product")["TotalRevenue"].sum()
highesttotalrevenueQ2.sort_values(ascending=False).head(1)

Product
Laptop    47466
Name: TotalRevenue, dtype: int32

In [15]:
# Rank all products by their total revenue and display the top 10.
total=df.groupby("Product")["TotalRevenue"].sum().reset_index()
total["Rank"]=total["TotalRevenue"].rank()
total

Unnamed: 0,Product,TotalRevenue,Rank
0,Blender,360064,6.0
1,Laptop,421359,8.0
2,Monitor,347366,5.0
3,Phone,332127,4.0
4,Sofa,323841,3.0
5,T-Shirt,413249,7.0
6,TV,312907,2.0
7,Watch,280512,1.0


In [16]:
# Within each category, rank products by their average CustomerSatisfaction score.
pi=df.groupby(["Product","Category"])["CustomerSatisfaction"].mean().reset_index()
pi["rank"]=pi.groupby("Category")["CustomerSatisfaction"].rank()
pi=pi.sort_values(["Category","rank"])
pi

Unnamed: 0,Product,Category,CustomerSatisfaction,rank
20,T-Shirt,Accessories,2.363636,1.0
28,Watch,Accessories,2.608696,2.0
4,Laptop,Accessories,2.689655,3.0
0,Blender,Accessories,2.806452,4.0
8,Monitor,Accessories,2.814815,5.0
12,Phone,Accessories,3.071429,6.0
16,Sofa,Accessories,3.346154,7.0
24,TV,Accessories,3.6,8.0
17,Sofa,Clothing,2.606061,1.0
29,Watch,Clothing,2.733333,2.0


In [19]:
# Find customers who placed more than 1 orders and calculate their total revenue.
NoOfOrder=df.groupby("CustomerName").agg(NoOfOrder=("OrderID","nunique"),TotalRevenue=("TotalRevenue","sum")).reset_index()
NoOfOrdermore3=NoOfOrder[NoOfOrder["NoOfOrder"]>1]
NoOfOrdermore3

Unnamed: 0,CustomerName,NoOfOrder,TotalRevenue
240,David Brown,2,1449
294,Elizabeth Williams,2,5872
480,Joshua Davis,2,11202
543,Kenneth Taylor,2,4172
913,Taylor White,2,10812


In [22]:
# Identify the category with the highest average revenue per order.
hightest=df.groupby("Category").agg(TotalRevenue=("TotalRevenue","sum"),TotalOrder=("OrderID","nunique")).reset_index()
hightest["Average"]=hightest["TotalRevenue"]/hightest["TotalOrder"]
hightest.sort_values("Average",ascending=False).head(1)

Unnamed: 0,Category,TotalRevenue,TotalOrder,Average
0,Accessories,628335,211,2977.890995


In [23]:
# Add a column to calculate the profit margin for each product, assuming a 30% profit margin on the price.
df["ProfitMargin"]=df["Price"]*0.30
df

Unnamed: 0,OrderID,CustomerName,CustomerRegion,Category,Product,Price,Quantity,OrderDate,CustomerSatisfaction,TotalRevenue,Month,Year,ProfitMargin
0,1,Michael Lang,East,Clothing,T-Shirt,915,5,2022-01-01,4,4575,1,2022,274.5
1,2,Tracy Lee,West,Home Appliances,TV,549,6,2022-01-02,4,3294,1,2022,164.7
2,3,Shawn Lutz,North,Electronics,Sofa,553,4,2022-01-03,1,2212,1,2022,165.9
3,4,Tonya Price,East,Electronics,Watch,728,9,2022-01-04,3,6552,1,2022,218.4
4,5,Cynthia Montgomery DDS,East,Electronics,Watch,320,8,2022-01-05,5,2560,1,2022,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Alicia Kelly,North,Electronics,Laptop,983,9,2024-09-22,1,8847,9,2024,294.9
996,997,David Brown,North,Clothing,Monitor,313,1,2024-09-23,4,313,9,2024,93.9
997,998,Michael Martinez,West,Electronics,Blender,754,6,2024-09-24,5,4524,9,2024,226.2
998,999,Stacey Hines DVM,West,Home Appliances,Phone,475,9,2024-09-25,5,4275,9,2024,142.5


In [46]:
# Determine the product that has the highest average revenue per order, grouped by region.
hightest1=df.groupby(["Category","CustomerRegion"]).agg(
    TotalRevenue=("TotalRevenue","sum"),
    TotalOrder=("OrderID","nunique")
).reset_index()


hightest1["Average"]=hightest1["TotalRevenue"]/hightest1["TotalOrder"]

hightest1_idx=hightest1.groupby("CustomerRegion")["Average"].idxmax()


hightest1_result = hightest1.iloc[hightest1_idx]

hightest1_result

Unnamed: 0,Category,CustomerRegion,TotalRevenue,TotalOrder,Average
0,Accessories,East,152252,48,3171.916667
1,Accessories,North,134315,46,2919.891304
14,Home Appliances,South,153596,50,3071.92
3,Accessories,West,202463,70,2892.328571


In [48]:
# Find the most frequently ordered product for each region.

data=df.groupby(["Product","CustomerRegion"])["OrderID"].nunique().reset_index()
Highest_indx=data.groupby("CustomerRegion")["OrderID"].idxmax()
Highest_product_region=data.iloc[Highest_indx]
Highest_product_region

Unnamed: 0,Product,CustomerRegion,OrderID
0,Blender,East,37
5,Laptop,North,40
18,Sofa,South,35
27,TV,West,43


In [None]:
# Calculate the rolling 7-day average revenue for the entire dataset.
df["RollingAvg"]=df["TotalRevenue"].rolling(window=7).mean()
df.head(10)