> # <p style="background-color:#FFC0CB; font-family:newtimeroman;color:#FF29ED;font-size:220%; text-align:center; border-radius: 15px 55px;"> Business| EDA| DEPI</p>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned dataset CSV from the Kaggle input directory into df5,
# the DataFrame every later cell in this notebook reads from.
df5 = pd.read_csv(
    filepath_or_buffer='/kaggle/input/cleaned/cleaned_data final.csv',
)

# <p style="background-color:#FFC0CB; font-family:newtimeroman;color:#FF29ED; font-size:150%; text-align:center; border-radius: 15px 50px;"> ⚛ Dataset Analysis ⚛</p>

## 1. Sales Performance

In [None]:
# Summed sales for every (Category, Sub-Category) pair, largest totals first.
sales_by_category = (
    df5.groupby(['Category', 'Sub-Category'])['Sales']
    .sum()
    .reset_index()
    .sort_values(by='Sales', ascending=False)
)
sales_by_category

In [None]:
# Summed sales per region, returned as a two-column frame (Region, Sales).
sales_by_region = (
    df5[['Region', 'Sales']]
    .groupby('Region')
    .sum()
    .reset_index()
)
sales_by_region

In [None]:
# Summed sales per shipping mode.
sales_by_ship_mode = (
    df5.groupby('Ship Mode')['Sales']
    .agg('sum')
    .reset_index()
)
sales_by_ship_mode

## 2. Customer Behavior

In [None]:
# Sum sales per customer, order from highest to lowest, keep the first ten.
top_customers = (
    df5.groupby('Customer ID')['Sales']
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)
top_customers

In [None]:
# Count of distinct Customer ID values observed in each region.
customers_by_region = (
    df5.groupby('Region')['Customer ID']
    .nunique()
    .reset_index()
)
customers_by_region

## 3. Order Analysis

In [None]:
# Mean of the Quantity column within each product category.
avg_quantity_category = (
    df5.groupby('Category')['Quantity']
    .mean()
    .reset_index()
)
avg_quantity_category

In [None]:
# Parse 'Order Date' in place (errors='coerce' turns unparseable values into NaT)
# and add an 'Order Year' column derived from it.
df5['Order Date'] = pd.to_datetime(df5['Order Date'], errors='coerce')
df5['Order Year'] = df5['Order Date'].dt.year

# Bar chart of how many rows fall into each order year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df5, x='Order Year', ax=ax)
ax.set_title('Orders by Year')
plt.show()

## 4. Profitability and Discounts

In [None]:
# Mean of the Discount column within each region.
avg_discount_region = (
    df5.groupby('Region')['Discount']
    .mean()
    .reset_index()
)
avg_discount_region

In [None]:
# One point per row of df5: discount on the x-axis, sales on the y-axis.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df5, x='Discount', y='Sales')
ax.set_title('Discount vs Sales')
plt.show()

## 5. Shipping Modes

In [None]:
# Row counts per (Region, Ship Mode), pivoted so regions are rows and shipping
# modes are columns; combinations that never occur are filled with 0.
ship_mode_region = (
    df5.groupby(['Region', 'Ship Mode'])
    .size()
    .unstack(level='Ship Mode')
    .fillna(0)
)
ship_mode_region

## 6. Product Performance

### 7. Sales Trend Over Time (Yearly and Monthly)

In [None]:
# Parse 'Order Date' (unparseable values become NaT) and add the calendar
# year and month as separate columns on df5.
df5['Order Date'] = pd.to_datetime(df5['Order Date'], errors='coerce')
df5['Year'] = df5['Order Date'].dt.year
df5['Month'] = df5['Order Date'].dt.month

# Aggregate total sales per year and per calendar month.
# The monthly totals pool the same month across all years.
sales_by_year = (
    df5.groupby('Year')['Sales']
    .sum()
    .reset_index()
)
sales_by_month = (
    df5.groupby('Month')['Sales']
    .sum()
    .reset_index()
)

# Line chart: total sales per year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=sales_by_year, x='Year', y='Sales', ax=ax)
ax.set_title('Sales Trend Over the Years')
plt.show()

# Bar chart: total sales per calendar month.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sales_by_month, x='Month', y='Sales', ax=ax)
ax.set_title('Sales by Month')
plt.show()


### 8. Top Product Categories by Region

In [None]:
# Total sales for every (Category, Region) combination.
sales_by_category_region = (
    df5.groupby(['Category', 'Region'])['Sales']
    .sum()
    .reset_index()
)

# Grouped bars: one cluster per region, one bar per category.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=sales_by_category_region,
    x='Region',
    y='Sales',
    hue='Category',
    ax=ax,
)
ax.set_title('Top Product Categories by Region')
plt.show()


### 9. Customer Purchase Patterns Across Regions

In [None]:
# Total quantity ordered in each region.
quantity_by_region = (
    df5.groupby('Region')['Quantity']
    .sum()
    .reset_index()
)

# One bar per region showing its summed quantity.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=quantity_by_region, x='Region', y='Quantity', ax=ax)
ax.set_title('Quantity Purchased by Region')
plt.show()


### 10. Discount Effectiveness on Sales

In [None]:
# Row-level scatter of discount (x) against sales (y).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df5, x='Discount', y='Sales', ax=ax)
ax.set_title('Discount vs Sales')
plt.show()

# 2x2 correlation matrix of Discount and Sales (Pearson, the pandas default).
correlation = df5[['Discount', 'Sales']].corr(method='pearson')
correlation


In [None]:
# NOTE(review): this cell repeats the preceding cell's analysis verbatim
# (same scatter plot, same correlation matrix); consider removing one of them.

# Row-level scatter of discount (x) against sales (y).
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df5, x='Discount', y='Sales')
ax.set_title('Discount vs Sales')
plt.show()

# 2x2 correlation matrix of Discount and Sales (Pearson, the pandas default).
correlation = df5.loc[:, ['Discount', 'Sales']].corr()
correlation


### 11. Quantity Ordered by Product Sub-Category

In [None]:
# Total quantity ordered for each product sub-category.
quantity_by_sub_category = (
    df5.groupby('Sub-Category')['Quantity']
    .sum()
    .reset_index()
)

# One bar per sub-category; x labels are rotated 45 degrees.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=quantity_by_sub_category, x='Sub-Category', y='Quantity', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Quantity Ordered by Product Sub-Category')
plt.show()


### 12. Sales Contribution by Category

In [None]:
# Total sales per category.
# NOTE(review): this rebinds `sales_by_category`, which an earlier cell used
# for the (Category, Sub-Category) breakdown.
sales_by_category = (
    df5.groupby('Category')['Sales']
    .sum()
    .reset_index()
)

# Pie chart of each category's share of total sales, labelled with one-decimal percentages.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    sales_by_category['Sales'],
    labels=sales_by_category['Category'],
    autopct='%1.1f%%',
)
ax.set_title('Sales Contribution by Product Category')
plt.show()


### 13. What is the monthly sales trend for the top product category?

In [None]:
# The category whose summed sales are the largest.
top_category = df5.groupby('Category')['Sales'].sum().idxmax()

# Rows of df5 belonging to that category only.
top_category_sales = df5.loc[df5['Category'] == top_category]

# Total sales per calendar month within the top category.
# Relies on the 'Month' column added to df5 by an earlier cell.
monthly_sales_top_category = (
    top_category_sales.groupby('Month')['Sales']
    .sum()
    .reset_index()
)

# Line chart of those monthly totals, titled with the category name.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=monthly_sales_top_category, x='Month', y='Sales', ax=ax)
ax.set_title(f'Monthly Sales Trend for {top_category}')
plt.show()


### 14. Which shipping mode has the highest average profit margin?

In [None]:
# Approximate profit as sales net of the discount (Sales - Sales * Discount).
# The approximation is only written when df5 has no 'Profit' column yet, so a
# genuine profit column loaded from the dataset is never overwritten.
# NOTE(review): with the approximation, Profit / Sales reduces to 1 - Discount,
# so the chart below then reflects average discount per shipping mode rather
# than a true profit margin.
if 'Profit' not in df5.columns:
    df5['Profit'] = df5['Sales'] - (df5['Sales'] * df5['Discount'])

# Profit margin = Profit / Sales. Zero sales are mapped to NaN first so such
# rows give NaN (skipped by mean) instead of +/-inf when Profit is non-zero.
df5['Profit Margin'] = df5['Profit'] / df5['Sales'].replace(0, np.nan)

# Mean profit margin per shipping mode.
avg_profit_margin_by_ship_mode = (
    df5.groupby('Ship Mode')['Profit Margin']
    .mean()
    .reset_index()
)

# One bar per shipping mode.
plt.figure(figsize=(10,6))
sns.barplot(data=avg_profit_margin_by_ship_mode, x='Ship Mode', y='Profit Margin')
plt.title('Average Profit Margin by Shipping Mode')
plt.show()


### 15. Which product sub-category has the highest discount, and is it profitable?

In [None]:
# Mean discount and mean profit for every sub-category.
# Relies on the 'Profit' column that an earlier cell adds to df5.
sub_category_discount_profit = (
    df5.groupby('Sub-Category')[['Discount', 'Profit']]
    .mean()
    .reset_index()
)

# Single-row frame holding the sub-category with the largest mean discount.
highest_discount_sub_category = (
    sub_category_discount_profit
    .sort_values(by='Discount', ascending=False)
    .iloc[:1]
)

# First figure: mean discount for all sub-categories.
fig_discount, ax_discount = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=sub_category_discount_profit,
    x='Sub-Category',
    y='Discount',
    color='blue',
    ax=ax_discount,
)
plt.xticks(rotation=45)
ax_discount.set_title('Average Discount by Product Sub-Category')

# Second figure: mean profit for all sub-categories.
fig_profit, ax_profit = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=sub_category_discount_profit,
    x='Sub-Category',
    y='Profit',
    color='green',
    ax=ax_profit,
)
plt.xticks(rotation=45)
ax_profit.set_title('Average Profit by Product Sub-Category')
plt.show()

highest_discount_sub_category


### 16. What is the most popular product category based on order quantity in each region?

In [None]:
# Group by category and region to get the total quantity
category_region_quantity = df5.groupby(['Region', 'Category'])['Quantity'].sum().reset_index()

# Plot the quantity ordered in each region by product category
plt.figure(figsize=(12,6))
sns.barplot(data=category_region_quantity, x='Region', y='Quantity', hue='Category')
plt.title('Product Category Popularity in Each Region (by Quantity Ordered)')
plt.show()


### 17. What is the relationship between sales and quantity ordered?

In [None]:
# Row-level scatter of quantity (x) against sales (y), coloured by category
# and drawn semi-transparent (alpha=0.7).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df5,
    x='Quantity',
    y='Sales',
    hue='Category',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Relationship Between Sales and Quantity Ordered')
plt.show()

# 2x2 correlation matrix of Quantity and Sales (Pearson, the pandas default).
df5[['Quantity', 'Sales']].corr(method='pearson')


### 18. Which customer segments receive the highest discounts?

In [None]:
# Mean discount per product category.
# NOTE(review): the section heading asks about customer segments, but the
# analysis groups by 'Category'; confirm whether the dataset has a customer
# segment column that should be used instead.
# (A bare `df5.columns` expression used to sit here; mid-cell expressions are
# never displayed, so it had no effect and was removed.)
avg_discount_by_category = df5.groupby('Category')['Discount'].mean().reset_index()

# Plot the average discount for each category
plt.figure(figsize=(10,6))
sns.barplot(data=avg_discount_by_category, x='Category', y='Discount')
plt.title('Average Discount by Category')
plt.show()


### 19. What are the sales trends in the top 3 regions?

In [None]:
# Labels of the three regions with the largest summed sales.
top_3_regions = (
    df5.groupby('Region')['Sales']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .index
)

# Rows of df5 that belong to one of those regions.
top_region_sales = df5.loc[df5['Region'].isin(top_3_regions)]

# Total sales per (Year, Region).
# Relies on the 'Year' column added to df5 by an earlier cell.
sales_trends_top_regions = (
    top_region_sales.groupby(['Year', 'Region'])['Sales']
    .sum()
    .reset_index()
)

# One line per region showing its yearly sales totals.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=sales_trends_top_regions,
    x='Year',
    y='Sales',
    hue='Region',
    ax=ax,
)
ax.set_title('Sales Trends in Top 3 Regions Over Time')
plt.show()


### 20. What is the distribution of profits across product sub-categories?

In [None]:
# Summed profit for each sub-category.
# Relies on the 'Profit' column that an earlier cell adds to df5.
profit_by_sub_category = (
    df5.groupby('Sub-Category')['Profit']
    .sum()
    .reset_index()
)

# One bar per sub-category; x labels are rotated 45 degrees.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=profit_by_sub_category, x='Sub-Category', y='Profit', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Profit Distribution by Product Sub-Category')
plt.show()
