## Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

## Data Loading

In [2]:
df=pd.read_csv("ecommerce_customer_behavior_dataset.csv")
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High


## Data Pre-Processing

In [3]:
## Checking Missing Values 
df.isnull().sum()

Customer ID                    0
Age                            0
Gender                         0
Location                       0
Product Category               0
Purchase Amount ($)            0
Time Spent on Website (min)    0
Device Type                    0
Payment Method                 0
Discount Availed               0
Number of Items Purchased      0
Return Customer                0
Review Score (1-5)             0
Delivery Time (days)           0
Subscription Status            0
Customer Satisfaction          0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer ID                  10000 non-null  int64  
 1   Age                          10000 non-null  int64  
 2   Gender                       10000 non-null  object 
 3   Location                     10000 non-null  object 
 4   Product Category             10000 non-null  object 
 5   Purchase Amount ($)          10000 non-null  float64
 6   Time Spent on Website (min)  10000 non-null  int64  
 7   Device Type                  10000 non-null  object 
 8   Payment Method               10000 non-null  object 
 9   Discount Availed             10000 non-null  bool   
 10  Number of Items Purchased    10000 non-null  int64  
 11  Return Customer              10000 non-null  bool   
 12  Review Score (1-5)           10000 non-null  int64  
 13  Delivery Time (da

In [5]:
df.describe()

Unnamed: 0,Customer ID,Age,Purchase Amount ($),Time Spent on Website (min),Number of Items Purchased,Review Score (1-5),Delivery Time (days)
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,43.7899,503.892494,29.9184,4.9985,2.9951,7.013
std,2886.89568,15.005521,286.238084,17.002107,2.574988,1.404163,3.721561
min,1.0,18.0,5.19,1.0,1.0,1.0,1.0
25%,2500.75,31.0,254.9375,15.0,3.0,2.0,4.0
50%,5000.5,44.0,506.195,30.0,5.0,3.0,7.0
75%,7500.25,57.0,749.135,45.0,7.0,4.0,10.0
max,10000.0,69.0,999.98,59.0,9.0,5.0,13.0


Since the Data is clean we can deep dive into the analysis !

## **Level 1: Basic Insights**

In [6]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High


### Q1: Find Mean, Median, and Mode (Age)

In [7]:
# Calculate mean, median, and mode for Age
age_mean = df['Age'].mean()
age_median = df['Age'].median()
age_mode = df['Age'].mode()[0] 

print(age_mean)
print(age_median)
print(age_mode)

43.7899
44.0
51


In [8]:
import plotly.graph_objects as go

age_mean = df['Age'].mean()
age_median = df['Age'].median()
age_mode = df['Age'].mode()[0] 

# Creating a single figure
fig = go.Figure()

# Adding histogram
fig.add_trace(go.Histogram(x=df['Age'], name="Age Distribution", opacity=0.7))

# Adding vertical lines for mean, median, and mode
fig.add_vline(x=age_mean, line_dash="dash", line_color="red")
fig.add_vline(x=age_median, line_dash="dash", line_color="green")
fig.add_vline(x=age_mode, line_dash="dash", line_color="blue")

# Add annotations with better positioning
fig.add_annotation(
    x=age_mean, y=df['Age'].value_counts().max() * 0.9, 
    text=f"Mean: {age_mean:.2f}",
    arrowhead=2,
    bgcolor="red",
    font=dict(color="white"),
    bordercolor="red",
    borderwidth=2,
    borderpad=2,
    arrowcolor="red",
    ax=-80, ay=-20  
)

fig.add_annotation(
    x=age_median, y=df['Age'].value_counts().max() * 0.9,
    text=f"Median: {age_median:.2f}",
    arrowhead=2,
    bgcolor="green",
    font=dict(color="white"),
    bordercolor="green",
    borderwidth=2,
    borderpad=2,
    arrowcolor="green",
    ax=60, ay=-20  
)

fig.add_annotation(
    x=age_mode, y=df['Age'].value_counts().max() * 0.9,
    text=f"Mode: {age_mode}",
    arrowhead=2,
    bgcolor="blue",
    font=dict(color="white"),
    bordercolor="blue",
    borderwidth=2,
    borderpad=2,
    arrowcolor="blue",
    ax=-55, ay=25
)

# Update layout
fig.update_layout(
    title_text="Age Distribution with Mean, Median, and Mode",
    xaxis_title_text="Age",
    yaxis_title_text="Count",
    bargap=0.2,
    showlegend=False
)

# Show the plot
fig.show()

# Save the plot as an HTML file
#fig.write_html("age_analysis_combined.html")
#print("Visualization saved as age_analysis_combined.html")




## Q2: Find variance, standard deviation, and z-score (Purchase Amount)

In [9]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High


In [10]:
# Calculate variance and standard deviation
variance = df['Purchase Amount ($)'].var()
standard_deviation = df['Purchase Amount ($)'].std()

# Calculate z-scores
mean_purchase_amount = df['Purchase Amount ($)'].mean()
df['Z-Score'] = (df['Purchase Amount ($)'] - mean_purchase_amount) / standard_deviation

# Display results
print(f"Variance of Purchase Amount: {variance:.2f}")
print(f"Standard Deviation of Purchase Amount: {standard_deviation:.2f}")
print("Z-Scores for Purchase Amount:")
df[['Purchase Amount ($)', 'Z-Score']]


Variance of Purchase Amount: 81932.24
Standard Deviation of Purchase Amount: 286.24
Z-Scores for Purchase Amount:


Unnamed: 0,Purchase Amount ($),Z-Score
0,202.54,-1.052804
1,655.94,0.531192
2,963.65,1.606207
3,485.59,-0.063942
4,143.27,-1.259869
...,...,...
9995,99.23,-1.413727
9996,797.81,1.026829
9997,820.17,1.104946
9998,34.94,-1.638330


In [188]:
df["Z-Score"].max()

1.7331289361392022

In [192]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the dataset
df = pd.read_csv('ecommerce_customer_behavior_dataset.csv')

# Calculate variance, standard deviation, and z-score for Purchase Amount
purchase_amount = df['Purchase Amount ($)']
variance = np.var(purchase_amount)
std_dev = np.std(purchase_amount)
mean_purchase_amount = np.mean(purchase_amount)
z_scores = (purchase_amount - mean_purchase_amount) / std_dev

# Create a subplot with 3 rows
fig = make_subplots(rows=3, cols=1, 
                    subplot_titles=("Purchase Amount Distribution (Variance)",
                                    "Purchase Amount Distribution (Standard Deviation)",
                                    "Z-Scores Distribution"),
                    vertical_spacing=0.1)

# Row 1: Purchase Amount Distribution (Variance)
fig.add_trace(go.Histogram(x=purchase_amount, name='Purchase Amount', 
                             marker_color='lightblue', opacity=0.75), row=1, col=1)
fig.add_vline(x=mean_purchase_amount, line_dash="dash", line_color="red", row=1, col=1, 
               annotation_text="Mean", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount + np.sqrt(variance), line_dash="dash", line_color="blue", row=1, col=1,
               annotation_text="Mean + √Variance", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount - np.sqrt(variance), line_dash="dash", line_color="blue", row=1, col=1,
               annotation_text="Mean - √Variance", annotation_position="top right")

# Row 2: Purchase Amount Distribution (Standard Deviation)
fig.add_trace(go.Histogram(x=purchase_amount, name='Purchase Amount', 
                             marker_color='lightgreen', opacity=0.75), row=2, col=1)
fig.add_vline(x=mean_purchase_amount, line_dash="dash", line_color="red", row=2, col=1,
               annotation_text="Mean", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount + std_dev, line_dash="dash", line_color="green", row=2, col=1,
               annotation_text="Mean + 1 SD", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount - std_dev, line_dash="dash", line_color="green", row=2, col=1,
               annotation_text="Mean - 1 SD", annotation_position="top right")

# Row 3: Z-Scores distribution
fig.add_trace(go.Histogram(x=z_scores, name='Z-Scores', 
                             marker_color='salmon', opacity=0.75), row=3, col=1)

# Update layout
fig.update_layout(height=1200, width=1000, title_text="Purchase Amount Analysis",
                  showlegend=False)

# Add annotations for variance, standard deviation, and z-score range
fig.add_annotation(x=0.95, y=0.95, xref="paper", yref="paper",
                   text=f"Variance: {variance:.2f}",
                   showarrow=False, align="right", bgcolor="rgba(255,255,255,0.8)")
fig.add_annotation(x=0.95, y=0.62, xref="paper", yref="paper",
                   text=f"Standard Deviation: {std_dev:.2f}",
                   showarrow=False, align="right", bgcolor="rgba(255,255,255,0.8)")
fig.add_annotation(x=0.95, y=0.29, xref="paper", yref="paper",
                   text=f"Z-score range: {z_scores.min():.2f} to {z_scores.max():.2f}",
                   showarrow=False, align="right", bgcolor="rgba(255,255,255,0.8)")

# Show plot
fig.show()

# Print summary statistics
print(f"Variance: {variance:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Z-score range: {z_scores.min():.2f} to {z_scores.max():.2f}")


Variance: 81924.05
Standard Deviation: 286.22
Z-score range: -1.74 to 1.73


In [193]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the dataset
df = pd.read_csv('ecommerce_customer_behavior_dataset.csv')

# Calculate variance, standard deviation, and z-score for Purchase Amount
purchase_amount = df['Purchase Amount ($)']
variance = np.var(purchase_amount)
std_dev = np.std(purchase_amount)
mean_purchase_amount = np.mean(purchase_amount)
z_scores = (purchase_amount - mean_purchase_amount) / std_dev

# Create a subplot with 3 rows
fig = make_subplots(rows=3, cols=1, 
                    subplot_titles=("Purchase Amount Distribution (Variance)",
                                    "Purchase Amount Distribution (Standard Deviation)",
                                    "Z-Scores Distribution"),
                    vertical_spacing=0.1)

# Define colors from Set1 palette
colors = ['#E41A1C', '#377EB8', '#4DAF4A']  # Colors from Set1

# Row 1: Purchase Amount Distribution (Variance)
fig.add_trace(go.Histogram(x=purchase_amount, name='Purchase Amount', 
                             marker_color=colors[0], opacity=0.75), row=1, col=1)
fig.add_vline(x=mean_purchase_amount, line_dash="dash", line_color="black", row=1, col=1, 
               annotation_text="Mean", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount + np.sqrt(variance), line_dash="dash", line_color=colors[1], row=1, col=1,
               annotation_text="Mean + √Variance", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount - np.sqrt(variance), line_dash="dash", line_color=colors[1], row=1, col=1,
               annotation_text="Mean - √Variance", annotation_position="top right")

# Row 2: Purchase Amount Distribution (Standard Deviation)
fig.add_trace(go.Histogram(x=purchase_amount, name='Purchase Amount', 
                             marker_color=colors[2], opacity=0.75), row=2, col=1)
fig.add_vline(x=mean_purchase_amount, line_dash="dash", line_color="black", row=2, col=1,
               annotation_text="Mean", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount + std_dev, line_dash="dash", line_color=colors[1], row=2, col=1,
               annotation_text="Mean + 1 SD", annotation_position="top right")
fig.add_vline(x=mean_purchase_amount - std_dev, line_dash="dash", line_color=colors[1], row=2, col=1,
               annotation_text="Mean - 1 SD", annotation_position="top right")

# Row 3: Z-Scores distribution
fig.add_trace(go.Histogram(x=z_scores, name='Z-Scores', 
                             marker_color=colors[0], opacity=0.75), row=3, col=1)

# Update layout
fig.update_layout(height=1200, width=1000, title_text="Purchase Amount Analysis",
                  showlegend=False)

# Add annotations for variance, standard deviation, and z-score range
fig.add_annotation(x=0.95, y=0.95, xref="paper", yref="paper",
                   text=f"Variance: {variance:.2f}",
                   showarrow=False, align="right", bgcolor="rgba(255,255,255,0.8)")
fig.add_annotation(x=0.95, y=0.62, xref="paper", yref="paper",
                   text=f"Standard Deviation: {std_dev:.2f}",
                   showarrow=False, align="right", bgcolor="rgba(255,255,255,0.8)")
fig.add_annotation(x=0.95, y=0.29, xref="paper", yref="paper",
                   text=f"Z-score range: {z_scores.min():.2f} to {z_scores.max():.2f}",
                   showarrow=False, align="right", bgcolor="rgba(255,255,255,0.8)")

# Show plot
fig.show()

# Print summary statistics
print(f"Variance: {variance:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Z-score range: {z_scores.min():.2f} to {z_scores.max():.2f}")


Variance: 81924.05
Standard Deviation: 286.22
Z-score range: -1.74 to 1.73


### Q3: What are the top three product categories based on the number of purchases?

In [12]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [13]:
top_3=df.groupby("Product Category")["Number of Items Purchased"].sum().sort_values(ascending=False)

In [14]:
top_3

Product Category
Toys           6580
Books          6533
Electronics    6469
Clothing       6359
Home           6214
Groceries      6067
Beauty         6057
Sports         5706
Name: Number of Items Purchased, dtype: int64

In [16]:
import plotly.express as px

# Create the bar chart
fig = px.bar(
    top_3,
    x=top_3.index,
    y=top_3.values,
    labels={'x': 'Product Category', 'y': 'Count'},
    title='Top 3 Product Categories',
    color=top_3.index,
)

# Annotations for the top 3 categories
annotations = ['1st', '2nd', '3rd']

# Ensure we only add annotations for the number of bars available (up to 3)
for i in range(min(3, len(top_3))):  # Iterate only for the number of available bars (max 3)
    fig.add_annotation(
        x=top_3.index[i],  # Category label
        y=top_3.values[i],  # Value of the bar
        text=f'{annotations[i]} Place',  # Add custom ranking labels (1st, 2nd, 3rd)
        showarrow=True,  # Show an arrow pointing to the bar
        arrowhead=2,
        ax=0, ay=-40,  # Adjust arrow positioning
        font=dict(size=14, color="white"),  # Font size and color
        bgcolor="red",  # Background color of the annotation box
        bordercolor="red",  # Border color for visibility
        borderwidth=2,  # Border width
        borderpad=4,  # Padding within the annotation box
        arrowcolor="red"  # Arrow color matching the box
    )

# Show the plot
fig.show()


### Q4: How many customers are classified as return customers?

In [17]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [91]:
df['Return Customer'].value_counts()

Return Customer
False    5004
True     4996
Name: count, dtype: int64

In [89]:
return_customer=df.groupby("Gender")['Return Customer'].value_counts()
return_customer

Gender  Return Customer
Female  False              1708
        True               1630
Male    False              1688
        True               1661
Other   True               1705
        False              1608
Name: count, dtype: int64

In [194]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# First chart: Count of Return Customers vs. Non-Return Customers
return_customer_counts = df['Return Customer'].value_counts()
total_customers = return_customer_counts.sum()
return_customer_percentages = (return_customer_counts / total_customers * 100).round(2)  # Calculate percentages to 2 decimal places

# Second chart: Distribution of Return Customers by Gender
return_customers_by_gender = df[df['Return Customer'] == True].groupby('Gender').size().reset_index(name='Count')
total_return_customers = return_customers_by_gender['Count'].sum()
return_customers_by_gender['Percentage'] = (return_customers_by_gender['Count'] / total_return_customers * 100).round(2)

# Create subplots with 2 rows and 1 column
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=(
        'Count of Return Customers vs. Non-Return Customers', 
        'Distribution of Return Customers by Gender'
    ),
    vertical_spacing=0.15  # Increase space between subplots
)

# First chart (Return Customers vs. Non-Return Customers) - add trace to first row
fig.add_trace(
    go.Bar(
        x=return_customer_counts.index.map({True: 'Return Customers', False: 'Non-Return Customers'}), 
        y=return_customer_counts.values, 
        marker_color=px.colors.qualitative.Set1, 
        text=[f'{count} ({percent:.2f}%)' for count, percent in zip(return_customer_counts.values, return_customer_percentages)],  # Add count and percentage (2 decimal points)
        textposition='auto',
        name="Return vs Non-Return"
    ),
    row=1, col=1
)

# Second chart (Return Customers by Gender) - add trace to second row
fig.add_trace(
    go.Bar(
        x=return_customers_by_gender['Gender'], 
        y=return_customers_by_gender['Count'], 
        marker_color=px.colors.qualitative.Set1, 
        text=[f'{count} ({percent:.2f}%)' for count, percent in zip(return_customers_by_gender['Count'], return_customers_by_gender['Percentage'])],  # Add count and percentage (2 decimal points)
        textposition='auto',
        name="Return Customers by Gender"
    ),
    row=2, col=1
)

# Update layout and customize aesthetics
fig.update_layout(
    height=800,  # Height of the plot
    title_text="Return Customer Analysis",  # Overall title
    title_x=0.3,  # Center title
    title_font=dict(size=24, family='Arial, bold'),  # Make the main title bigger
    showlegend=False,  # No need for legend since charts are separate
    plot_bgcolor='white',  # Background color
    xaxis=dict(showgrid=False),  # No vertical gridlines
    yaxis=dict(showgrid=False),  # No horizontal gridlines
    margin=dict(l=50, r=50, t=100, b=50),  # Margins for clarity
    font=dict(family="Arial", size=14)  # General font for text
)

# Improve the look of the bars with lines
fig.update_traces(marker_line_color='black', marker_line_width=1.5)

# Change subplot title styles
fig.update_annotations(
    dict(
        font_size=18, 
        font_family="Arial, bold",
        yshift=10  # Shift titles upwards for clarity
    )
)

# Add background shading to the second plot for contrast
fig.add_shape(
    type="rect",
    xref="paper", yref="paper",
    x0=0, y0=0, x1=1, y1=0.45,  # Covers the second plot area
    fillcolor="lightgrey", opacity=0.2, line_width=0
)

# Display the combined chart
fig.show()


### Q5: What is the average review score given by customers?

In [23]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [24]:
df["Review Score (1-5)"].mean()

2.9951

In [112]:
import pandas as pd
import plotly.graph_objects as go

# Calculate mean for Review Score
review_mean = df['Review Score (1-5)'].mean()

# Creating a single figure
fig = go.Figure()

# Adding histogram for Review Scores
fig.add_trace(go.Histogram(x=df['Review Score (1-5)'], name="Review Score Distribution", opacity=0.7))

# Adding vertical line for mean
fig.add_vline(x=review_mean, line_dash="dash", line_color="red")

# Add annotation for mean
fig.add_annotation(
    x=review_mean, 
    y=df['Review Score (1-5)'].value_counts().max(), 
    text=f"Average Review Score: {review_mean}",
    arrowhead=2,
    bgcolor="red",
    font=dict(color="white"),
    bordercolor="red",
    borderwidth=2,
    borderpad=2,
    arrowcolor="red",
    ax=-60, 
    ay=-20  
)

# Update layout
fig.update_layout(
    title_text="Average Review Score Distribution",
    xaxis_title_text="Review Score (1-5)",
    yaxis_title_text="Count",
    bargap=0.2,
    showlegend=False
)

# Show the plot
fig.show()

# Save the plot as an HTML file (optional)
# fig.write_html("review_score_analysis.html")
# print("Visualization saved as review_score_analysis.html")


### Q6: How does the average delivery time vary between subscription statuses (Free,Premium)?

In [27]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [28]:
df.groupby("Subscription Status")["Delivery Time (days)"].mean()

Subscription Status
Free       6.963924
Premium    7.070347
Trial      7.004272
Name: Delivery Time (days), dtype: float64

In [29]:
import plotly.express as px

# Create a DataFrame for visualization
avg_delivery_time = df.groupby("Subscription Status")["Delivery Time (days)"].mean().reset_index()

# Create a vertical bar chart
bar_fig = px.bar(
    avg_delivery_time,
    x='Delivery Time (days)',
    y='Subscription Status',
    title='Average Delivery Time by Subscription Status',
    labels={'Delivery Time (days)': 'Average Delivery Time (days)', 'Subscription Status': 'Subscription Status'},
    color='Subscription Status',
    color_discrete_sequence=px.colors.qualitative.Set1,  # Use a color sequence
    orientation='h'  # Horizontal bar chart
)

# Show the vertical bar chart
bar_fig.show()


### Q7: How many customers are subscribed to the service?

In [30]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [31]:
subscription_counts=df["Subscription Status"].value_counts().reset_index()

In [32]:
import plotly.express as px

# Prepare the data for visualization
subscription_counts = df["Subscription Status"].value_counts().reset_index()
subscription_counts.columns = ['Subscription Status', 'Count']

# Create a bar chart
bar_fig = px.bar(
    subscription_counts,
    x='Subscription Status',
    y='Count',
    title='Number of Customers by Subscription Status',
    labels={'Count': 'Number of Customers'},
    color='Subscription Status',
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Show the bar chart
bar_fig.show()


### Q8: What percentage of customers used devices to make purchases? (Mobile, Desktop,Tablet)

In [33]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [34]:
df["Device Type"].value_counts()

Device Type
Mobile     3374
Desktop    3348
Tablet     3278
Name: count, dtype: int64

In [35]:
device_counts = df["Device Type"].value_counts()
total_counts = device_counts.sum()
device_percentages = (device_counts / total_counts) * 100
device_percentages

Device Type
Mobile     33.74
Desktop    33.48
Tablet     32.78
Name: count, dtype: float64

In [117]:
# Create a DataFrame for percentages
device_percentage_df = pd.DataFrame({
    'Device Type': device_counts.index,
    'Percentage': device_percentages.values
})

# Step 3: Create a pie chart to visualize the percentages
pie_fig = px.pie(
    device_percentage_df,
    values='Percentage',
    names='Device Type',
    title='Percentage of Customers by Device Type',
    color='Device Type',
    color_discrete_sequence=px.colors.qualitative.Set1,
    hole=0.3  # To make it a donut chart
)

# Add annotations to the pie chart
pie_fig.update_traces(textinfo='percent+label', textfont_size=14)

# Update layout for better aesthetics
pie_fig.update_layout(
    title_font_size=20,
    legend_title_text='Device Type'
    #legend=dict(x=0.8, y=0.5)  # Positioning the legend
)

# Show the pie chart
pie_fig.show()


### Q9: What is the average purchase amount for customers who availed discounts compared to those who didn’t?

In [38]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [39]:
df.groupby("Discount Availed")["Purchase Amount ($)"].mean()

Discount Availed
False    502.511451
True     505.264178
Name: Purchase Amount ($), dtype: float64

In [128]:
avg_purchase = df.groupby("Discount Availed")["Purchase Amount ($)"].mean().reset_index()

# Map True/False to meaningful labels
avg_purchase['Discount Availed'] = avg_purchase['Discount Availed'].map({True: 'Discount', False: 'No Discount'})

# Create a bar chart
bar_fig = px.bar(
    avg_purchase,
    x='Discount Availed',
    y='Purchase Amount ($)',
    title='Average Purchase Amount Based on Discount Availed',
    labels={'Discount Availed': 'Discount Status', 'Purchase Amount ($)': 'Average Amount'},
    color='Discount Availed',
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Adding data labels on top of the bars
bar_fig.update_traces(
    texttemplate='%{y:.2f}', 
    textposition='outside',
    marker=dict(line=dict(width=1, color='black')),
    textfont_size=12  # Adjust text size for labels
)

# Update layout for better aesthetics
bar_fig.update_layout(
    title_font_size=20,  # Title font size
    xaxis_title_font_size=14,  # X-axis title font size
    yaxis_title_font_size=14,  # Y-axis title font size
    font=dict(size=12),  # General font size
    xaxis=dict(title=dict(standoff=10)),  # Space between x-axis title and labels
    yaxis=dict(title=dict(standoff=10)),  # Space between y-axis title and labels
    showlegend=False,  # Hide the legend since it's not needed
    height=500  # Adjust height for better visibility
)

# Show the bar chart
bar_fig.show()


### Q10: What is the most common payment method used by customers?

In [41]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [42]:
df["Payment Method"].value_counts().sort_values(ascending=False)

Payment Method
Bank Transfer       2067
Credit Card         2028
Cash on Delivery    2007
Debit Card          1983
PayPal              1915
Name: count, dtype: int64

In [149]:

payment_counts = df["Payment Method"].value_counts().reset_index()
payment_counts.columns = ['Payment Method', 'Count']

# Create a bar chart
payment_fig = px.bar(
    payment_counts,
    x='Payment Method',
    y='Count',
    title='Most Common Payment Methods Used by Customers',
    labels={'Payment Method': 'Payment Method', 'Count': 'Number of Customers'},
    color='Payment Method',
    color_continuous_scale=px.colors.sequential.Viridis,
)

# Adding annotation for the most common payment method
most_common_payment = payment_counts.iloc[0]  # Get the most common payment method
payment_fig.add_annotation(
    x=most_common_payment['Payment Method'], 
    y=most_common_payment['Count'],
    text=f"Most Common: {most_common_payment['Payment Method']}",
    arrowhead=2,
    bgcolor="red",
    font=dict(color="white"),
    bordercolor="red",
    borderwidth=2,
    borderpad=2,
    arrowcolor="red",
    ax=0,  # Adjust position as needed
    ay=-40  # Adjust position as needed
)

# Update layout for better aesthetics
payment_fig.update_layout(
    title_font_size=20,  # Title font size
    xaxis_title_font_size=14,  # X-axis title font size
    yaxis_title_font_size=14,  # Y-axis title font size
    font=dict(size=12),  # General font size
    showlegend=False,  # Hide the legend since it's not needed
    height=400  # Adjust height for better visibility
)

# Show the bar chart
payment_fig.show()


## **Level 2: Intermediate Insights**

### Q1: What are the average review scores of users of the most common payment method?

In [44]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [45]:
df.groupby("Payment Method")["Review Score (1-5)"].mean()

Payment Method
Bank Transfer       2.988873
Cash on Delivery    3.019930
Credit Card         2.970414
Debit Card          2.996974
PayPal              3.000000
Name: Review Score (1-5), dtype: float64

In [151]:
# Calculate the average review score for each payment method
average_review_scores = df.groupby("Payment Method")["Review Score (1-5)"].mean().reset_index()

# Create a bar chart for average review scores
average_review_fig = px.bar(
    average_review_scores,
    x='Payment Method',
    y='Review Score (1-5)',
    title='Average Review Scores by Payment Method',
    labels={'Review Score (1-5)': 'Average Review Score'},
    color='Payment Method',
    color_continuous_scale=px.colors.sequential.YlGnBu,  # Changed to a more intuitive color scale
)

# Sort the average review scores based on the order of payment counts
payment_counts = df["Payment Method"].value_counts().index.tolist()  # Get the order of payment methods
average_review_fig.update_xaxes(categoryorder='array', categoryarray=payment_counts)  # Rearrange x-axis

# Add annotation for the most common payment method with specific styling
most_common_payment_method = payment_counts[0]  # The first in the ordered list
mean_review_score = average_review_scores[average_review_scores['Payment Method'] == most_common_payment_method]['Review Score (1-5)'].values[0]

average_review_fig.add_annotation(
    x=most_common_payment_method,
    y=mean_review_score,
    text="Avg. Review (Most Common): {:.2f}".format(mean_review_score),
    arrowhead=2,
    bgcolor="red",
    font=dict(color="white"),
    bordercolor="red",
    borderwidth=2,
    borderpad=2,
    arrowcolor="red",
    ax=0,  # Adjust position as needed
    ay=-40  # Adjust position as needed
)


# Update layout for better aesthetics
average_review_fig.update_layout(
    title_font_size=20,  # Title font size
    xaxis_title_font_size=14,  # X-axis title font size
    yaxis_title_font_size=14,  # Y-axis title font size
    font=dict(size=12),  # General font size
    height=400,  # Adjust height for better visibility
    yaxis_tickformat='.1f'  # Format y-axis ticks to one decimal place
)

# Show the bar chart
average_review_fig.show()


### Q2: What is the correlation between time spent on the website and purchase amount? Do customers who spend more time on the website purchase more items?

In [48]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [49]:
df.columns

Index(['Customer ID', 'Age', 'Gender', 'Location', 'Product Category',
       'Purchase Amount ($)', 'Time Spent on Website (min)', 'Device Type',
       'Payment Method', 'Discount Availed', 'Number of Items Purchased',
       'Return Customer', 'Review Score (1-5)', 'Delivery Time (days)',
       'Subscription Status', 'Customer Satisfaction', 'Z-Score'],
      dtype='object')

In [50]:
# Calculate the correlation coefficient
correlation = df["Time Spent on Website (min)"].corr(df["Purchase Amount ($)"])
print(f"Correlation between time spent on website and purchase amount: {correlation}")


Correlation between time spent on website and purchase amount: 0.010024809958752734


In [196]:
fig = px.scatter(
    df, 
    x='Time Spent on Website (min)',  
    y='Purchase Amount ($)',  
    title='Correlation between Time Spent on Website and Purchase Amount',
    labels={
        'Time Spent on Website (min)': 'Time Spent on Website (min)', 
        'Purchase Amount ($)': 'Purchase Amount ($)'
    },
    size='Purchase Amount ($)', 
    color='Time Spent on Website (min)',  
    color_discrete_sequence=px.colors.qualitative.Set1
    
)

# Customizing axis labels and limits
fig.update_layout(
    xaxis_title='Time Spent on Website (min)',
    yaxis_title='Purchase Amount ($)',
    width=1700  # Set canvas width
   
)

# Show the plot
fig.show()


### Q3: What percentage of customers are satisfied (rating of 4 or 5) and are also return customers?

In [55]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [56]:
satisfied_customers=df[(df["Review Score (1-5)"] >= 4) & (df["Return Customer"] == True)]

In [57]:
num_satisfied_return_customers=len(satisfied_customers)

In [58]:
total_customers=len(df)

In [59]:
# Calculate the percentage of satisfied return customers
percentage_satisfied_return_customers = (num_satisfied_return_customers / total_customers) * 100

# Print the result
print(f"Percentage of satisfied return customers: {percentage_satisfied_return_customers:.2f}%")

Percentage of satisfied return customers: 20.08%


In [60]:
# Calculate satisfied return customers
satisfied_return_customers = df[(df["Review Score (1-5)"] >= 4) & (df["Return Customer"] == True)]
num_satisfied_return_customers = len(satisfied_return_customers)

# Total return customers
total_return_customers = len(df[df["Return Customer"] == True])

# Total customers
total_customers = len(df)

# Calculate the percentages
percentage_satisfied_return_customers = (num_satisfied_return_customers / total_return_customers) * 100
percentage_satisfied_customers = (num_satisfied_return_customers / total_customers) * 100

# Print the results
print(f"Percentage of satisfied return customers: {percentage_satisfied_return_customers:.2f}%")
print(f"Percentage of satisfied customers among all customers: {percentage_satisfied_customers:.2f}%")


Percentage of satisfied return customers: 40.19%
Percentage of satisfied customers among all customers: 20.08%


In [61]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data for the pie charts
satisfied_return_count = num_satisfied_return_customers
not_satisfied_return_count = total_return_customers - num_satisfied_return_customers
satisfied_total_count = num_satisfied_return_customers
not_satisfied_total_count = total_customers - num_satisfied_return_customers

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'pie'}, {'type':'pie'}]],
                    subplot_titles=('Return Customers', 'All Customers'))

# Pie chart for return customers
fig.add_trace(go.Pie(
    labels=['Satisfied Return Customers', 'Not Satisfied Return Customers'],
    values=[satisfied_return_count, not_satisfied_return_count],
    hole=.3,
    marker=dict(colors=['green', 'red']),
), row=1, col=1)

# Pie chart for all customers
fig.add_trace(go.Pie(
    labels=['Satisfied & Returning Customers', 'Other Customers'],
    values=[satisfied_total_count, not_satisfied_total_count],
    hole=.3,
    marker=dict(colors=['blue', 'orange']),
), row=1, col=2)

# Update layout
fig.update_layout(
    title_text='Customer Satisfaction Analysis',
    height=400,  # Adjust height if needed
)

# Show the figure
fig.show()


In [62]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data for the pie charts
satisfied_return_count = num_satisfied_return_customers
not_satisfied_return_count = total_return_customers - num_satisfied_return_customers
satisfied_total_count = num_satisfied_return_customers
not_satisfied_total_count = total_customers - num_satisfied_return_customers

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'pie'}, {'type':'pie'}]],
                    subplot_titles=('Return Customers', 'Total Customer Satisfaction'))

# Pie chart for return customers
fig.add_trace(go.Pie(
    labels=['Satisfied Return Customers', 'Unsatisfied Return Customers'],
    values=[satisfied_return_count, not_satisfied_return_count],
    hole=.3,
    marker=dict(colors=['green', 'red']),
    textinfo='percent+label'  # Show percentage and label
), row=1, col=1)

# Pie chart for all customers
fig.add_trace(go.Pie(
    labels=['Satisfied Customers', 'Unsatisfied or Non-returning Customers'],
    values=[satisfied_total_count, not_satisfied_total_count],
    hole=.3,
    marker=dict(colors=['blue', 'orange']),
    textinfo='percent+label'  # Show percentage and label
), row=1, col=2)

# Update layout
fig.update_layout(
    title_text='Customer Satisfaction Analysis',
    height=400,  # Adjust height if needed
)

# Show the figure
fig.show()


In [64]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Assuming 'df' is your DataFrame containing customer data
# Sample DataFrame creation (you should replace this with your actual DataFrame)
# df = pd.read_csv('your_data.csv')  # Uncomment this and load your actual data

# Replace these with your actual calculations
num_satisfied_return_customers = len(df[(df["Review Score (1-5)"] >= 4) & (df["Return Customer"] == True)])
total_return_customers = len(df[df["Return Customer"] == True])
total_customers = len(df)

# Data for the pie charts
satisfied_return_count = num_satisfied_return_customers
not_satisfied_return_count = total_return_customers - satisfied_return_count
satisfied_total_count = (df["Review Score (1-5)"] >= 4).sum()  # Total satisfied customers
not_satisfied_total_count = total_customers - satisfied_total_count

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'pie'}, {'type': 'pie'}]],
                    subplot_titles=('Return Customers', 'Total Customer Satisfaction'))

# Pie chart for return customers
fig.add_trace(go.Pie(
    labels=['Satisfied Return Customers', 'Unsatisfied Return Customers'],
    values=[satisfied_return_count, not_satisfied_return_count],
    hole=.3,
    marker=dict(colors=['green', 'red']),
    textinfo='label+percent',  # Show percentage and label
    textposition='outside',  # Position text outside the pie
    
), row=1, col=1)

# Pie chart for all customers
fig.add_trace(go.Pie(
    labels=['Satisfied Customers', 'Unsatisfied or Non-returning Customers'],
    values=[satisfied_total_count, not_satisfied_total_count],
    hole=.3,
    marker=dict(colors=['blue', 'orange']),
    textinfo='label+percent',  # Show percentage and label
    textposition='outside',  # Position text outside the pie
  
), row=1, col=2)

# Update layout
fig.update_layout(
    title_text='Customer Satisfaction Analysis',
    height=400,  # Adjust height if needed
)

# Show the figure
fig.show()


### Q4: What is the relationship between the number of items purchased and customer satisfaction?

In [66]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [197]:
import plotly.express as px

# Create a scatter plot for the relationship between items purchased and customer satisfaction
fig = px.scatter(
    df,
    x='Customer Satisfaction',  # Number of items purchased
    y='Number of Items Purchased',  # Customer satisfaction score Number of Items Purchased
    title='Correlation between Number of Items Purchased and Customer Satisfaction',
    labels={
        'Number of Items Purchased': 'Number of Items Purchased',
        'Review Score (1-5)': 'Customer Satisfaction (1-5)'
    },
    size='Number of Items Purchased',  # Optional: Size based on the number of items purchased
    color='Customer Satisfaction',  # Color based on customer satisfaction
    color_discrete_sequence=px.colors.qualitative.Set1,
    
)

# Customize axis labels and limits
fig.update_layout(
    xaxis_title='Customer Satisfaction Level',
    yaxis_title='Number of Items Purchased'
   
)

# Show the plot
fig.show()


### Q5: Which location has the 2nd highest average purchase amount?

In [70]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,0.531192
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,-0.063942
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,-1.259869


In [73]:
# Calculate average purchase amount by location
#average_purchase = df.groupby("Location")["Purchase Amount ($)"].mean().reset_index()
#average_purchase


Unnamed: 0,Location,Purchase Amount ($)
0,Barisal,513.67456
1,Chittagong,507.851675
2,Dhaka,502.002504
3,Khulna,513.937072
4,Mymensingh,507.892211
5,Rajshahi,495.544649
6,Rangpur,494.368867
7,Sylhet,494.976427


In [74]:
# DataFrame for Latitude and Longitude
#location_coordinates = {
    #'Location': ['Khulna', 'Barisal', 'Mymensingh', 'Chittagong', 'Dhaka', 'Rajshahi', 'Sylhet', 'Rangpur'],
    #'Latitude': [22.8101, 22.7011, 24.7471, 22.3569, 23.8103, 24.3745, 24.0634, 25.7520],
    #'Longitude': [89.5403, 90.3633, 90.4151, 91.7832, 90.4125, 88.6045, 91.8986, 88.6230]
#}

#location_df = pd.DataFrame(location_coordinates)
#location_df

Unnamed: 0,Location,Latitude,Longitude
0,Khulna,22.8101,89.5403
1,Barisal,22.7011,90.3633
2,Mymensingh,24.7471,90.4151
3,Chittagong,22.3569,91.7832
4,Dhaka,23.8103,90.4125
5,Rajshahi,24.3745,88.6045
6,Sylhet,24.0634,91.8986
7,Rangpur,25.752,88.623


In [75]:
# Merge average purchase with location coordinates
#merged_df = pd.merge(average_purchase, location_df, on="Location")

#merged_df

Unnamed: 0,Location,Purchase Amount ($),Latitude,Longitude
0,Barisal,513.67456,22.7011,90.3633
1,Chittagong,507.851675,22.3569,91.7832
2,Dhaka,502.002504,23.8103,90.4125
3,Khulna,513.937072,22.8101,89.5403
4,Mymensingh,507.892211,24.7471,90.4151
5,Rajshahi,495.544649,24.3745,88.6045
6,Rangpur,494.368867,25.752,88.623
7,Sylhet,494.976427,24.0634,91.8986


In [76]:
# Get the location with the second highest average purchase amount
#second_highest_location = merged_df.nlargest(2, "Purchase Amount ($)").iloc[-1]
#second_highest_location


Location                 Barisal
Purchase Amount ($)    513.67456
Latitude                 22.7011
Longitude                90.3633
Name: 0, dtype: object

In [179]:
import plotly.express as px

# Corrected coordinates for cities in Bangladesh
location_coordinates = {
    'Dhaka': {'lat': 23.8103, 'lon': 90.4125},
    'Chittagong': {'lat': 22.3569, 'lon': 91.7832},
    'Khulna': {'lat': 22.8456, 'lon': 89.5403},
    'Rajshahi': {'lat': 24.3745, 'lon': 88.6042},  
    'Sylhet': {'lat': 24.8978, 'lon': 91.8714}, 
    'Barisal': {'lat': 22.7010, 'lon': 90.3535},
    'Rangpur': {'lat': 25.7439, 'lon': 89.2752},
    'Mymensingh': {'lat': 24.7471, 'lon': 90.4203}
}

# Add the latitude and longitude to the DataFrame
df['Latitude'] = df['Location'].map(lambda x: location_coordinates[x]['lat'])
df['Longitude'] = df['Location'].map(lambda x: location_coordinates[x]['lon'])

# Calculate average purchase amount by location
average_purchase = df.groupby("Location")["Purchase Amount ($)"].mean().reset_index()

# Merge average purchase with location coordinates
merged_df = pd.merge(average_purchase, df[['Location', 'Latitude', 'Longitude']].drop_duplicates(), on="Location")

# Get the location with the second highest average purchase amount
second_highest_location = merged_df.nlargest(2, "Purchase Amount ($)").iloc[-1]

# Create a heatmap using the average purchase amount
fig = px.scatter_mapbox(
    merged_df, 
    lat='Latitude', 
    lon='Longitude', 
    size='Purchase Amount ($)',  # Use average purchase amount for bubble size
    color='Purchase Amount ($)',  # Use average purchase amount for color
    hover_name='Location',
    hover_data={'Latitude': False, 'Longitude': False, 'Purchase Amount ($)': True},
    title='Average Purchase Amount Heatmap by Location in Bangladesh',
    zoom=6,  # Zoom level focused on Bangladesh
    center={'lat': 23.685, 'lon': 90.3563},  # Center of Bangladesh
    height=600
)



# Use open-street-map for the background map style
fig.update_layout(mapbox_style="open-street-map")

# Show the map
fig.show()


In [178]:
import plotly.graph_objects as go
import plotly.express as px

# Corrected coordinates for cities in Bangladesh
location_coordinates = {
    'Dhaka': {'lat': 23.8103, 'lon': 90.4125},
    'Chittagong': {'lat': 22.3569, 'lon': 91.7832},
    'Khulna': {'lat': 22.8456, 'lon': 89.5403},
    'Rajshahi': {'lat': 24.3745, 'lon': 88.6042},  # Adjusted Rajshahi's longitude
    'Sylhet': {'lat': 24.8978, 'lon': 91.8714},  # Correct Sylhet coordinates
    'Barisal': {'lat': 22.7010, 'lon': 90.3535},
    'Rangpur': {'lat': 25.7439, 'lon': 89.2752},
    'Mymensingh': {'lat': 24.7471, 'lon': 90.4203}
}

# Add the latitude and longitude to the DataFrame
df['Latitude'] = df['Location'].map(lambda x: location_coordinates[x]['lat'])
df['Longitude'] = df['Location'].map(lambda x: location_coordinates[x]['lon'])

# Calculate average purchase amount by location
average_purchase = df.groupby("Location")["Purchase Amount ($)"].mean().reset_index()

# Merge average purchase with location coordinates
merged_df = pd.merge(average_purchase, df[['Location', 'Latitude', 'Longitude']].drop_duplicates(), on="Location")

# Get the location with the second highest average purchase amount
second_highest_location = merged_df.nlargest(2, "Purchase Amount ($)").iloc[-1]

# Create a scatter mapbox using the average purchase amount
fig = px.scatter_mapbox(
    merged_df, 
    lat='Latitude', 
    lon='Longitude', 
    size='Purchase Amount ($)',  # Use average purchase amount for bubble size
    color='Purchase Amount ($)',  # Use average purchase amount for color
    hover_name='Location',
    hover_data={'Latitude': False, 'Longitude': False, 'Purchase Amount ($)': True},
    title='Average Purchase Amount Heatmap by Location in Bangladesh',
    zoom=6,  # Zoom level focused on Bangladesh
    center={'lat': 23.685, 'lon': 90.3563},  # Center of Bangladesh
    height=500
)

# Add a marker and annotation for the second-highest location using graph_objects
fig.add_trace(go.Scattermapbox(
    lat=[second_highest_location['Latitude']],
    lon=[second_highest_location['Longitude']],
    mode='markers+text',
    marker=go.scattermapbox.Marker(size=14, color='red'),
    text=f"2nd Highest: {second_highest_location['Location']}",
    textposition="bottom right",
    showlegend=False
    
))

# Use open-street-map for the background map style
fig.update_layout(mapbox_style="open-street-map")

# Show the map
fig.show()


## **Level 3: Critical Thinking Insights**

### Q1: What factors contribute most to a customer being classified as a return customer?


In [80]:
df[df["Return Customer"]==True].head(2)

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Z-Score,Latitude,Longitude
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,-1.052804,23.8103,90.4125
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,1.606207,22.8456,89.5403


In [82]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# new DataFrame for visualization  
visualization_df = df.copy()

# Bin Age
visualization_df['Age Category'] = pd.cut(visualization_df['Age'], bins=[0, 20, 30, 40, 50, 60, 100], labels=['0-20', '21-30', '31-40', '41-50', '51-60', '60+'])

# Bin Time Spent on Website
visualization_df['Time Spent Category'] = pd.cut(visualization_df['Time Spent on Website (min)'], bins=[0, 5, 10, 20, 30, 60, float('inf')], labels=['0-5', '6-10', '11-20', '21-30', '31-60', '60+'])

# Bin the 'Purchase Amount ($)' into categories for better visualization
bins = [0, 50, 100, 200, 500, 1000, 2000, 5000, float('inf')]
labels = ['0-50', '50-100', '100-200', '200-500', '500-1000', '1000-2000', '2000-5000', '5000+']
visualization_df['Purchase Amount Category'] = pd.cut(visualization_df['Purchase Amount ($)'], bins=bins, labels=labels, right=False)

# List of columns excluding the new categories
non_categorical_columns = ['Gender', 'Location', 'Product Category', 'Device Type', 
                           'Payment Method', 'Discount Availed', 'Number of Items Purchased', 
                           'Review Score (1-5)', 'Delivery Time (days)', 'Subscription Status', 
                           'Customer Satisfaction']

# Create a Plotly figure with subplots for each factor (3 insights per row)
num_columns = 3
num_rows = (len(non_categorical_columns) + 3) // num_columns + 1  # +1 for additional factors

# Create the subplots
fig = make_subplots(rows=num_rows, cols=num_columns, 
                    subplot_titles=['Age Category', 'Time Spent Category', 'Purchase Amount Category'] + non_categorical_columns,
                    vertical_spacing=0.1)

# Age Category
age_data = visualization_df.groupby('Age Category')['Return Customer'].mean().reset_index()
fig.add_trace(go.Bar(x=age_data['Age Category'], y=age_data['Return Customer'], name='Age'), row=1, col=1)

# Time Spent Category
time_data = visualization_df.groupby('Time Spent Category')['Return Customer'].mean().reset_index()
fig.add_trace(go.Bar(x=time_data['Time Spent Category'], y=time_data['Return Customer'], name='Time Spent'), row=1, col=2)

# Purchase Amount Category
purchase_data = visualization_df.groupby('Purchase Amount Category')['Return Customer'].mean().reset_index()
fig.add_trace(go.Bar(x=purchase_data['Purchase Amount Category'], y=purchase_data['Return Customer'], name='Purchase Amount'), row=1, col=3)

# Other Factors
for i, column in enumerate(non_categorical_columns):
    row = (i // num_columns) + 2  # Start from the second row
    col = (i % num_columns) + 1
    grouped_data = visualization_df.groupby(column)['Return Customer'].mean().reset_index()
    fig.add_trace(
        go.Bar(x=grouped_data[column], y=grouped_data['Return Customer'], name=column),
        row=row, col=col
    )
    fig.update_xaxes(title_text=column, row=row, col=col)
    fig.update_yaxes(title_text='Return Rate', row=row, col=col)

fig.update_layout(height=1800, width=1200, title_text="Return Rate by Various Factors", showlegend=False)

# Display the plot
fig.show()




In [182]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# New DataFrame for visualization  
visualization_df = df.copy()

# Bin Age
visualization_df['Age Group'] = pd.cut(visualization_df['Age'], bins=[0, 20, 30, 40, 50, 60, 100], labels=['0-20', '21-30', '31-40', '41-50', '51-60', '60+'])

# Bin Time Spent on Website
visualization_df['Website Engagement Time Group'] = pd.cut(visualization_df['Time Spent on Website (min)'], bins=[0, 5, 10, 20, 30, 60, float('inf')], labels=['0-5', '6-10', '11-20', '21-30', '31-60', '60+'])

# Bin the 'Purchase Amount ($)' into categories for better visualization
bins = [0, 50, 100, 200, 500, 1000, 2000, 5000, float('inf')]
labels = ['0-50', '50-100', '100-200', '200-500', '500-1000', '1000-2000', '2000-5000', '5000+']
visualization_df['Purchase Amount Group'] = pd.cut(visualization_df['Purchase Amount ($)'], bins=bins, labels=labels, right=False)

# List of columns excluding the new categories
factors_columns = ['Gender', 'Location', 'Product Category', 'Device Type', 
                   'Payment Method', 'Discount Availed', 'Number of Items Purchased', 
                   'Review Score (1-5)', 'Delivery Time (days)', 'Subscription Status', 
                   'Customer Satisfaction']

# Create a Plotly figure with subplots for each factor (3 insights per row)
num_columns = 3
num_rows = (len(factors_columns) + 3) // num_columns + 1  # +1 for additional factors

# Create the subplots
fig = make_subplots(rows=num_rows, cols=num_columns, 
                    subplot_titles=['Age Group', 'Website Engagement Time Group', 'Purchase Amount Group'] + factors_columns,
                    vertical_spacing=0.1)

# Age Group
age_data = visualization_df.groupby('Age Group')['Return Customer'].mean().reset_index()
fig.add_trace(go.Bar(x=age_data['Age Group'], y=age_data['Return Customer'], name='Age Group'), row=1, col=1)

# Website Engagement Time Group
time_data = visualization_df.groupby('Website Engagement Time Group')['Return Customer'].mean().reset_index()
fig.add_trace(go.Bar(x=time_data['Website Engagement Time Group'], y=time_data['Return Customer'], name='Website Engagement'), row=1, col=2)

# Purchase Amount Group
purchase_data = visualization_df.groupby('Purchase Amount Group')['Return Customer'].mean().reset_index()
fig.add_trace(go.Bar(x=purchase_data['Purchase Amount Group'], y=purchase_data['Return Customer'], name='Purchase Amount'), row=1, col=3)

# Other Factors
for i, column in enumerate(factors_columns):
    row = (i // num_columns) + 2  # Start from the second row
    col = (i % num_columns) + 1
    grouped_data = visualization_df.groupby(column)['Return Customer'].mean().reset_index()
    fig.add_trace(
        go.Bar(x=grouped_data[column], y=grouped_data['Return Customer'], name=column),
        row=row, col=col
    )
    fig.update_xaxes(title_text=column, row=row, col=col)
    fig.update_yaxes(title_text='Return Rate', row=row, col=col)

fig.update_layout(height=1800, width=1200, title_text="Return Rate by Various Factors", showlegend=False)

# Display the plot
fig.show()


### Q2: How do payment methods influence customer satisfaction and return rates?

In [83]:
# Calculate the percentage of each satisfaction level for each payment method
satisfaction_distribution = df.groupby(['Payment Method', 'Customer Satisfaction']).size().unstack(fill_value=0)
satisfaction_distribution = satisfaction_distribution.div(satisfaction_distribution.sum(axis=1), axis=0)

# Calculate return rates
return_rates = df.groupby('Payment Method')['Return Customer'].mean()

# Create subplots
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add stacked bar chart for satisfaction levels
for satisfaction_level in ['Low', 'Medium', 'High']:
    fig.add_trace(
        go.Bar(x=satisfaction_distribution.index, 
               y=satisfaction_distribution[satisfaction_level], 
               name=f'Satisfaction: {satisfaction_level}',
               text=[f'{val:.1%}' for val in satisfaction_distribution[satisfaction_level]],
               textposition='inside'),
        secondary_y=False
    )

# Add line chart for return rates
fig.add_trace(
    go.Scatter(x=return_rates.index, y=return_rates.values, 
               name='Return Rate', mode='lines+markers',
               line=dict(color='red', width=2),
               marker=dict(size=8),
               text=[f'{val:.2%}' for val in return_rates.values],
               textposition='top center'),
    secondary_y=True
)

# Update layout
fig.update_layout(
    title='Customer Satisfaction Distribution and Return Rates by Payment Method',
    barmode='stack',
    xaxis_title='Payment Method',
    yaxis_title='Proportion of Customers',
    yaxis2_title='Return Rate',
    legend_title='Metrics'
)

fig.update_yaxes(tickformat='.0%', secondary_y=False)
fig.update_yaxes(tickformat='.0%', secondary_y=True)

fig.show()

# Print the data for reference
print(satisfaction_distribution)
print("\
Return Rates:")
print(return_rates)

Customer Satisfaction      High       Low    Medium
Payment Method                                     
Bank Transfer          0.339623  0.328012  0.332366
Cash on Delivery       0.312407  0.350274  0.337319
Credit Card            0.346647  0.343688  0.309665
Debit Card             0.336863  0.334846  0.328290
PayPal                 0.315405  0.325326  0.359269
Return Rates:
Payment Method
Bank Transfer       0.506047
Cash on Delivery    0.489287
Credit Card         0.512327
Debit Card          0.491175
PayPal              0.498695
Name: Return Customer, dtype: float64


### Q3: How does the location influence both purchase amount and delivery time?


In [302]:
import pandas as pd
import plotly.express as px


# Group by location and calculate metrics
location_analysis = df.groupby('Location').agg({
    'Purchase Amount ($)': 'mean',
    'Delivery Time (days)': 'mean'
}).reset_index()

# Rename columns for clarity
location_analysis.columns = ['Location', 'Avg Purchase Amount ($)', 'Avg Delivery Time (days)']

# Create a bubble chart with size based on Average Purchase Amount
fig = px.scatter(location_analysis, 
                 x='Avg Purchase Amount ($)', 
                 y='Avg Delivery Time (days)',
                 size='Avg Delivery Time (days)',  # Size based on average purchase amount
                 color='Location',
                 hover_name='Location',
                 text='Location',
                 size_max=30,
                 title='Location Impact on Purchase Amount and Delivery Time')

# Update layout for better readability
fig.update_traces(textposition='top center')
fig.update_layout(
    height=600, 
    width=900,
    xaxis_title='Average Purchase Amount ($)',
    yaxis_title='Average Delivery Time (days)',
    legend_title='Location',
    title_x=0.5,  # Center the title
    title_font=dict(size=18),  # Increase title font size
    xaxis=dict(showgrid=True, gridcolor='LightGray'),  # Add grid lines
    yaxis=dict(showgrid=True, gridcolor='LightGray')
)

# Show the plot
fig.show()

# Display the data
print(location_analysis)


     Location  Avg Purchase Amount ($)  Avg Delivery Time (days)
0     Barisal               513.674560                  7.072800
1  Chittagong               507.851675                  7.020440
2       Dhaka               502.002504                  6.941957
3      Khulna               513.937072                  6.811774
4  Mymensingh               507.892211                  7.099219
5    Rajshahi               495.544649                  7.113130
6     Rangpur               494.368867                  7.031125
7      Sylhet               494.976427                  7.018593


In [186]:
# Group by location and calculate metrics
location_analysis = df.groupby('Location').agg({
    'Purchase Amount ($)': 'mean',
    'Delivery Time (days)': 'mean'
}).reset_index()

# Rename columns for clarity
location_analysis.columns = ['Location', 'Avg Purchase Amount ($)', 'Avg Delivery Time (days)']

# Create a bubble chart with size based on Average Delivery Time
fig = px.scatter(location_analysis, 
                 x='Avg Purchase Amount ($)', 
                 y='Avg Delivery Time (days)',
                 size='Avg Delivery Time (days)',  # Size based on average delivery time
                 color='Location',
                 hover_name='Location',
                 text='Location',
                 size_max=30,
                 title='Location Impact on Purchase Amount and Delivery Time',
                 color_continuous_scale=px.colors.qualitative.Set2)  # Choose a color scale

# Update traces for better visibility
fig.update_traces(textposition='top center', 
                  marker=dict(line=dict(width=1, color='DarkSlateGrey')))  # Add border around bubbles

# Update layout for better readability
fig.update_layout(
    height=600, 
    width=900,
    xaxis_title='Average Purchase Amount ($)',
    yaxis_title='Average Delivery Time (days)',
    legend_title='Location',
    title_x=0.5,  # Center the title
    title_font=dict(size=18),  # Increase title font size
    xaxis=dict(showgrid=True, gridcolor='LightGray'),  # Add grid lines
    yaxis=dict(showgrid=True, gridcolor='LightGray')
)

# Show the plot
fig.show()

# Display the data
print(location_analysis)


     Location  Avg Purchase Amount ($)  Avg Delivery Time (days)
0     Barisal               513.674560                  7.072800
1  Chittagong               507.851675                  7.020440
2       Dhaka               502.002504                  6.941957
3      Khulna               513.937072                  6.811774
4  Mymensingh               507.892211                  7.099219
5    Rajshahi               495.544649                  7.113130
6     Rangpur               494.368867                  7.031125
7      Sylhet               494.976427                  7.018593


## **Own Findings!**

In [306]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Location,Product Category,Purchase Amount ($),Time Spent on Website (min),Device Type,Payment Method,Discount Availed,Number of Items Purchased,Return Customer,Review Score (1-5),Delivery Time (days),Subscription Status,Customer Satisfaction,Latitude,Longitude
0,1,20,Other,Dhaka,Toys,202.54,44,Mobile,Cash on Delivery,True,4,True,5,4,Free,Low,23.8103,90.4125
1,2,39,Male,Rangpur,Sports,655.94,27,Desktop,Bank Transfer,True,1,False,1,7,Free,Medium,25.7439,89.2752
2,3,64,Male,Khulna,Home,963.65,9,Tablet,Bank Transfer,False,8,True,4,9,Premium,Medium,22.8456,89.5403
3,4,65,Other,Rajshahi,Beauty,485.59,39,Desktop,Bank Transfer,True,1,True,5,10,Trial,Medium,24.3745,88.6042
4,5,67,Male,Rangpur,Home,143.27,17,Tablet,Debit Card,False,8,True,4,1,Premium,High,25.7439,89.2752


### Q1.What are the spending habits of different age groups, and how does this influence their return rates?

In [203]:
# Create age groups
df['Age_Group'] = pd.cut(df['Age'], bins=[0, 20, 30, 40, 50, 60, 100], labels=['0-20', '21-30', '31-40', '41-50', '51-60', '60+'])

# Calculate average spending and return rate for each age group
age_group_stats = df.groupby('Age_Group').agg({
    'Purchase Amount ($)': 'mean',
    'Return Customer': 'mean'
}).reset_index()

age_group_stats['Return Rate'] = age_group_stats['Return Customer'] * 100

print("Age Group Statistics:")
print(age_group_stats)

Age Group Statistics:
  Age_Group  Purchase Amount ($)  Return Customer  Return Rate
0      0-20           490.023462         0.504394    50.439367
1     21-30           511.457523         0.506153    50.615302
2     31-40           503.822736         0.505313    50.531350
3     41-50           502.723146         0.488690    48.869016
4     51-60           502.094620         0.503945    50.394477
5       60+           503.751285         0.491719    49.171902


In [309]:
# Create the figure
fig = go.Figure()

# Add bar chart for average purchase amount
fig.add_trace(go.Bar(
    x=age_group_stats['Age_Group'],
    y=age_group_stats['Purchase Amount ($)'],
    name='Avg. Purchase Amount',
    yaxis='y',
    offsetgroup=1
))

# Add line chart for return rate
fig.add_trace(go.Scatter(
    x=age_group_stats['Age_Group'],
    y=age_group_stats['Return Rate'],
    name='Return Rate',
    yaxis='y2',
    mode='lines+markers'
))

# Update the layout
fig.update_layout(
    title='Spending Habits and Return Rates by Age Group',
    xaxis=dict(title='Age Group'),
    yaxis=dict(title='Average Purchase Amount ($)', side='left', showgrid=False),
    yaxis2=dict(title='Return Rate (%)', side='right', overlaying='y', showgrid=False),
    legend=dict(x=1.1, y=1, bgcolor='rgba(255, 255, 255, 0.5)'),
    barmode='group',
    height=600,
    width=1000
)

# Show the plot
fig.show()

# Save the plot as an HTML file
#fig.write_html("combined_spending_habits_return_rates.html")
#print("Combined visualization saved as combined_spending_habits_return_rates.html")

### Q2.How do product preferences vary across different locations?

In [310]:
import pandas as pd
import plotly.express as px



# Get unique locations and product categories
locations = df['Location'].unique()
product_categories = df['Product Category'].unique()

print("Unique Locations:", locations)
print("\
Unique Product Categories:", product_categories)

# Create a cross-tabulation of Location and Product Category
location_product_counts = pd.crosstab(df['Location'], df['Product Category'])

# Calculate the percentage of each product category within each location
location_product_percentages = location_product_counts.div(location_product_counts.sum(axis=1), axis=0) * 100

# Create a heatmap
fig = px.imshow(location_product_percentages,
                labels=dict(x="Product Category", y="Location", color="Percentage"),
                x=location_product_percentages.columns,
                y=location_product_percentages.index,
                color_continuous_scale="Viridis")

fig.update_layout(title="Product Category Preferences by Location",
                  xaxis_title="Product Category",
                  yaxis_title="Location")

# Show the plot
fig.show()

# Save the plot as an HTML file
#fig.write_html("product_preferences_by_location.html")
#print("Visualization saved as product_preferences_by_location.html")

# Display the percentage table
#print("\
#Product Category Percentages by Location:")
#print(location_product_percentages.round(2))

Unique Locations: ['Dhaka' 'Rangpur' 'Khulna' 'Rajshahi' 'Sylhet' 'Mymensingh' 'Barisal'
 'Chittagong']
Unique Product Categories: ['Toys' 'Sports' 'Home' 'Beauty' 'Books' 'Groceries' 'Electronics'
 'Clothing']


In [316]:
# Get unique locations and product categories
locations = df['Location'].unique()
product_categories = df['Product Category'].unique()

# Create a cross-tabulation of Location and Product Category
location_product_counts = pd.crosstab(df['Location'], df['Product Category'])

# Calculate the percentage of each product category within each location
location_product_percentages = location_product_counts.div(location_product_counts.sum(axis=1), axis=0) * 100

# Create a heatmap with annotations
fig = px.imshow(location_product_percentages,
                labels=dict(x="Product Category", y="Location", color="Percentage"),
                x=location_product_percentages.columns,
                y=location_product_percentages.index,
                color_continuous_scale="Viridis",
                text_auto=True)  # Add this line to show annotations

fig.update_layout(title="Product Category Preferences by Location",
                  xaxis_title="Product Category",
                  yaxis_title="Location",
                  height=1000, width=1000)



# Show the plot
fig.show()


Unique Locations: ['Dhaka' 'Rangpur' 'Khulna' 'Rajshahi' 'Sylhet' 'Mymensingh' 'Barisal'
 'Chittagong']
Unique Product Categories: ['Toys' 'Sports' 'Home' 'Beauty' 'Books' 'Groceries' 'Electronics'
 'Clothing']


In [324]:
df.columns

Index(['Customer ID', 'Age', 'Gender', 'Location', 'Product Category',
       'Purchase Amount ($)', 'Time Spent on Website (min)', 'Device Type',
       'Payment Method', 'Discount Availed', 'Number of Items Purchased',
       'Return Customer', 'Review Score (1-5)', 'Delivery Time (days)',
       'Subscription Status', 'Customer Satisfaction'],
      dtype='object')

### Q3. How do purchasing patterns differ between genders in terms of product categories and average spend?

In [204]:
# Group the data by Product Category and Gender, then calculate the average purchase amount
product_gender_stats = df.groupby(['Product Category', 'Gender'])['Purchase Amount ($)'].mean().unstack()

# Sort the product categories by the total average purchase amount across genders
sorted_product_gender_stats = product_gender_stats.loc[
    product_gender_stats.mean(axis=1).sort_values(ascending=False).index
]

# Create a bar plot
fig = px.bar(sorted_product_gender_stats, 
             title='Average Spending by Product Category and Gender',
             labels={'value': 'Average Purchase Amount ($)', 'Product Category': 'Product Category'},
             barmode='group'
             )  

# Update layout for better readability
fig.update_layout(
    xaxis_title='Product Category',
    yaxis_title='Average Purchase Amount ($)',
    xaxis_tickangle=-45,
    legend_title='Gender',
    template='plotly_white',  # Change the template for a cleaner look
    height=600  # Increase height for better visibility
)

# Show the plot
fig.show()




In [328]:
# Sort each product category by the highest average spending for any gender
sorted_product_gender_stats = product_gender_stats.apply(lambda x: x.sort_values(ascending=False), axis=1)

# Create a bar plot with the updated sorting
fig = px.bar(sorted_product_gender_stats, 
             title='Average Spending by Product Category and Gender (Individually Sorted)',
             labels={'value': 'Average Purchase Amount ($)', 'Product Category': 'Product Category'},
             barmode='group')

# Update layout for better readability
fig.update_layout(xaxis_tickangle=-45)

# Show the plot
fig.show()



Individually Sorted Average Spending by Product Category and Gender:
Gender            Female    Male   Other
Product Category                        
Beauty            504.90  488.00  479.77
Books             511.54  530.11  528.04
Clothing          490.49  498.16  507.21
Electronics       490.06  485.27  493.22
Groceries         499.70  504.00  516.46
Home              519.68  512.94  502.96
Sports            491.64  496.98  491.81
Toys              516.52  522.96  510.30
