# Amazon Consumer Behaviour EDA

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#### Loading the Dataset

In [4]:
df = pd.read_csv('Amazon_Survey.csv')

In [5]:
df.head(5)

Unnamed: 0,Timestamp,age,Gender,Purchase_Frequency,Purchase_Categories,Personalized_Recommendation_Frequency,Browsing_Frequency,Product_Search_Method,Search_Result_Exploration,Customer_Reviews_Importance,...,Saveforlater_Frequency,Review_Left,Review_Reliability,Review_Helpfulness,Personalized_Recommendation_Frequency.1,Recommendation_Helpfulness,Rating_Accuracy,Shopping_Satisfaction,Service_Appreciation,Improvement_Areas
0,2023/06/04 1:28:19 PM GMT+5:30,23,Female,Few times a month,Beauty and Personal Care,Yes,Few times a week,Keyword,Multiple pages,1,...,Sometimes,Yes,Occasionally,Yes,2,Yes,1,1,Competitive prices,Reducing packaging waste
1,2023/06/04 2:30:44 PM GMT+5:30,23,Female,Once a month,Clothing and Fashion,Yes,Few times a month,Keyword,Multiple pages,1,...,Rarely,No,Heavily,Yes,2,Sometimes,3,2,Wide product selection,Reducing packaging waste
2,2023/06/04 5:04:56 PM GMT+5:30,24,Prefer not to say,Few times a month,Groceries and Gourmet Food;Clothing and Fashion,No,Few times a month,Keyword,Multiple pages,2,...,Rarely,No,Occasionally,No,4,No,3,3,Competitive prices,Product quality and accuracy
3,2023/06/04 5:13:00 PM GMT+5:30,24,Female,Once a month,Beauty and Personal Care;Clothing and Fashion;...,Sometimes,Few times a month,Keyword,First page,5,...,Sometimes,Yes,Heavily,Yes,3,Sometimes,3,4,Competitive prices,Product quality and accuracy
4,2023/06/04 5:28:06 PM GMT+5:30,22,Female,Less than once a month,Beauty and Personal Care;Clothing and Fashion,Yes,Few times a month,Filter,Multiple pages,1,...,Rarely,No,Heavily,Yes,4,Yes,2,2,Competitive prices,Product quality and accuracy


#### Displaying the total number of rows and columns in the dataset

In [6]:
rows, columns = df.shape
print(f'This dataset comatins {rows} rows and {columns} columns')

This dataset comatins 602 rows and 23 columns


### Displaying the column names

In [7]:
df.columns

Index(['Timestamp', 'age', 'Gender', 'Purchase_Frequency',
       'Purchase_Categories', 'Personalized_Recommendation_Frequency',
       'Browsing_Frequency', 'Product_Search_Method',
       'Search_Result_Exploration', 'Customer_Reviews_Importance',
       'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
       'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left',
       'Review_Reliability', 'Review_Helpfulness',
       'Personalized_Recommendation_Frequency ', 'Recommendation_Helpfulness',
       'Rating_Accuracy ', 'Shopping_Satisfaction', 'Service_Appreciation',
       'Improvement_Areas'],
      dtype='str')

#### Based on the publicly available Amazon metadata, the meaning of each column in the dataset is as follows:

- age: age
- gender: gender
- Purchase_Frequency: How frequently do you make purchases on Amazon?
- Purchase_Categories: What product categories do you typically purchase on Amazon?
- Personalized_Recommendation_Frequency: Have you ever made a purchase based on personalized product recommendations from Amazon?
- Browsing_Frequency: How often do you browse Amazon's website or app?
- Product_Search_Method: How do you search for products on Amazon?
- Search_Result_Exploration: Do you tend to explore multiple pages of search results or focus on the first page?
- Customer_Reviews_Importance: How important are customer reviews in your decision-making process?
- Add_to_Cart_Browsing: Do you add products to your cart while browsing on Amazon?
- Cart_Completion_Frequency: How often do you complete the purchase after adding products to your cart?
- Cart_Abandonment_Factors: What factors influence your decision to abandon a purchase in your cart?
- Saveforlater_Frequency: Do you use Amazon's "Save for Later" feature, and if so, how often?
- Review_Left: Have you ever left a product review on Amazon?
- Review_Reliability: How much do you rely on product reviews when making a purchase?
- Review_Helpfulness: Do you find helpful information from other customers' reviews?
- Personalized_Recommendation_Frequency: How often do you receive personalized product recommendations from Amazon?
- Recommendation_Helpfulness: Do you find the recommendations helpful?
- Rating_Accuracy: How would you rate the relevance and accuracy of the recommendations you receive
- Shopping_Satisfaction: How satisfied are you with your overall shopping experience on Amazon?
- Service_Appreciation: What aspects of Amazon's services do you appreciate the most?
- Improvement_Areas: Are there any areas where you think Amazon can improve?



In [8]:
df.describe()

Unnamed: 0,age,Customer_Reviews_Importance,Personalized_Recommendation_Frequency,Rating_Accuracy,Shopping_Satisfaction
count,602.0,602.0,602.0,602.0,602.0
mean,30.790698,2.480066,2.699336,2.672757,2.463455
std,10.193276,1.185226,1.042028,0.899744,1.012152
min,3.0,1.0,1.0,1.0,1.0
25%,23.0,1.0,2.0,2.0,2.0
50%,26.0,3.0,3.0,3.0,2.0
75%,36.0,3.0,3.0,3.0,3.0
max,67.0,5.0,5.0,5.0,5.0


In [9]:
df.dtypes

Timestamp                                   str
age                                       int64
Gender                                      str
Purchase_Frequency                          str
Purchase_Categories                         str
Personalized_Recommendation_Frequency       str
Browsing_Frequency                          str
Product_Search_Method                       str
Search_Result_Exploration                   str
Customer_Reviews_Importance               int64
Add_to_Cart_Browsing                        str
Cart_Completion_Frequency                   str
Cart_Abandonment_Factors                    str
Saveforlater_Frequency                      str
Review_Left                                 str
Review_Reliability                          str
Review_Helpfulness                          str
Personalized_Recommendation_Frequency     int64
Recommendation_Helpfulness                  str
Rating_Accuracy                           int64
Shopping_Satisfaction                   

#### Obersvations on Data Types Clearly Some columns in the dataset are **not in the proper data type** for analysis.

- Timestamp is stored as str but should be converted to datetime.
- Personalized_Recommendation_Frequency appears twice with different data types (str and int64). But as per the data description, these two columns serve different purposes. So we will have to rename them to differentiate it accordingly.
- The columns - Customer_Reviews_Importance, Personalized_Recommendation_Frequency, Rating_Accuracy & Shopping_Satisfaction, even if they are present in int64, but these are essentially categorical varaibles.  

These columns will need **cleaning and type conversion** before the analysis.

#### Checking all the unique values of each of the column

In [10]:
for col in df.columns:
    unique_val = df[col].unique()
    print(f"Column Name : {col}")
    print(f'Number of unique values : {len(unique_val)}')
    print(f'Unique Values : {unique_val}')
    print('\n')


Column Name : Timestamp
Number of unique values : 601
Unique Values : <StringArray>
['2023/06/04 1:28:19 PM GMT+5:30', '2023/06/04 2:30:44 PM GMT+5:30',
 '2023/06/04 5:04:56 PM GMT+5:30', '2023/06/04 5:13:00 PM GMT+5:30',
 '2023/06/04 5:28:06 PM GMT+5:30', '2023/06/04 6:01:59 PM GMT+5:30',
 '2023/06/04 6:31:41 PM GMT+5:30', '2023/06/04 7:13:12 PM GMT+5:30',
 '2023/06/04 7:23:21 PM GMT+5:30', '2023/06/04 7:33:12 PM GMT+5:30',
 ...
 '2023/06/12 3:56:57 PM GMT+5:30', '2023/06/12 3:57:52 PM GMT+5:30',
 '2023/06/12 3:59:10 PM GMT+5:30', '2023/06/12 3:59:59 PM GMT+5:30',
 '2023/06/12 4:00:56 PM GMT+5:30', '2023/06/12 4:02:02 PM GMT+5:30',
 '2023/06/12 4:02:53 PM GMT+5:30', '2023/06/12 4:03:59 PM GMT+5:30',
 '2023/06/12 9:57:20 PM GMT+5:30', '2023/06/16 9:16:05 AM GMT+5:30']
Length: 601, dtype: str


Column Name : age
Number of unique values : 50
Unique Values : [23 24 22 21 20 25 16 64 29 19 26 32 30 40 36 31 47 54 58 53 28 55 62 27
 34 44 38 35 42 37 45 50 63 46 33 60 18 17 57 41 39 48 49 1

#### Summary from the observations

- Columns with correct types:
age, Gender, Purchase_Categories, Product_Search_Method, Search_Result_Exploration, Customer_Reviews_Importance, Add_to_Cart_Browsing, Cart_Abandonment_Factors, Review_Left, Service_Appreciation, Improvement_Areas, Rating_Accuracy, Shopping_Satisfaction are mostly fine.

- Columns needing conversions/cleaning:
Timestamp (stored as str instead of datetime) needs to be converted for time-based analysis.
Personalized_Recommendation_Frequency appears twice with different data types (str and int64) and must be resolved (rename or remove duplication).

- Columns with potential considerations (ordinal categorical variables):
Purchase_Frequency, Browsing_Frequency, Cart_Completion_Frequency, Saveforlater_Frequency, Review_Reliability, Review_Helpfulness, Recommendation_Helpfulness - these are categorical but represent frequency/degree (Never → Always, Occasionally → Heavily). Consider marking them as ordered categorical for meaningful analysis.

- Columns with potential anomalies / considerations:
    - Purchase_Categories: multiple categories per entry separated by ;, may require splitting or encoding for analysis.
    - Service_Appreciation: entries like . or duplicates (Customer service vs Customer service) may need cleaning.
    - Improvement_Areas: contains minor variations, punctuation differences, or long strings; may require grouping or text cleaning.
    - In the Improvement_Areas column there is a row which contains '.' which is certainly an error and can be replace by Mode


#### Checking for the missing values per column (uncluding empty strings)

In [11]:
nan_count = df.isnull().sum()

empty_count = (df == '').sum()

total_missing = nan_count + empty_count

missing_percent = (total_missing/len(df))*100

missing_df = pd.DataFrame({
    'Missing Values': total_missing,
    'Percentage (%)' : missing_percent
}).sort_values(by='Percentage (%)', ascending=True)

print(missing_df)

                                        Missing Values  Percentage (%)
Timestamp                                            0        0.000000
age                                                  0        0.000000
Gender                                               0        0.000000
Purchase_Frequency                                   0        0.000000
Purchase_Categories                                  0        0.000000
Personalized_Recommendation_Frequency                0        0.000000
Browsing_Frequency                                   0        0.000000
Search_Result_Exploration                            0        0.000000
Cart_Abandonment_Factors                             0        0.000000
Customer_Reviews_Importance                          0        0.000000
Add_to_Cart_Browsing                                 0        0.000000
Cart_Completion_Frequency                            0        0.000000
Review_Left                                          0        0.000000
Savefo

## Data Cleaning

- Since Product_Search_Method is a categorical varaibles, we'll replace the 2 missing values using - Mode imputation
- Timestamp Column will be converted to the respective data type
- For Personalized_Recommendation_Frequency. There two features with the same name. But as per the data description, these two columns server different purposes. So we will have to rename them to differentiate it accordingly

##### Firstly, lets convert the data type for 'time stamp'

In [12]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce', utc=True)

  df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce', utc=True)


In [13]:
df['Timestamp'].dtype

datetime64[us, UTC]

#### Mode Imputation for Product_Search_Method and Improvement_Areas

In [14]:
df['Product_Search_Method'] = df['Product_Search_Method'].fillna(df['Product_Search_Method'].mode()[0])

cols = ['Improvement_Areas', 'Service_Appreciation' ]

junk_values = [
    '.', 'Nil', 'Nothing',
    "I don't have any problem with Amazon",
    'No problems with Amazon'
]

for col in cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()
        .replace(junk_values, pd.NA)
    )

df['Service_Appreciation'] = df['Service_Appreciation'].fillna(
    df['Service_Appreciation'].mode()[0]
)

df['Improvement_Areas'] = df['Improvement_Areas'].fillna(
    df['Improvement_Areas'].mode()[0]
)

In [15]:
df.columns

Index(['Timestamp', 'age', 'Gender', 'Purchase_Frequency',
       'Purchase_Categories', 'Personalized_Recommendation_Frequency',
       'Browsing_Frequency', 'Product_Search_Method',
       'Search_Result_Exploration', 'Customer_Reviews_Importance',
       'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
       'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left',
       'Review_Reliability', 'Review_Helpfulness',
       'Personalized_Recommendation_Frequency ', 'Recommendation_Helpfulness',
       'Rating_Accuracy ', 'Shopping_Satisfaction', 'Service_Appreciation',
       'Improvement_Areas'],
      dtype='str')

In [16]:
df = df.rename(columns={
    'Personalized_Recommendation_Frequency ': 'Number_of_times_Personalized_Recommendation_Received',
    'Personalized_Recommendation_Frequency': 'Purchase_made_on_Personalized_Recommendation',
    'Rating_Accuracy ': 'Rating_Accuracy'
})

In [17]:
df.columns

Index(['Timestamp', 'age', 'Gender', 'Purchase_Frequency',
       'Purchase_Categories', 'Purchase_made_on_Personalized_Recommendation',
       'Browsing_Frequency', 'Product_Search_Method',
       'Search_Result_Exploration', 'Customer_Reviews_Importance',
       'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
       'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left',
       'Review_Reliability', 'Review_Helpfulness',
       'Number_of_times_Personalized_Recommendation_Received',
       'Recommendation_Helpfulness', 'Rating_Accuracy',
       'Shopping_Satisfaction', 'Service_Appreciation', 'Improvement_Areas'],
      dtype='str')

In [18]:
# List of categorical/ordinal columns in the original DataFrame
categorical_columns = [
    'Gender',
    'Purchase_Frequency',
    'Purchase_Categories',
    'Purchase_made_on_Personalized_Recommendation',
    'Browsing_Frequency',
    'Product_Search_Method',
    'Search_Result_Exploration',
    'Add_to_Cart_Browsing',
    'Cart_Completion_Frequency',
    'Cart_Abandonment_Factors',
    'Saveforlater_Frequency',
    'Review_Left',
    'Review_Reliability',
    'Review_Helpfulness',
    'Recommendation_Helpfulness',
    'Service_Appreciation',
    'Improvement_Areas',
    'Customer_Reviews_Importance',
    'Number_of_times_Personalized_Recommendation_Received',
    'Rating_Accuracy',
    'Shopping_Satisfaction'
]

# Convert the columns in-place in the original df
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Verify conversion
print(df.dtypes)


Timestamp                                               datetime64[us, UTC]
age                                                                   int64
Gender                                                             category
Purchase_Frequency                                                 category
Purchase_Categories                                                category
Purchase_made_on_Personalized_Recommendation                       category
Browsing_Frequency                                                 category
Product_Search_Method                                              category
Search_Result_Exploration                                          category
Customer_Reviews_Importance                                        category
Add_to_Cart_Browsing                                               category
Cart_Completion_Frequency                                          category
Cart_Abandonment_Factors                                           category
Saveforlater

# Time for Analysis

### Univariate Analysis

In [19]:
import plotly.express as px

# -------------------------
# CATEGORICAL VARIABLES
# -------------------------
categorical_cols = [
    'Gender', 
    'Purchase_Frequency',
    'Purchase_Categories',
    'Purchase_made_on_Personalized_Recommendation',
    'Browsing_Frequency',
    'Product_Search_Method',
    'Search_Result_Exploration',
    'Customer_Reviews_Importance',
    'Add_to_Cart_Browsing',
    'Cart_Completion_Frequency',
    'Cart_Abandonment_Factors',
    'Saveforlater_Frequency',
    'Review_Reliability',
    'Review_Helpfulness',
    'Number_of_times_Personalized_Recommendation_Received',
    'Recommendation_Helpfulness',
    'Rating_Accuracy',
    'Shopping_Satisfaction',
    'Service_Appreciation',
    'Improvement_Areas'
]

for col in categorical_cols:
    counts = (
        df[col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name='Count')
    )
    fig = px.bar(
        counts,
        x='Count',
        y=col,
        orientation='h',
        text='Count',
        title=f'{col.replace("_"," ").title()} Distribution'
    )
    fig.update_layout(yaxis={'categoryorder':'total descending'})
    fig.update_traces(textposition='outside')
    fig.show()

In [20]:
# -------------------------
# NUMERIC VARIABLES
# -------------------------
numeric_cols = ['age']

for col in numeric_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=30,
        title=f'Distribution of {col.replace("_"," ").title()}',
        labels={col: col.replace("_"," ").title(), 'count':'Count'}
    )
    fig.update_traces(
        texttemplate='%{y}',
        textposition='outside',
        textfont_size=14
    )
    fig.update_layout(
        xaxis_title=col.replace("_"," ").title(),
        yaxis_title='Count',
        bargap=0.1
    )
    fig.show()

In [21]:
# -------------------------
# DATE VARIABLES
# -------------------------
date_cols = ['Timestamp']

for col in date_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=50,
        title=f'Distribution of {col.replace("_"," ").title()}',
        labels={col: col.replace("_"," ").title(), 'count':'Count'}
    )
    fig.update_traces(
        texttemplate='%{y}',
        textposition='outside',
        textfont_size=14
    )
    fig.show()

In [22]:
# -------------------------
# BOOLEAN / YES-NO VARIABLES
# -------------------------
bool_cols = [
    'Purchase_made_on_Personalized_Recommendation',
    'Review_Left'
]

for col in bool_cols:
    bool_counts = (
        df[col]
        .value_counts()
        .rename_axis(col.replace("_"," ").title())
        .reset_index(name='Count')
    )
    x_col = bool_counts.columns[0]
    fig = px.bar(
        bool_counts,
        x=x_col,
        y='Count',
        title=f'{col.replace("_"," ").title()} Distribution',
        text='Count'
    )
    fig.update_traces(textposition='outside')
    fig.show()

## Bivariate Analysis

In [None]:
# -------------------------
# Numeric vs Categorical
# -------------------------

# 1 Age vs Purchase Frequency (Boxplot)
fig = px.box(
    df,
    x='Purchase_Frequency',
    y='age',
    title='Age vs Purchase Frequency',
    labels={'Purchase_Frequency':'Purchase Frequency','age':'Age'}
)
fig.show()

# 2 Age vs Shopping Satisfaction (Boxplot)
fig = px.box(
    df,
    x='Shopping_Satisfaction',
    y='age',
    title='Age vs Shopping Satisfaction',
    labels={'Shopping_Satisfaction':'Shopping Satisfaction','age':'Age'}
)
fig.show()

# 3 Purchase Frequency vs Cart Completion Frequency (Boxplot)
fig = px.box(
    df,
    x='Cart_Completion_Frequency',
    y='Purchase_Frequency',
    title='Purchase Frequency vs Cart Completion Frequency',
    labels={'Cart_Completion_Frequency':'Cart Completion Frequency','Purchase_Frequency':'Purchase Frequency'}
)
fig.show()

In [None]:
# Categorical vs Categorical
# -------------------------

# 4 Personalized Recommendation vs Recommendation Helpfulness (Stacked Bar)
counts = df.groupby(['Purchase_made_on_Personalized_Recommendation','Recommendation_Helpfulness']).size().reset_index(name='Count')
fig = px.bar(
    counts,
    x='Purchase_made_on_Personalized_Recommendation',
    y='Count',
    color='Recommendation_Helpfulness',
    title='Personalized Recommendation vs Recommendation Helpfulness',
    text='Count'
)
fig.update_traces(textposition='outside')
fig.show()

# 5 Customer Reviews Importance vs Review Left (Stacked Bar)
counts = df.groupby(['Customer_Reviews_Importance','Review_Left']).size().reset_index(name='Count')
fig = px.bar(
    counts,
    x='Customer_Reviews_Importance',
    y='Count',
    color='Review_Left',
    title='Customer Reviews Importance vs Review Left',
    text='Count'
)
fig.update_traces(textposition='outside')
fig.show()

# 6 Cart Abandonment Factors vs Cart Completion Frequency (Stacked Bar)
counts = df.groupby(['Cart_Abandonment_Factors','Cart_Completion_Frequency']).size().reset_index(name='Count')
fig = px.bar(
    counts,
    x='Cart_Abandonment_Factors',
    y='Count',
    color='Cart_Completion_Frequency',
    title='Cart Abandonment Factors vs Cart Completion Frequency',
    text='Count'
)
fig.update_traces(textposition='outside')
fig.show()

In [25]:
# -------------------------
# CATEGORICAL vs CATEGORICAL
# -------------------------

# 1. Product Search Method vs Purchase Made on Personalized Recommendation
fig = px.histogram(
    df,
    x='Product_Search_Method',
    color='Purchase_made_on_Personalized_Recommendation',
    barmode='stack',
    title='Product Search Method vs Purchase Made on Personalized Recommendation'
)
fig.show()

# 2. Cart Abandonment Factors vs Age
fig = px.histogram(
    df,
    x='age',
    color='Cart_Abandonment_Factors',
    barmode='stack',
    title='Cart Abandonment Factors Across Age Groups'
)
fig.show()

In [None]:
# -------------------------
# Date vs Categorical
# -------------------------

# 7 Timestamp vs Purchase Frequency (Line Plot)
# First, resample by day
df['Date'] = df['Timestamp'].dt.date
daily_counts = df.groupby(['Date','Purchase_Frequency']).size().reset_index(name='Count')

fig = px.line(
    daily_counts,
    x='Date',
    y='Count',
    color='Purchase_Frequency',
    title='Daily Purchase Frequency Over Time',
    labels={'Count':'Number of Purchases','Date':'Date','Purchase_Frequency':'Purchase Frequency'}
)
fig.show()

## Multivaraite Analysis

In [None]:
import plotly.express as px

# -------------------------
# 1 Shopping Satisfaction by Purchase Frequency and Age (Grouped Boxplot)
# -------------------------
fig = px.box(
    df,
    x='Purchase_Frequency',
    y='age',
    color='Shopping_Satisfaction',
    title='Shopping Satisfaction by Purchase Frequency and Age',
    labels={'age':'Age','Purchase_Frequency':'Purchase Frequency','Shopping_Satisfaction':'Shopping Satisfaction'}
)

fig.update_layout(
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
    legend=dict(title_font_size=14, font_size=12)
)
fig.show()


# -------------------------
# 2 Personalized Recommendation × Recommendation Helpfulness × Purchase Frequency (Heatmap)
# -------------------------
heatmap_data = df.groupby(
    ['Purchase_made_on_Personalized_Recommendation','Recommendation_Helpfulness','Purchase_Frequency']
).size().reset_index(name='Count')

fig = px.density_heatmap(
    heatmap_data,
    x='Purchase_made_on_Personalized_Recommendation',
    y='Recommendation_Helpfulness',
    z='Count',
    facet_col='Purchase_Frequency',
    title='Personalized Recommendation × Recommendation Helpfulness × Purchase Frequency',
    labels={'Count':'Number of Users','Purchase_made_on_Personalized_Recommendation':'Personalized Recommendation','Recommendation_Helpfulness':'Recommendation Helpfulness'}
)

fig.update_layout(
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickfont_size=5,
    yaxis_tickfont_size=12,
    legend=dict(title_font_size=14, font_size=12)
)

# Customize facet subplot titles:
for annotation in fig.layout.annotations:
    if "Purchase_Frequency" in annotation.text:
        annotation.font.size = 16      # Make font larger
        annotation.font.color = "black"  # Make color black for clarity
        annotation.text = annotation.text.replace("Purchase_Frequency=", "")  # Clean label text

fig.show()



# -------------------------
# 3 Cart Abandonment Factors × Purchase Frequency × Cart Completion Frequency (Faceted Bar Charts)
# -------------------------
faceted_data = df.groupby(
    ['Cart_Abandonment_Factors','Purchase_Frequency','Cart_Completion_Frequency']
).size().reset_index(name='Count')

fig = px.bar(
    faceted_data,
    x='Cart_Abandonment_Factors',
    y='Count',
    color='Cart_Completion_Frequency',
    facet_col='Purchase_Frequency',
    title='Cart Abandonment Factors × Purchase Frequency × Cart Completion Frequency',
    text='Count'
)

fig.update_traces(textposition='outside')  # valid for bar charts

fig.update_layout(
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
    legend=dict(title_font_size=14, font_size=12)
)

# Customize facet subplot titles:
for annotation in fig.layout.annotations:
    if "Purchase_Frequency" in annotation.text:
        annotation.font.size = 16      # Increase font size
        annotation.font.color = "black"  # Change color for clarity
        annotation.text = annotation.text.replace("Purchase_Frequency=", "")  # Clean up text

fig.show()



# -------------------------
# 4 Review Left × Review Helpfulness × Shopping Satisfaction (Heatmap)
# -------------------------
heatmap_data2 = df.groupby(
    ['Review_Left','Review_Helpfulness','Shopping_Satisfaction']
).size().reset_index(name='Count')

fig = px.density_heatmap(
    heatmap_data2,
    x='Review_Left',
    y='Review_Helpfulness',
    z='Count',
    facet_col='Shopping_Satisfaction',
    title='Review Left vs Review Helpfulness vs Shopping Satisfaction',
    labels={'Count':'Number of Users','Review_Left':'Review Left','Review_Helpfulness':'Review Helpfulness'}
)

fig.update_layout(
    width=1500,
    height=400,
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
    legend=dict(title_font_size=14, font_size=12)
)
fig.show()


The following graphs are only a recreation of the above above insights for the sake of report

In [None]:
'''import plotly.express as px

# -------------------------
# Function to style numeric histograms
# -------------------------
def style_numeric_histogram(fig, xaxis_title, yaxis_title, title_text, color=None):
    # Update text on bars
    fig.update_traces(
        texttemplate='%{y}',
        textposition='outside',
        textfont=dict(size=16, color='black'),
        marker_color=color if color else None
    )
    
    # Update layout styling
    fig.update_layout(
       width=1500, 
       height=800,
        xaxis_title=dict(text=xaxis_title, font=dict(size=18, color='black')),
        yaxis_title=dict(text=yaxis_title, font=dict(size=18, color='black')),
        xaxis=dict(showgrid=False, tickfont=dict(size=16, color='black')),
        yaxis=dict(showgrid=False, tickfont=dict(size=16, color='black')),
        title=dict(
            text=title_text,
            font=dict(size=22, color='black'),
            x=0.5,
            xanchor='center'
        ),
        bargap=0.1,
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        showlegend=False
    )
    
    return fig


# -------------------------
# Numeric columns and optional colors
# -------------------------
numeric_cols = ['age']
colors = ['#4c78a8']  # Optional: one color per numeric col

for col, color in zip(numeric_cols, colors):
    fig = px.histogram(
        df,
        x=col,
        nbins=30,
        labels={col: col.replace("_"," ").title(), 'count':'Count'},
        title=f'Distribution of {col.replace("_"," ").title()}'
    )
    
    fig = style_numeric_histogram(
        fig,
        xaxis_title=col.replace("_"," ").title(),
        yaxis_title='Count',
        title_text=f'Distribution of {col.replace("_"," ").title()}',
        color=color
    )
    
    fig.show()'''


In [None]:
'''import plotly.express as px

# -------------------------
# Function to style line plots
# -------------------------
def style_line_plot(fig, xaxis_title, yaxis_title, title_text):
    for trace in fig.data:
        trace.update(
            mode='lines+markers',  # show both line and markers
            marker=dict(size=8)
        )
    
    fig.update_layout(
        width=1500,
        height=600,
        xaxis_title=dict(text=xaxis_title, font=dict(size=25, color='black')),
        yaxis_title=dict(text=yaxis_title, font=dict(size=25, color='black')),
        xaxis=dict(showgrid=False, tickfont=dict(size=20, color='black')),
        yaxis=dict(showgrid=False, tickfont=dict(size=20, color='black')),
        title=dict(
            text=title_text,
            font=dict(size=25, color='black'),
            x=0.5,
            xanchor='center'
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        legend=dict(title_font_size=20, font_size=18)
    )
    return fig

# -------------------------
# Prepare daily counts
# -------------------------
df['Date'] = df['Timestamp'].dt.date
daily_counts = df.groupby(['Date','Purchase_Frequency']).size().reset_index(name='Count')

# -------------------------
# Line plot: Timestamp vs Purchase Frequency
# -------------------------
fig = px.line(
    daily_counts,
    x='Date',
    y='Count',
    color='Purchase_Frequency',
    labels={'Count':'Number of Purchases','Date':'Date','Purchase_Frequency':'Purchase Frequency'}
)

fig = style_line_plot(
    fig,
    xaxis_title='Date',
    yaxis_title='Number of Purchases',
    title_text='Daily Purchase Frequency Over Time'
)

fig.show()'''


In [None]:
'''import plotly.express as px

# -------------------------
# 1 Shopping Satisfaction by Purchase Frequency and Age (Boxplot)
# -------------------------
fig = px.box(
    df,
    x='Purchase_Frequency',
    y='age',
    color='Shopping_Satisfaction',
    labels={'age':'Age','Purchase_Frequency':'Purchase Frequency','Shopping_Satisfaction':'Shopping Satisfaction'}
)
fig.update_layout(
    width=1500,
    height=600,
    title=dict(text='Shopping Satisfaction by Purchase Frequency and Age', font=dict(size=25, color='black'), x=0.5, xanchor='center'),
    xaxis_title=dict(text='Purchase Frequency', font=dict(size=22, color='black')),
    yaxis_title=dict(text='Age', font=dict(size=22, color='black')),
    xaxis=dict(showgrid=False, zeroline=False, tickfont=dict(size=18, color='black')),
    yaxis=dict(showgrid=False, zeroline=False, tickfont=dict(size=18, color='black')),  # no horizontal grid lines
    legend=dict(title_font_size=20, font_size=18),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)'
)
fig.show()'''

In [None]:
# -------------------------
# 3 Cart Abandonment Factors × Purchase Frequency × Cart Completion Frequency (Bar Charts)
# -------------------------
'''faceted_data = df.groupby(
    ['Cart_Abandonment_Factors','Purchase_Frequency','Cart_Completion_Frequency']
).size().reset_index(name='Count')

fig = px.bar(
    faceted_data,
    x='Cart_Abandonment_Factors',
    y='Count',
    color='Cart_Completion_Frequency',
    facet_col='Purchase_Frequency',
    text='Count'
)

# Style the data labels
fig.update_traces(textposition='outside', textfont=dict(size=18, color='black'))

# General layout
fig.update_layout(
    width=1500,
    height=800,
    title=dict(
        text='Cart Abandonment Factors × Purchase Frequency × Cart Completion Frequency',
        font=dict(size=18, color='black'), x=0.5, xanchor='center'
    ),
    xaxis_title=dict(text='Cart Abandonment Factors', font=dict(size=5, color='black')),
    yaxis_title=dict(text='Count', font=dict(size=18, color='black')),
    legend=dict(title_font_size=18, font_size=18, font=dict(color='black')),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)'
)

# Remove grid and zeroline from all axes, standardize tick font
for axis in fig.layout:
    if axis.startswith('xaxis') or axis.startswith('yaxis'):
        fig.layout[axis].showgrid = False
        fig.layout[axis].zeroline = False
        fig.layout[axis].tickfont = dict(size=18, color='black')
        if fig.layout[axis].title:
            fig.layout[axis].title.font.size = 18
            fig.layout[axis].title.font.color = 'black'

# Facet title styling (consistent font)
for annotation in fig.layout.annotations:
    annotation.font.size = 18
    annotation.font.color = 'black'
    if "Purchase_Frequency" in annotation.text:
        annotation.text = annotation.text.replace("Purchase_Frequency=", "")

fig.show()'''
