# Amazon Consumer Behaviour EDA

In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#### Loading the Dataset

In [161]:
df = pd.read_csv('Amazon_Survey.csv')

In [162]:
df.head(5)

Unnamed: 0,Timestamp,age,Gender,Purchase_Frequency,Purchase_Categories,Personalized_Recommendation_Frequency,Browsing_Frequency,Product_Search_Method,Search_Result_Exploration,Customer_Reviews_Importance,...,Saveforlater_Frequency,Review_Left,Review_Reliability,Review_Helpfulness,Personalized_Recommendation_Frequency.1,Recommendation_Helpfulness,Rating_Accuracy,Shopping_Satisfaction,Service_Appreciation,Improvement_Areas
0,2023/06/04 1:28:19 PM GMT+5:30,23,Female,Few times a month,Beauty and Personal Care,Yes,Few times a week,Keyword,Multiple pages,1,...,Sometimes,Yes,Occasionally,Yes,2,Yes,1,1,Competitive prices,Reducing packaging waste
1,2023/06/04 2:30:44 PM GMT+5:30,23,Female,Once a month,Clothing and Fashion,Yes,Few times a month,Keyword,Multiple pages,1,...,Rarely,No,Heavily,Yes,2,Sometimes,3,2,Wide product selection,Reducing packaging waste
2,2023/06/04 5:04:56 PM GMT+5:30,24,Prefer not to say,Few times a month,Groceries and Gourmet Food;Clothing and Fashion,No,Few times a month,Keyword,Multiple pages,2,...,Rarely,No,Occasionally,No,4,No,3,3,Competitive prices,Product quality and accuracy
3,2023/06/04 5:13:00 PM GMT+5:30,24,Female,Once a month,Beauty and Personal Care;Clothing and Fashion;...,Sometimes,Few times a month,Keyword,First page,5,...,Sometimes,Yes,Heavily,Yes,3,Sometimes,3,4,Competitive prices,Product quality and accuracy
4,2023/06/04 5:28:06 PM GMT+5:30,22,Female,Less than once a month,Beauty and Personal Care;Clothing and Fashion,Yes,Few times a month,Filter,Multiple pages,1,...,Rarely,No,Heavily,Yes,4,Yes,2,2,Competitive prices,Product quality and accuracy


#### Displaying the total number of rows and columns in the dataset

In [163]:
rows, columns = df.shape
print(f'This dataset comatins {rows} rows and {columns} columns')

This dataset comatins 602 rows and 23 columns


### Displaying the column names

In [164]:
df.columns

Index(['Timestamp', 'age', 'Gender', 'Purchase_Frequency',
       'Purchase_Categories', 'Personalized_Recommendation_Frequency',
       'Browsing_Frequency', 'Product_Search_Method',
       'Search_Result_Exploration', 'Customer_Reviews_Importance',
       'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
       'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left',
       'Review_Reliability', 'Review_Helpfulness',
       'Personalized_Recommendation_Frequency ', 'Recommendation_Helpfulness',
       'Rating_Accuracy ', 'Shopping_Satisfaction', 'Service_Appreciation',
       'Improvement_Areas'],
      dtype='str')

#### Based on the publicly available Amazon metadata, the meaning of each column in the dataset is as follows:

- age: age
- gender: gender
- Purchase_Frequency: How frequently do you make purchases on Amazon?
- Purchase_Categories: What product categories do you typically purchase on Amazon?
- Personalized_Recommendation_Frequency: Have you ever made a purchase based on personalized product recommendations from Amazon?
- Browsing_Frequency: How often do you browse Amazon's website or app?
- Product_Search_Method: How do you search for products on Amazon?
- Search_Result_Exploration: Do you tend to explore multiple pages of search results or focus on the first page?
- Customer_Reviews_Importance: How important are customer reviews in your decision-making process?
- Add_to_Cart_Browsing: Do you add products to your cart while browsing on Amazon?
- Cart_Completion_Frequency: How often do you complete the purchase after adding products to your cart?
- Cart_Abandonment_Factors: What factors influence your decision to abandon a purchase in your cart?
- Saveforlater_Frequency: Do you use Amazon's "Save for Later" feature, and if so, how often?
- Review_Left: Have you ever left a product review on Amazon?
- Review_Reliability: How much do you rely on product reviews when making a purchase?
- Review_Helpfulness: Do you find helpful information from other customers' reviews?
- Personalized_Recommendation_Frequency: How often do you receive personalized product recommendations from Amazon?
- Recommendation_Helpfulness: Do you find the recommendations helpful?
- Rating_Accuracy: How would you rate the relevance and accuracy of the recommendations you receive
- Shopping_Satisfaction: How satisfied are you with your overall shopping experience on Amazon?
- Service_Appreciation: What aspects of Amazon's services do you appreciate the most?
- Improvement_Areas: Are there any areas where you think Amazon can improve?



In [165]:
df.describe()

Unnamed: 0,age,Customer_Reviews_Importance,Personalized_Recommendation_Frequency,Rating_Accuracy,Shopping_Satisfaction
count,602.0,602.0,602.0,602.0,602.0
mean,30.790698,2.480066,2.699336,2.672757,2.463455
std,10.193276,1.185226,1.042028,0.899744,1.012152
min,3.0,1.0,1.0,1.0,1.0
25%,23.0,1.0,2.0,2.0,2.0
50%,26.0,3.0,3.0,3.0,2.0
75%,36.0,3.0,3.0,3.0,3.0
max,67.0,5.0,5.0,5.0,5.0


In [166]:
df.dtypes

Timestamp                                   str
age                                       int64
Gender                                      str
Purchase_Frequency                          str
Purchase_Categories                         str
Personalized_Recommendation_Frequency       str
Browsing_Frequency                          str
Product_Search_Method                       str
Search_Result_Exploration                   str
Customer_Reviews_Importance               int64
Add_to_Cart_Browsing                        str
Cart_Completion_Frequency                   str
Cart_Abandonment_Factors                    str
Saveforlater_Frequency                      str
Review_Left                                 str
Review_Reliability                          str
Review_Helpfulness                          str
Personalized_Recommendation_Frequency     int64
Recommendation_Helpfulness                  str
Rating_Accuracy                           int64
Shopping_Satisfaction                   

#### Obersvations on Data Types Clearly Some columns in the dataset are **not in the proper data type** for analysis.

- Timestamp is stored as str but should be converted to datetime.
- Personalized_Recommendation_Frequency appears twice with different data types (str and int64). But as per the data description, these two columns serve different purposes. So we will have to rename them to differentiate it accordingly.
- The columns - Customer_Reviews_Importance, Personalized_Recommendation_Frequency, Rating_Accuracy & Shopping_Satisfaction, even if they are present in int64, but these are essentially categorical varaibles.  

These columns will need **cleaning and type conversion** before the analysis.

#### Checking all the unique values of each of the column

In [167]:
for col in df.columns:
    unique_val = df[col].unique()
    print(f"Column Name : {col}")
    print(f'Number of unique values : {len(unique_val)}')
    print(f'Unique Values : {unique_val}')


Column Name : Timestamp
Number of unique values : 601
Unique Values : <StringArray>
['2023/06/04 1:28:19 PM GMT+5:30', '2023/06/04 2:30:44 PM GMT+5:30',
 '2023/06/04 5:04:56 PM GMT+5:30', '2023/06/04 5:13:00 PM GMT+5:30',
 '2023/06/04 5:28:06 PM GMT+5:30', '2023/06/04 6:01:59 PM GMT+5:30',
 '2023/06/04 6:31:41 PM GMT+5:30', '2023/06/04 7:13:12 PM GMT+5:30',
 '2023/06/04 7:23:21 PM GMT+5:30', '2023/06/04 7:33:12 PM GMT+5:30',
 ...
 '2023/06/12 3:56:57 PM GMT+5:30', '2023/06/12 3:57:52 PM GMT+5:30',
 '2023/06/12 3:59:10 PM GMT+5:30', '2023/06/12 3:59:59 PM GMT+5:30',
 '2023/06/12 4:00:56 PM GMT+5:30', '2023/06/12 4:02:02 PM GMT+5:30',
 '2023/06/12 4:02:53 PM GMT+5:30', '2023/06/12 4:03:59 PM GMT+5:30',
 '2023/06/12 9:57:20 PM GMT+5:30', '2023/06/16 9:16:05 AM GMT+5:30']
Length: 601, dtype: str
Column Name : age
Number of unique values : 50
Unique Values : [23 24 22 21 20 25 16 64 29 19 26 32 30 40 36 31 47 54 58 53 28 55 62 27
 34 44 38 35 42 37 45 50 63 46 33 60 18 17 57 41 39 48 49 15 

#### Summary from the observations

- Columns with correct types:
age, Gender, Purchase_Categories, Product_Search_Method, Search_Result_Exploration, Customer_Reviews_Importance, Add_to_Cart_Browsing, Cart_Abandonment_Factors, Review_Left, Service_Appreciation, Improvement_Areas, Rating_Accuracy, Shopping_Satisfaction are mostly fine.

- Columns needing conversions/cleaning:
Timestamp (stored as str instead of datetime) needs to be converted for time-based analysis.
Personalized_Recommendation_Frequency appears twice with different data types (str and int64) and must be resolved (rename or remove duplication).

- Columns with potential considerations (ordinal categorical variables):
Purchase_Frequency, Browsing_Frequency, Cart_Completion_Frequency, Saveforlater_Frequency, Review_Reliability, Review_Helpfulness, Recommendation_Helpfulness - these are categorical but represent frequency/degree (Never → Always, Occasionally → Heavily). Consider marking them as ordered categorical for meaningful analysis.

- Columns with potential anomalies / considerations:
    - Purchase_Categories: multiple categories per entry separated by ;, may require splitting or encoding for analysis.
    - Service_Appreciation: entries like . or duplicates (Customer service vs Customer service) may need cleaning.
    - Improvement_Areas: contains minor variations, punctuation differences, or long strings; may require grouping or text cleaning.
    - In the Improvement_Areas column there is a row which contains '.' which is certainly an error and can be replace by Mode


#### Checking for the missing values per column (uncluding empty strings)

In [168]:
nan_count = df.isnull().sum()

empty_count = (df == '').sum()

total_missing = nan_count + empty_count

missing_percent = (total_missing/len(df))*100

missing_df = pd.DataFrame({
    'Missing Values': total_missing,
    'Percentage (%)' : missing_percent
}).sort_values(by='Percentage (%)', ascending=True)

print(missing_df)

                                        Missing Values  Percentage (%)
Timestamp                                            0        0.000000
age                                                  0        0.000000
Gender                                               0        0.000000
Purchase_Frequency                                   0        0.000000
Purchase_Categories                                  0        0.000000
Personalized_Recommendation_Frequency                0        0.000000
Browsing_Frequency                                   0        0.000000
Search_Result_Exploration                            0        0.000000
Cart_Abandonment_Factors                             0        0.000000
Customer_Reviews_Importance                          0        0.000000
Add_to_Cart_Browsing                                 0        0.000000
Cart_Completion_Frequency                            0        0.000000
Review_Left                                          0        0.000000
Savefo

## Data Cleaning

- Since Product_Search_Method is a categorical varaibles, we'll replace the 2 missing values using - Mode imputation
- Timestamp Column will be converted to the respective data type
- For Personalized_Recommendation_Frequency. There two features with the same name. But as per the data description, these two columns server different purposes. So we will have to rename them to differentiate it accordingly

##### Firstly, lets convert the data type for 'time stamp'

In [169]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce', utc=True)


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [170]:
df['Timestamp'].dtype

datetime64[us, UTC]

#### Mode Imputation for Product_Search_Method and Improvement_Areas

In [171]:
df['Product_Search_Method'] = df['Product_Search_Method'].fillna(df['Product_Search_Method'].mode()[0])

cols = ['Service_Appreciation', 'Improvement_Areas']

junk_values = [
    '.', 'Nil', 'Nothing',
    "I don't have any problem with Amazon",
    'No problems with Amazon'
]

for col in cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()
        .replace(junk_values, pd.NA)
    )

df['Service_Appreciation'] = df['Service_Appreciation'].fillna(
    df['Service_Appreciation'].mode()[0]
)

df['Improvement_Areas'] = df['Improvement_Areas'].fillna(
    df['Improvement_Areas'].mode()[0]
)

In [172]:
df.columns

Index(['Timestamp', 'age', 'Gender', 'Purchase_Frequency',
       'Purchase_Categories', 'Personalized_Recommendation_Frequency',
       'Browsing_Frequency', 'Product_Search_Method',
       'Search_Result_Exploration', 'Customer_Reviews_Importance',
       'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
       'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left',
       'Review_Reliability', 'Review_Helpfulness',
       'Personalized_Recommendation_Frequency ', 'Recommendation_Helpfulness',
       'Rating_Accuracy ', 'Shopping_Satisfaction', 'Service_Appreciation',
       'Improvement_Areas'],
      dtype='str')

In [173]:
df = df.rename(columns={
    'Personalized_Recommendation_Frequency ': 'Number_of_times_Personalized_Recommendation_Received',
    'Personalized_Recommendation_Frequency': 'Purchase_made_on_Personalized_Recommendation',
    'Rating_Accuracy ': 'Rating_Accuracy'
})

In [174]:
df.columns

Index(['Timestamp', 'age', 'Gender', 'Purchase_Frequency',
       'Purchase_Categories', 'Purchase_made_on_Personalized_Recommendation',
       'Browsing_Frequency', 'Product_Search_Method',
       'Search_Result_Exploration', 'Customer_Reviews_Importance',
       'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
       'Cart_Abandonment_Factors', 'Saveforlater_Frequency', 'Review_Left',
       'Review_Reliability', 'Review_Helpfulness',
       'Number_of_times_Personalized_Recommendation_Received',
       'Recommendation_Helpfulness', 'Rating_Accuracy',
       'Shopping_Satisfaction', 'Service_Appreciation', 'Improvement_Areas'],
      dtype='str')

In [175]:
# List of categorical/ordinal columns in the original DataFrame
categorical_columns = [
    'Gender',
    'Purchase_Frequency',
    'Purchase_Categories',
    'Purchase_made_on_Personalized_Recommendation',
    'Browsing_Frequency',
    'Product_Search_Method',
    'Search_Result_Exploration',
    'Add_to_Cart_Browsing',
    'Cart_Completion_Frequency',
    'Cart_Abandonment_Factors',
    'Saveforlater_Frequency',
    'Review_Left',
    'Review_Reliability',
    'Review_Helpfulness',
    'Recommendation_Helpfulness',
    'Service_Appreciation',
    'Improvement_Areas',
    'Customer_Reviews_Importance',
    'Number_of_times_Personalized_Recommendation_Received',
    'Rating_Accuracy',
    'Shopping_Satisfaction'
]

# Convert the columns in-place in the original df
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Verify conversion
print(df.dtypes)


Timestamp                                               datetime64[us, UTC]
age                                                                   int64
Gender                                                             category
Purchase_Frequency                                                 category
Purchase_Categories                                                category
Purchase_made_on_Personalized_Recommendation                       category
Browsing_Frequency                                                 category
Product_Search_Method                                              category
Search_Result_Exploration                                          category
Customer_Reviews_Importance                                        category
Add_to_Cart_Browsing                                               category
Cart_Completion_Frequency                                          category
Cart_Abandonment_Factors                                           category
Saveforlater

# Time for Analysis

### Univariate Analysis

In [191]:
import plotly.express as px

# -------------------------
# CATEGORICAL VARIABLES
# -------------------------
categorical_cols = [
    'Gender', 
    'Purchase_Frequency',
    'Purchase_Categories',
    'Purchase_made_on_Personalized_Recommendation',
    'Browsing_Frequency',
    'Product_Search_Method',
    'Search_Result_Exploration',
    'Customer_Reviews_Importance',
    'Add_to_Cart_Browsing',
    'Cart_Completion_Frequency',
    'Cart_Abandonment_Factors',
    'Saveforlater_Frequency',
    'Review_Reliability',
    'Review_Helpfulness',
    'Number_of_times_Personalized_Recommendation_Received',
    'Recommendation_Helpfulness',
    'Rating_Accuracy',
    'Shopping_Satisfaction',
    'Service_Appreciation',
    'Improvement_Areas'
]

for col in categorical_cols:
    counts = (
        df[col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name='Count')
    )
    fig = px.bar(
        counts,
        x='Count',
        y=col,
        orientation='h',
        text='Count',
        title=f'{col.replace("_"," ").title()} Distribution'
    )
    fig.update_layout(yaxis={'categoryorder':'total descending'})
    fig.update_traces(textposition='outside')
    fig.show()

In [192]:
# -------------------------
# NUMERIC VARIABLES
# -------------------------
numeric_cols = ['age']

for col in numeric_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=30,
        title=f'Distribution of {col.replace("_"," ").title()}',
        labels={col: col.replace("_"," ").title(), 'count':'Count'}
    )
    fig.update_traces(
        texttemplate='%{y}',
        textposition='outside',
        textfont_size=14
    )
    fig.update_layout(
        xaxis_title=col.replace("_"," ").title(),
        yaxis_title='Count',
        bargap=0.1
    )
    fig.show()

In [193]:
# -------------------------
# DATE VARIABLES
# -------------------------
date_cols = ['Timestamp']

for col in date_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=50,
        title=f'Distribution of {col.replace("_"," ").title()}',
        labels={col: col.replace("_"," ").title(), 'count':'Count'}
    )
    fig.update_traces(
        texttemplate='%{y}',
        textposition='outside',
        textfont_size=14
    )
    fig.show()

In [194]:
# -------------------------
# BOOLEAN / YES-NO VARIABLES
# -------------------------
bool_cols = [
    'Purchase_made_on_Personalized_Recommendation',
    'Review_Left'
]

for col in bool_cols:
    bool_counts = (
        df[col]
        .value_counts()
        .rename_axis(col.replace("_"," ").title())
        .reset_index(name='Count')
    )
    x_col = bool_counts.columns[0]
    fig = px.bar(
        bool_counts,
        x=x_col,
        y='Count',
        title=f'{col.replace("_"," ").title()} Distribution',
        text='Count'
    )
    fig.update_traces(textposition='outside')
    fig.show()

## Bivariate Analysis

In [180]:
# -------------------------
# NUMERIC vs NUMERIC (treated as categorical if ordinal)
# -------------------------

# 1. Customer Reviews Importance vs Shopping Satisfaction
fig = px.box(
    df,
    x='Customer_Reviews_Importance',  # treat as categorical/ordinal
    y='Shopping_Satisfaction',
    title='Customer Reviews Importance vs Shopping Satisfaction',
    labels={
        'Customer_Reviews_Importance':'Customer Reviews Importance',
        'Shopping_Satisfaction':'Shopping Satisfaction'
    },
    color='Customer_Reviews_Importance'
)
fig.show()

# 2. Cart Completion Frequency vs Browsing Frequency (grouped by Purchase Frequency)
fig = px.histogram(
    df,
    x='Cart_Completion_Frequency', 
    color='Browsing_Frequency', 
    facet_col='Purchase_Frequency',
    barmode='group',
    title='Cart Completion vs Browsing Frequency by Purchase Frequency',
    labels={
        'Cart_Completion_Frequency':'Cart Completion Frequency',
        'Browsing_Frequency':'Browsing Frequency',
        'Purchase_Frequency':'Purchase Frequency'
    }
)
fig.show()

In [181]:
# -------------------------
# NUMERIC vs CATEGORICAL
# -------------------------

# 1. Shopping Satisfaction vs Age
fig = px.box(
    df,
    x='age',
    y='Shopping_Satisfaction',
    title='Shopping Satisfaction Across Age Groups',
    labels={'age':'Age', 'Shopping_Satisfaction':'Shopping Satisfaction'},
    color='age'
)
fig.show()

# 2. Purchase Frequency vs Gender
fig = px.box(
    df,
    x='Gender',
    y='Purchase_Frequency',
    title='Purchase Frequency Across Gender',
    labels={'Gender':'Gender', 'Purchase_Frequency':'Purchase Frequency'},
    color='Gender'
)
fig.show()

In [182]:
# -------------------------
# CATEGORICAL vs CATEGORICAL
# -------------------------

# 1. Product Search Method vs Purchase Made on Personalized Recommendation
fig = px.histogram(
    df,
    x='Product_Search_Method',
    color='Purchase_made_on_Personalized_Recommendation',
    barmode='stack',
    title='Product Search Method vs Purchase Made on Personalized Recommendation'
)
fig.show()

# 2. Cart Abandonment Factors vs Age
fig = px.histogram(
    df,
    x='age',
    color='Cart_Abandonment_Factors',
    barmode='stack',
    title='Cart Abandonment Factors Across Age Groups'
)
fig.show()

In [183]:
# -------------------------
# DATE vs NUMERIC
# -------------------------

# Total Purchases over Timestamp
df_timestamp_agg = df.groupby('Timestamp').size().reset_index(name='Total_Purchases')

fig = px.line(
    df_timestamp_agg,
    x='Timestamp',
    y='Total_Purchases',
    title='Total Purchases Over Time',
    labels={'Timestamp':'Timestamp', 'Total_Purchases':'Total Purchases'}
)
fig.show()

## Multivaraite Analysis

In [184]:
import plotly.express as px
import pandas as pd

# -------------------------
# Purchase Frequency × Browsing Frequency × Cart Completion (Heatmap)
# -------------------------
# Use counts instead of mean
heatmap_data = df.groupby(['Purchase_Frequency','Browsing_Frequency','Cart_Completion_Frequency']).size().reset_index(name='Count')

fig = px.density_heatmap(
    heatmap_data,
    x='Purchase_Frequency',
    y='Browsing_Frequency',
    z='Count',
    color_continuous_scale='Viridis',
    title='Cart Completion Counts by Purchase & Browsing Frequency'
)
fig.show()

# -------------------------
# Number of Personalized Recommendations × Purchase Made × Shopping Satisfaction
# -------------------------
# Treat Shopping_Satisfaction as categorical in color and size
fig = px.scatter(
    df,
    x='Number_of_times_Personalized_Recommendation_Received',
    y='Purchase_made_on_Personalized_Recommendation',
    color='Shopping_Satisfaction',
    size=df['Shopping_Satisfaction'].astype('category').cat.codes + 1,  # numeric size from categorical
    title='Effect of Personalized Recommendations on Purchases and Satisfaction',
    labels={
        'Number_of_times_Personalized_Recommendation_Received':'Number of Personalized Recommendations',
        'Purchase_made_on_Personalized_Recommendation':'Purchase Made'
    }
)
fig.show()

# -------------------------
# Purchase Categories × Age × Shopping Satisfaction (Heatmap)
# -------------------------
# Use counts instead of mean
heatmap_data2 = df.groupby(['Purchase_Categories','age','Shopping_Satisfaction']).size().reset_index(name='Count')

fig = px.density_heatmap(
    heatmap_data2,
    x='Purchase_Categories',
    y='age',
    z='Count',
    color_continuous_scale='Plasma',
    title='Shopping Satisfaction Counts by Purchase Category and Age'
)
fig.update_xaxes(tickangle=45)
fig.show()

# -------------------------
# Cart Abandonment Factors × Browsing Frequency × Purchase Frequency (Stacked)
# -------------------------
fig = px.histogram(
    df,
    x='Cart_Abandonment_Factors',
    color='Browsing_Frequency',
    facet_col='Purchase_Frequency',
    barmode='stack',
    title='Cart Abandonment Factors by Browsing & Purchase Frequency'
)
fig.show()
