In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

In [2]:
df = pd.read_csv("../data/processed/customer_behavior_cleaned.csv")
df.head()

Unnamed: 0,customer_id,gender,age,estimated_salary,purchased,purchase_status,age_group
0,15624510,Male,19,19000,0,No,18-25
1,15810944,Male,35,20000,0,No,26-35
2,15668575,Female,26,43000,0,No,26-35
3,15603246,Female,27,57000,0,No,26-35
4,15804002,Male,19,76000,0,No,18-25


## Business Questions

1. Which age groups are most common among customers?
2. Do older customers spend more than younger customers?
3. What is the purchase frequency by age group?
4. Which customer segments are most valuable for the business?


In [3]:
df["age_group"].value_counts().sort_index()


age_group
18-25     44
26-35    129
36-45    119
46-60    103
Name: count, dtype: int64

In [4]:
df.groupby("age_group")["purchase_amount"].mean().round(2)


KeyError: 'Column not found: purchase_amount'

In [5]:
df.columns


Index(['customer_id', 'gender', 'age', 'estimated_salary', 'purchased',
       'purchase_status', 'age_group'],
      dtype='object')

In [6]:
df["age_group"].value_counts().sort_index()


age_group
18-25     44
26-35    129
36-45    119
46-60    103
Name: count, dtype: int64

In [7]:
df.groupby("age_group")["purchased"].sum()

age_group
18-25     0
26-35    17
36-45    38
46-60    88
Name: purchased, dtype: int64

In [8]:
df.groupby("age_group")["purchased"].mean().round(2)


age_group
18-25    0.00
26-35    0.13
36-45    0.32
46-60    0.85
Name: purchased, dtype: float64

In [9]:
df.groupby("gender")["purchased"].sum()
df.groupby("gender")["purchased"].mean().round(2)


gender
Female    0.38
Male      0.34
Name: purchased, dtype: float64

In [10]:
df.groupby("age_group")["estimated_salary"].mean().round(2)


age_group
18-25    54136.36
26-35    66790.70
36-45    75630.25
46-60    73466.02
Name: estimated_salary, dtype: float64

In [11]:
pd.crosstab(df["age_group"], df["gender"], values=df["purchased"], aggfunc="mean").round(2)


gender,Female,Male
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
18-25,0.0,0.0
26-35,0.14,0.12
36-45,0.3,0.33
46-60,0.84,0.88


In [12]:
df.to_csv("../data/processed/customer_behavior_final.csv", index=False)
