In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw user-behaviour table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='4_user_behavior_data.csv')
print(data.head(n=5))

   UserID UserName  Age  Gender    Location   LastLogin  PurchaseAmount  \
0       1   User_1   62  Female  Location_1  2023-06-10             118   
1       2   User_2   65  Female  Location_2  2023-08-14             466   
2       3   User_3   18    Male  Location_3  2023-02-17             869   
3       4   User_4   21  Female  Location_4  2023-03-14             486   
4       5   User_5   21    Male  Location_5  2023-07-26             753   

  PurchaseCategory  ReviewScore LoginFrequency  
0         Clothing            3        Monthly  
1      Electronics            4         Weekly  
2    Home & Garden            3         Weekly  
3            Books            2         Weekly  
4    Home & Garden            1        Monthly  


In [5]:
# 1. Data cleaning and preprocessing
#
# NOTE(review): this cell rebinds `data` and replaces PurchaseAmount /
# ReviewScore with z-scores. Re-running it without re-loading the CSV would
# apply the `PurchaseAmount > 0` and ReviewScore range filters to the
# standardised values, so reload the raw file before running it again.

# Drop rows containing any missing value. The explicit copy gives an
# independent frame, so the column assignments below never write through to
# (or warn about) the frame that was loaded from disk.
data = data.dropna().copy()

# Enforce the numeric dtypes used by the range filter and the scaling step.
data['Age'] = data['Age'].astype(int)
data['PurchaseAmount'] = data['PurchaseAmount'].astype(float)
data['ReviewScore'] = data['ReviewScore'].astype(int)

# Keep rows with age in [18, 70], a strictly positive purchase amount, and a
# review score in [1, 5] (`between` is inclusive on both ends).
valid_rows = (
    data['Age'].between(18, 70)
    & (data['PurchaseAmount'] > 0)
    & data['ReviewScore'].between(1, 5)
)
# .copy() detaches the filtered result from its parent; without it, the
# assignments below can emit SettingWithCopyWarning whenever rows were removed.
data = data.loc[valid_rows].copy()

# Standardise both columns to z-scores: subtract the column mean and divide by
# the column standard deviation (Series.std uses the sample std by default).
for column in ('PurchaseAmount', 'ReviewScore'):
    data[column] = (data[column] - data[column].mean()) / data[column].std()

# Persist the cleaned, standardised table without the row index.
data.to_csv('4_user_behavior_data_cleaned.csv', index=False)
print("Data cleaning complete. Saved as '4_user_behavior_data_cleaned.csv'")

Data cleaning complete. Saved as '4_user_behavior_data_cleaned.csv'


In [6]:
# 2. Data Statistics

# Number of rows per purchase category (value_counts sorts most frequent first).
purchase_category_counts = data['PurchaseCategory'].value_counts()
print('User count by category:', purchase_category_counts)

# Mean PurchaseAmount per gender.
# NOTE: the cleaning cell replaced PurchaseAmount with z-scores, so these
# means are in standard-deviation units (close to 0), not raw amounts.
gender_purchase_amount_mean = data.groupby("Gender", observed=False)['PurchaseAmount'].mean()
print('Average amount by gender:', gender_purchase_amount_mean)

# Age bands are left-closed / right-open because right=False:
# [18, 26), [26, 36), [36, 46), [46, 56), [56, 66), [66, inf).
# The last band therefore starts at 66 and is labelled '66+'; age 65 is
# counted in '56-65', so a '65+' label would overlap the previous band.
bins = [18, 26, 36, 46, 56, 66, np.inf]
labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '66+']
data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

# sort_index() orders the counts by age band rather than by frequency.
age_group_counts = data['AgeGroup'].value_counts().sort_index()
print('User count by age:\n', age_group_counts)

User count by category: PurchaseCategory
Clothing         214
Electronics      213
Home & Garden    203
Food             197
Books            173
Name: count, dtype: int64
Average amount by gender: Gender
Female   -0.001367
Male      0.001458
Name: PurchaseAmount, dtype: float64
User count by age:
 AgeGroup
18-25    167
26-35    175
36-45    191
46-55    191
56-65    176
65+      100
Name: count, dtype: int64
