In [15]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#读入数据
users = pd.read_csv("users.csv")
events = pd.read_csv("events.csv")

#数据预处理
users['signup_date'] = pd.to_datetime(users['signup_date'], errors='coerce')
events['timestamp'] = pd.to_datetime(events['timestamp'], errors='coerce')

# 计算注册天数
today = pd.Timestamp.now().normalize()
users['days_since_signup'] = (today - users['signup_date']).dt.days

#每位用户的事件统计
event_counts = (events.groupby(['user_id', 'event_type']).size().unstack(fill_value=0))
event_counts['total_events'] = event_counts.sum(axis=1)

# 各事件类型比例
event_ratio = event_counts.div(event_counts['total_events'], axis=0).fillna(0)
event_features = pd.concat([event_counts[['total_events']],event_ratio.drop(columns='total_events', errors='ignore')], axis=1)

# 合并用户表与事件特征
user_features = (users.set_index('user_id')[['age', 'gender', 'days_since_signup']])
data = user_features.join(event_features, how='left').fillna(0)

print(data)


          age  gender  days_since_signup  total_events  click_ad     login  \
user_id                                                                      
user_001   56  Female                186            21  0.190476  0.238095   
user_002   46    Male                224            18  0.222222  0.277778   
user_003   32    Male                188            25  0.120000  0.320000   
user_004   25    Male                197             7  0.142857  0.142857   
user_005   38  Female                241             5  0.200000  0.000000   
...       ...     ...                ...           ...       ...       ...   
user_096   59   Other                217            22  0.090909  0.272727   
user_097   56    Male                185            14  0.428571  0.142857   
user_098   58  Female                235            23  0.217391  0.217391   
user_099   45  Female                217            21  0.190476  0.238095   
user_100   24  Female                216            28  0.107143

In [17]:
# 定义高活跃用户（top 25%为活跃用户）
threshold = data['total_events'].quantile(0.75)
data['label'] = (data['total_events'] >= threshold).astype(int)
data = pd.get_dummies(data, columns=['gender'], drop_first=True)

#划分数据集
X = data.drop(columns=['label'])
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

#预测与评估
y_pred = model.predict(X_test_scaled)
print("=== 分类报告 ===\n")
print(classification_report(y_test, y_pred))
#查看特征重要性
importance = pd.Series(model.coef_[0], index=X.columns).sort_values(ascending=False)
print("\n=== 特征重要性 ===\n")
print(importance.head(10))

=== 分类报告 ===

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.89      0.94         9

    accuracy                           0.97        30
   macro avg       0.98      0.94      0.96        30
weighted avg       0.97      0.97      0.97        30


=== 特征重要性 ===

total_events         3.046364
click_ad             0.189264
purchase             0.136663
gender_Male          0.128659
days_since_signup    0.096151
age                 -0.003136
login               -0.086707
logout              -0.107210
view_content        -0.196417
gender_Other        -0.304657
dtype: float64
