In [37]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [38]:
# Location of the merged posts/users dataset.
# The directory can be overridden with the DM_DATASET_DIR environment variable
# so the notebook is not tied to a single machine; the original local path is
# kept as the default.
folder = os.environ.get(
    "DM_DATASET_DIR",
    r"/Users/qialiang/Documents/Studium/DM_Dataset",
)
file1 = "posts_to_users_with_lastdate.csv"
path1 = os.path.join(folder, file1)

# Load the full CSV into memory.
df = pd.read_csv(path1)

In [39]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555974 entries, 0 to 555973
Data columns (total 41 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Id_post                555974 non-null  float64
 1   PostTypeId             555974 non-null  float64
 2   AcceptedAnswerId       555974 non-null  float64
 3   CreationDate_post      555974 non-null  object 
 4   Score                  555974 non-null  float64
 5   ViewCount              555974 non-null  float64
 6   Body                   555974 non-null  object 
 7   OwnerUserId            555974 non-null  float64
 8   LastEditorUserId       276744 non-null  float64
 9   LastEditDate           281324 non-null  object 
 10  LastActivityDate       555974 non-null  object 
 11  Title                  225083 non-null  object 
 12  Tags                   225083 non-null  object 
 13  AnswerCount            555974 non-null  float64
 14  CommentCount           555974 non-nu

In [40]:
# Keep only rows whose RealActiveDays is present and non-negative.
active_days = df['RealActiveDays']
is_valid_span = active_days.notna() & active_days.ge(0)
df_real = df[is_valid_span]

# Summary statistics, including the upper-tail percentiles.
quantile_cutoffs = [.25, .5, .75, .9, .95, .99]
print(df_real['RealActiveDays'].describe(percentiles=quantile_cutoffs))

count    555413.000000
mean        916.090871
std        1011.015705
min           0.000000
25%          75.000000
50%         548.000000
75%        1460.000000
90%        2464.000000
95%        3042.000000
99%        3954.000000
max        4900.000000
Name: RealActiveDays, dtype: float64


In [51]:
# Parse the two timestamp columns (unparseable values become NaT) and derive
# the monthly period each one falls in: StartMonth from the user's creation
# date, EndMonth from the last-activity date.
for date_col, month_col in (
    ('CreationDate_user', 'StartMonth'),
    ('LastActivityDate', 'EndMonth'),
):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    df[month_col] = df[date_col].dt.to_period('M')

In [52]:
def generate_months(user_id, start, end):
    """Return one ``(user_id, month)`` tuple per month from ``start`` to ``end``.

    Both bounds are inclusive. If either bound is missing (NaN/NaT) an empty
    list is returned.
    """
    if not (pd.notna(start) and pd.notna(end)):
        return []
    span = pd.period_range(start=start, end=end, freq='M')
    return list(zip([user_id] * len(span), span))

# Collapse to ONE lifecycle span per user before expanding into months.
# df appears to hold one row per post (see Id_post), so expanding every row
# would emit the same (user, month) pair once per post. Duplicate pairs break
# the per-user lag shift and multiply rows in any later merge keyed on
# (OwnerUserId, YearMonth).
user_spans = (
    df[['OwnerUserId', 'StartMonth', 'EndMonth']]
    .dropna()
    .groupby('OwnerUserId', as_index=False)
    .agg(StartMonth=('StartMonth', 'min'), EndMonth=('EndMonth', 'max'))
)

# Expand each user's span into one (user, month) pair per month.
# itertuples is used instead of iterrows: it is considerably faster and keeps
# the Period values intact.
user_month_pairs = []
for span in user_spans.itertuples(index=False):
    user_month_pairs.extend(
        generate_months(span.OwnerUserId, span.StartMonth, span.EndMonth)
    )

# Build the user-month table; every generated month lies inside the user's
# span, so is_active is 1 on every row.
user_month_active = pd.DataFrame(user_month_pairs, columns=['OwnerUserId', 'YearMonth'])
user_month_active['is_active'] = 1

In [53]:
# Actual number of posts per user per calendar month.
# CreationDate_post is read from the CSV as an object (string) column, so it
# has to be parsed before the .dt accessor can be used; parsing an already
# parsed column is a no-op, which keeps this cell safe to re-run.
df['CreationDate_post'] = pd.to_datetime(df['CreationDate_post'], errors='coerce')
df['YearMonth'] = df['CreationDate_post'].dt.to_period('M')

monthly_post_count = (
    df.groupby(['OwnerUserId', 'YearMonth'])['Id_post']
    .count()
    .reset_index()
)
monthly_post_count.columns = ['OwnerUserId', 'YearMonth', 'post_count']

# Attach the counts to the user-month table. Any post_count column left over
# from a previous run is dropped first so a re-run does not produce
# post_count_x / post_count_y. Months without posts get a count of 0.
user_month_active = (
    user_month_active
    .drop(columns=['post_count'], errors='ignore')
    .merge(monthly_post_count, on=['OwnerUserId', 'YearMonth'], how='left')
)
user_month_active['post_count'] = user_month_active['post_count'].fillna(0)

In [None]:
# Order rows chronologically within each user so that shifting looks backwards
# in time.
user_month_active = user_month_active.sort_values(['OwnerUserId', 'YearMonth'])

# Lag features: the user's post_count one, two and three rows (months) earlier.
# The first rows of each user have no history and therefore get NaN.
posts_by_user = user_month_active.groupby('OwnerUserId')['post_count']
for lag in (1, 2, 3):
    user_month_active[f'post_count_t-{lag}'] = posts_by_user.shift(lag)

: 

In [None]:
# Label = is the user still inside their active span in the following month?
#
# Start from a frame without any earlier 'next_month' / 'label' columns so that
# re-running this cell does not create 'label_x' / 'label_y' and then fail.
labelled_base = user_month_active.drop(columns=['next_month', 'label'], errors='ignore')
labelled_base['next_month'] = labelled_base['YearMonth'] + 1

# Lookup of the months that exist per user, re-keyed so it joins on the
# *following* month. De-duplicating the keys guarantees the join can never
# multiply rows; validate= makes pandas raise if that assumption is broken.
next_month_lookup = (
    labelled_base[['OwnerUserId', 'YearMonth', 'is_active']]
    .drop_duplicates(subset=['OwnerUserId', 'YearMonth'])
    .rename(columns={'YearMonth': 'next_month', 'is_active': 'label'})
)

user_month_active = labelled_base.merge(
    next_month_lookup,
    on=['OwnerUserId', 'next_month'],
    how='left',
    validate='many_to_one',
)

# No row for the following month means the user's span ended -> label 0.
# NOTE(review): this also labels a user's final observed month as 0 when the
# span is cut off by the end of the dataset - confirm that is intended.
user_month_active['label'] = user_month_active['label'].fillna(0).astype(int)

In [None]:
# Model inputs: the three lagged post counts.
feature_cols = ['post_count_t-1', 'post_count_t-2', 'post_count_t-3']

# Keep only rows where every lag feature is available.
model_df = user_month_active.dropna(subset=feature_cols)

# Feature matrix and target vector.
X, y = model_df[feature_cols], model_df['label']

In [None]:
# XGBClassifier, train_test_split, classification_report and roc_auc_score are
# already imported in the notebook's import cell, so they are not re-imported
# here.

# Stratified train/test split with a fixed seed for reproducibility.
# NOTE(review): rows are user-months, so a random row-level split places months
# of the same user in both sets; consider a group-aware split on OwnerUserId
# (or a time-based split) before trusting the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Train the classifier. `use_label_encoder` is deprecated in recent XGBoost
# releases and only triggers a warning there, so it is no longer passed; the
# model seed is pinned explicitly.
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Hard class predictions and positive-class probabilities on the held-out set.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Evaluation: per-class precision/recall/F1 and ROC AUC.
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))