In [5]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [6]:
folder = r"D:\Studium\DM-Dataset\physics.stackexchange.com\CSV"
file1 = "posts_to_users_with_lastdate.csv"
path1 = os.path.join(folder, file1)
df = pd.read_csv(path1)

In [10]:
df['CreationDate_user'] = pd.to_datetime(df['CreationDate_user'], errors='coerce')
df['LastActivityDate'] = pd.to_datetime(df['LastActivityDate'], errors='coerce')
# 计算真正活跃天数
df['RealActiveDays'] = (
    df['LastActivityDate'] - df['CreationDate_user']
).dt.days

# 去除负值或缺失值（可选）
df = df[
    df['RealActiveDays'].notna() & 
    (df['RealActiveDays'] >= 0)
]

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 555438 entries, 0 to 782266
Data columns (total 42 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Id_post                555438 non-null  float64       
 1   PostTypeId             555438 non-null  float64       
 2   AcceptedAnswerId       555438 non-null  float64       
 3   CreationDate_post      555438 non-null  object        
 4   Score                  555438 non-null  float64       
 5   ViewCount              555438 non-null  float64       
 6   Body                   555438 non-null  object        
 7   OwnerUserId            555438 non-null  float64       
 8   LastEditorUserId       276710 non-null  float64       
 9   LastEditDate           281289 non-null  object        
 10  LastActivityDate       555438 non-null  datetime64[ns]
 11  Title                  225001 non-null  object        
 12  Tags                   225001 non-null  object   

In [12]:
# 去掉 NaN 和负值（确保合法）
df_real = df[df['RealActiveDays'].notna() & (df['RealActiveDays'] >= 0)]

# 描述统计
print(df_real['RealActiveDays'].describe(percentiles=[.25, .5, .75, .9, .95, .99]))

count    555438.000000
mean        916.163799
std        1011.086916
min           0.000000
25%          75.000000
50%         548.000000
75%        1460.000000
90%        2464.000000
95%        3042.000000
99%        3954.000000
max        4900.000000
Name: RealActiveDays, dtype: float64


In [16]:
# 时间字段标准化
df['CreationDate_user'] = pd.to_datetime(df['CreationDate_user'], errors='coerce')
df['LastActivityDate'] = pd.to_datetime(df['LastActivityDate'], errors='coerce')
df['CreationDate_post'] = pd.to_datetime(df['CreationDate_post'], errors='coerce')

# 获取每个用户活跃的起止月份（以 Period[M] 表示）
df['StartMonth'] = df['CreationDate_user'].dt.to_period('M')
df['EndMonth'] = df['LastActivityDate'].dt.to_period('M')

In [14]:
# 函数：生成一个用户的所有月份区间
def generate_months(user_id, start, end):
    if pd.isna(start) or pd.isna(end):
        return []
    months = pd.period_range(start=start, end=end, freq='M')
    return [(user_id, m) for m in months]

# 应用到所有用户
user_month_pairs = []
for _, row in df[['OwnerUserId', 'StartMonth', 'EndMonth']].dropna().iterrows():
    user_month_pairs.extend(generate_months(row['OwnerUserId'], row['StartMonth'], row['EndMonth']))

# 转为 DataFrame
user_month_active = pd.DataFrame(user_month_pairs, columns=['OwnerUserId', 'YearMonth'])
user_month_active['is_active'] = 1

In [17]:
# 真实发帖数
df['YearMonth'] = df['CreationDate_post'].dt.to_period('M')
monthly_post_count = df.groupby(['OwnerUserId', 'YearMonth'])['Id_post'].count().reset_index()
monthly_post_count.columns = ['OwnerUserId', 'YearMonth', 'post_count']

# 合并到 user_month_active 表中
user_month_active = user_month_active.merge(monthly_post_count, on=['OwnerUserId', 'YearMonth'], how='left')
user_month_active['post_count'] = user_month_active['post_count'].fillna(0)

In [18]:
# 按用户、时间排序
user_month_active = user_month_active.sort_values(['OwnerUserId', 'YearMonth'])

# 添加滞后特征（post_count 可扩展）
for i in range(1, 4):
    user_month_active[f'post_count_t-{i}'] = (
        user_month_active.groupby('OwnerUserId')['post_count'].shift(i)
    )

In [None]:
# 排序
user_month_active = user_month_active.sort_values(['OwnerUserId', 'YearMonth'])

# 滚动拿下一个月的 is_active 值，作为 label（0 或 1）
user_month_active['label'] = (
    user_month_active.groupby('OwnerUserId')['is_active'].shift(-1)
).fillna(0).astype(int)

In [21]:
# 清理数据（要求特征 & 标签都存在）
model_df = user_month_active.dropna(subset=[
    'post_count_t-1', 'post_count_t-2', 'post_count_t-3', 'label'
])

# 特征和标签
feature_cols = ['post_count_t-1', 'post_count_t-2', 'post_count_t-3']
X = model_df[feature_cols]
y = model_df['label'].astype(int)

In [23]:
print(user_month_active['label'].value_counts(dropna=False))

label
1.0    17187873
NaN       92937
Name: count, dtype: int64


In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]