In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [22]:
# Load the four input CSV files (paths are relative to the working directory).
# The unpacking order matches the order of the file names below.
train_data, test_data, user_info, user_log = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train_format1.csv',
        'test_format1.csv',
        'user_info_format1.csv',
        'user_log_format1.csv',
    )
)

In [23]:
# Attach the user_info columns to both the training and the test frame with a
# left join keyed on 'user_id'.
# NOTE(review): both names are reassigned from themselves, so re-running this
# cell merges user_info onto the already-merged frames again.
train_data = pd.merge(train_data, user_info, how='left', on='user_id')
test_data = pd.merge(test_data, user_info, how='left', on='user_id')

In [24]:
# Remove the 'prob' column from the test frame before building features.
# errors='ignore' turns this into a no-op when the column is not present.
test_data = test_data.drop(labels=['prob'], axis='columns', errors='ignore')

In [25]:
# Fill missing feature values with per-column means using SimpleImputer.
# The identifier columns (and the label, for training data) are excluded from
# the feature set; the means are learned from the training features only and
# then reused unchanged for the test features.
# NOTE(review): the imputer is fit on every training row before the
# train/validation split in a later cell, so validation rows contribute to the
# imputed means.
imputer = SimpleImputer(strategy='mean')
train_features = train_data.drop(columns=['user_id', 'merchant_id', 'label'])
test_features = test_data.drop(columns=['user_id', 'merchant_id'])
train_data_imputed = imputer.fit_transform(train_features)
test_data_imputed = imputer.transform(test_features)

In [26]:
# Features are the imputed training values; the target is the 'label' column
# of the (merged) training frame.
X, y = train_data_imputed, train_data['label']

In [27]:
# Split into training and validation parts: 20% of the rows are held out,
# and the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Base random-forest classifier handed to the hyperparameter search below;
# the seed is fixed so repeated fits give the same forest.
rf_model = RandomForestClassifier(
    random_state=42,
)

In [29]:
# Hyperparameter grid for the random forest.
# 3 * 4 * 3 * 3 * 2 = 216 candidate combinations in total.
param_grid = {
    'n_estimators': [100, 200, 300],    # number of trees
    'max_depth': [None, 10, 20, 30],    # None means no depth limit
    'min_samples_split': [2, 5, 10],    # minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],      # minimum samples in a leaf
    'bootstrap': [True, False],         # with / without bootstrap sampling
}

In [30]:
# Tune the hyperparameters with an exhaustive grid search: every combination
# in param_grid is scored by ROC AUC under 3-fold cross-validation.
# n_jobs=-1 runs the fits in parallel on all available cores; verbose=2 prints
# progress messages while fitting.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [31]:
# Report the hyperparameter combination selected by the grid search.
best_params = grid_search.best_params_
print(f'最佳超参数: {best_params}')

最佳超参数: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [32]:
# Take the tuned model straight from the grid search.
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on all of X_train with the best hyperparameters. Calling
# fit(X_train, y_train) again would only repeat that training and, with the
# fixed random_state, produce the same model, so the extra fit is omitted.
best_rf_model = grid_search.best_estimator_
# Keep the estimator as the cell's last expression so it is still displayed.
best_rf_model

In [33]:
# Score the validation rows: predict_proba returns one column per class, and
# column index 1 is kept as the prediction score.
val_proba = best_rf_model.predict_proba(X_val)
y_pred = val_proba[:, 1]

In [34]:
# Evaluate the tuned model: ROC AUC of the validation scores against the
# held-out validation labels.
auc_score = roc_auc_score(y_true=y_val, y_score=y_pred)
print(f'优化后随机森林模型的AUC得分: {auc_score}')

优化后随机森林模型的AUC得分: 0.5376808290319056


In [35]:
# Score the test set and store column index 1 of predict_proba in a new
# 'prob' column on the test frame.
test_proba = best_rf_model.predict_proba(test_data_imputed)
test_data['prob'] = test_proba[:, 1]

In [37]:
# Save the results: write the identifier columns and the predicted
# probability to CSV, without the index.
# The output path is defined once and reused in the confirmation message so
# the message always names the file that was actually written (the previous
# message said 'prediction.csv' while the file was 'prediction_forest.csv').
output_path = 'prediction_forest.csv'
result = test_data[['user_id', 'merchant_id', 'prob']]
result.to_csv(output_path, index=False)
print(f"预测结果已保存至 {output_path}")

预测结果已保存至 prediction.csv
