In [2]:
# ========= Cell 1: 填写任务信息 =========
from datetime import datetime
import ipynbname
import os

task_name = "第一次XGBoost训练尝试"
notebook_name = "XGBoost.ipynb"  # 不带扩展名
notebook_path = r"CASA0004\XGBoost.ipynb"  # 完整路径
dataset = r"E:\Dissertation\CASA0004\central_barrio_features.csv"
code_version = "v1.0_X, (1 part of xgboost)"
input_dir = r"E:\Dissertation\CASA0004\central_barrio_features.csv"   # 输入 CSV
output_dir = r"E:\Dissertation\XGBoost_cleaning\xgboost_result"       # 输出文件夹
note = "把csv里面的na全都填上0，然后训练xgboost预测谋杀率"

# 保证输出目录存在
os.makedirs(output_dir, exist_ok=True)

print(f"任务: {task_name}")
print(f"输入文件: {input_dir}")
print(f"输出目录: {output_dir}")


任务: 第一次XGBoost训练尝试
输入文件: E:\Dissertation\CASA0004\central_barrio_features.csv
输出目录: E:\Dissertation\XGBoost_cleaning\xgboost_result


In [3]:
import pandas as pd

In [6]:
csv = pd.read_csv(input_dir, encoding="utf8")
csv.columns

Index(['barrio', 'longitude', 'latitude', 'upz_code', 'Cantidad',
       'fence_ratio', 'wall_ratio', 'road_ratio', 'sidewalk_ratio',
       'building_ratio', 'sky_ratio', 'vegetation_ratio', 'person_count',
       'bicyclist_count', 'motorcyclist_count', 'other_rider_count',
       'street_light_count', 'population', 'pop_density', 'murder_rate'],
      dtype='object')

In [11]:
# ========= Cell 2: 训练 XGBoost =========
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# 1. 读取数据
df = pd.read_csv(input_dir)

# 2. 填充缺失值
df = df.fillna(0)

# 3. 设置特征和目标变量
# ⚠️ 假设 "murder_rate" 是你的因变量列名，请根据实际 CSV 调整
target_col = "murder_rate"
y = df[target_col].values
features = ['fence_ratio', 'wall_ratio', 'road_ratio', 'sidewalk_ratio',
       'building_ratio', 'sky_ratio', 'vegetation_ratio', 'person_count',
       'bicyclist_count', 'motorcyclist_count', 'other_rider_count',
       'street_light_count', 'pop_density']
X = df[features]

sample_names = df["barrio"]   # ⚠️ 替换成实际的样本名称列


# 4. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# 5. 定义和训练 XGBoost 模型
model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# 6. 预测与评估
# 预测所有样本
y_all_pred = model.predict(X)

result_df = pd.DataFrame({
    "sample": sample_names,
    "true_murder_rate": y,
    "pred_murder_rate": y_all_pred
})

result_df.to_csv(os.path.join(output_dir, "prediction_results.csv"), index=False, encoding="utf-8")


print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# 7. 保存模型和特征重要性
model.save_model(os.path.join(output_dir, "xgboost_model.json"))

importance = model.feature_importances_
feat_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": importance
}).sort_values(by="importance", ascending=False)

feat_importance.to_csv(
    os.path.join(output_dir, "feature_importance.csv"),
    index=False, encoding="utf-8"
)

print(f"模型和特征重要性已保存到 {output_dir}")


RMSE: 0.0050
R²: -801.6802
模型和特征重要性已保存到 E:\Dissertation\XGBoost_cleaning\xgboost_result


In [22]:
from sklearn.utils import resample

# ========== 构造分类任务数据 ==========
df["has_murder"] = (df["murder_rate"] > 0).astype(int)

# 所有有谋杀的样本
df_pos = df[df["has_murder"] == 1]

# 所有无谋杀的样本
df_neg = df[df["has_murder"] == 0]

# 只随机抽取 7 个无谋杀样本
df_neg_sampled = resample(df_neg, n_samples=7, random_state=42)

# 合并成新的训练集
df_balanced = pd.concat([df_pos, df_neg_sampled])

print("采样后样本分布：")
print(df_balanced["has_murder"].value_counts())

# 特征与标签
X = df_balanced[features]
y_cls = df_balanced["has_murder"]


采样后样本分布：
has_murder
1    77
0     7
Name: count, dtype: int64


In [29]:
# ========= Cell 2.1: 二阶段 XGBoost 训练 =========
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, r2_score
import xgboost as xgb

# 读取 CSV
df = pd.read_csv(input_dir)

# ========== 处理缺失值 ==========
df = df.fillna(0)

# ========== 设置因变量 ==========
# 阶段 1: 二分类标签 (有无谋杀)
df["has_murder"] = (df["murder_rate"] > 0).astype(int)
y_cls = df["has_murder"].values   # ✅ 分类因变量

# 阶段 2: 原始谋杀率（仅在 has_murder=1 时使用）
y_reg = df["murder_rate"].values


# ========== 自变量 ==========
feature_cols = [c for c in df.columns if c not in ["murder_rate", "has_murder", "barrio_name"]]  # 你可以把 barrio_name 替换为实际 ID 列名
features = ['fence_ratio', 'wall_ratio', 'road_ratio', 'sidewalk_ratio',
       'building_ratio', 'sky_ratio', 'vegetation_ratio', 'person_count',
       'bicyclist_count', 'motorcyclist_count', 'other_rider_count',
       'street_light_count', 'pop_density']
X = df[features]

# ========== 阶段 1: 训练分类器 ==========
from collections import Counter

# ========= 阶段 1: 训练分类器 =========
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X, y_cls, test_size=0.2, random_state=42, stratify=y_cls
)

# 计算类别比例
counter = Counter(y_train_cls)
scale_pos_weight = (counter[0] / counter[1]) * 2


clf = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,  # ✅ 处理样本不平衡
    random_state=42
)

clf.fit(X_train_cls, y_train_cls)

y_pred_cls = clf.predict(X_test_cls)
print("========= 分类器结果 =========")
print(confusion_matrix(y_test_cls, y_pred_cls))
print(classification_report(y_test_cls, y_pred_cls))

# ========== 阶段 2: 训练回归器 (只在有谋杀样本上) ==========
X_reg = X[y_cls == 1]
y_reg_sub = y_reg[y_cls == 1]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg_sub, test_size=0.2, random_state=42)

reg = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

reg.fit(X_train_reg, y_train_reg)

y_pred_reg = reg.predict(X_test_reg)

print("========= 回归器结果 =========")
print("MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
print("R²:", r2_score(y_test_reg, y_pred_reg))

# ========== 保存模型 ==========
os.makedirs(output_dir, exist_ok=True)
joblib.dump(clf, os.path.join(output_dir, "xgb_classifier.pkl"))
joblib.dump(reg, os.path.join(output_dir, "xgb_regressor.pkl"))

print(f"二阶段模型已保存到 {output_dir}")


[[79  1]
 [12  4]]
              precision    recall  f1-score   support

           0       0.87      0.99      0.92        80
           1       0.80      0.25      0.38        16

    accuracy                           0.86        96
   macro avg       0.83      0.62      0.65        96
weighted avg       0.86      0.86      0.83        96

MAE: 0.0023896360960373353
R²: 0.2507682259461538
二阶段模型已保存到 E:\Dissertation\XGBoost_cleaning\xgboost_result


In [31]:
# ========= Cell 2.1.1: 二阶段 XGBoost 训练（带欠采样修正） =========
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, r2_score
from sklearn.utils import resample
import xgboost as xgb
from collections import Counter

# 读取 CSV
df = pd.read_csv(input_dir)

# ========== 处理缺失值 ==========
df = df.fillna(0)

# ========== 设置因变量 ==========
df["has_murder"] = (df["murder_rate"] > 0).astype(int)

# 特征选择
features = [
    'fence_ratio', 'wall_ratio', 'road_ratio', 'sidewalk_ratio',
    'building_ratio', 'sky_ratio', 'vegetation_ratio', 'person_count',
    'bicyclist_count', 'motorcyclist_count', 'other_rider_count',
    'street_light_count', 'pop_density'
]

# ========== 欠采样处理 ==========
df_pos = df[df["has_murder"] == 1]   # 有谋杀的样本
df_neg = df[df["has_murder"] == 0]   # 无谋杀的样本

# 只随机抽取 7 个无谋杀样本
df_neg_sampled = resample(df_neg, n_samples=7, random_state=42)

# 合并成新的训练集
df_balanced = pd.concat([df_pos, df_neg_sampled])
print("采样后样本分布：")
print(df_balanced["has_murder"].value_counts())

# ========== 阶段 1: 训练分类器 ==========
X_cls = df_balanced[features]
y_cls = df_balanced["has_murder"]

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42, stratify=y_cls
)

# 计算类别比例
counter = Counter(y_train_cls)
scale_pos_weight = counter[0] / counter[1]

clf = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,  # 处理样本不平衡
    random_state=42
)

clf.fit(X_train_cls, y_train_cls)

y_pred_cls = clf.predict(X_test_cls)
print("========= 分类器结果 =========")
print(confusion_matrix(y_test_cls, y_pred_cls))
print(classification_report(y_test_cls, y_pred_cls))

# ========== 阶段 2: 训练回归器 (只在有谋杀样本上) ==========
df_reg = df_balanced[df_balanced["has_murder"] == 1]

X_reg = df_reg[features]
y_reg = df_reg["murder_rate"]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

reg = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

reg.fit(X_train_reg, y_train_reg)

y_pred_reg = reg.predict(X_test_reg)

print("========= 回归器结果 =========")
print("MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
print("R²:", r2_score(y_test_reg, y_pred_reg))

# ========== 保存模型 ==========
os.makedirs(output_dir, exist_ok=True)
joblib.dump(clf, os.path.join(output_dir, "xgb_classifier.pkl"))
joblib.dump(reg, os.path.join(output_dir, "xgb_regressor.pkl"))

print(f"二阶段模型已保存到 {output_dir}")


采样后样本分布：
has_murder
1    77
0     7
Name: count, dtype: int64
[[ 1  0]
 [ 5 11]]
              precision    recall  f1-score   support

           0       0.17      1.00      0.29         1
           1       1.00      0.69      0.81        16

    accuracy                           0.71        17
   macro avg       0.58      0.84      0.55        17
weighted avg       0.95      0.71      0.78        17

MAE: 0.0023896360960373353
R²: 0.2507682259461538
二阶段模型已保存到 E:\Dissertation\XGBoost_cleaning\xgboost_result


In [14]:
# ========= Cell 2.2: XGBoost K 折交叉验证训练 =========
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
import xgboost as xgb

# ========= 配置 =========
target_col = "murder_rate"
features = [
    'fence_ratio', 'wall_ratio', 'road_ratio', 'sidewalk_ratio', 
    'building_ratio', 'sky_ratio', 'vegetation_ratio', 'person_count',
    'bicyclist_count', 'motorcyclist_count', 'other_rider_count',
    'street_light_count', 'pop_density'
]

# ========= 数据 =========
df = pd.read_csv(input_dir).fillna(0)
X = df[features]
y = df[target_col].values
sample_names = df["barrio"]

# 二阶段：分类目标
y_cls = (y > 0).astype(int)

# ========= KFold 设置 =========
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

cls_reports = []
reg_mae, reg_r2 = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_cls)):
    print(f"\n===== Fold {fold+1}/{n_splits} =====")

    # Train/val 划分
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train_cls, y_val_cls = y_cls[train_idx], y_cls[val_idx]
    y_train_reg, y_val_reg = y[train_idx], y[val_idx]

    # ========= 分类器 =========
    clf = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    clf.fit(X_train, y_train_cls)
    y_pred_cls = clf.predict(X_val)
    report = classification_report(y_val_cls, y_pred_cls, output_dict=True)
    cls_reports.append(report)
    print("分类器结果：")
    print(classification_report(y_val_cls, y_pred_cls))

    # ========= 回归器 =========
    mask_train = y_train_cls == 1
    mask_val = y_val_cls == 1

    if mask_train.sum() > 0 and mask_val.sum() > 0:
        reg = xgb.XGBRegressor(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
        reg.fit(X_train[mask_train], y_train_reg[mask_train])
        y_pred_reg = reg.predict(X_val[mask_val])

        mae = mean_absolute_error(y_val_reg[mask_val], y_pred_reg)
        r2 = r2_score(y_val_reg[mask_val], y_pred_reg)
        reg_mae.append(mae)
        reg_r2.append(r2)

        print(f"回归器 MAE: {mae:.4f}, R²: {r2:.4f}")
    else:
        print("⚠️ 本折没有谋杀样本，跳过回归")

# ========= 汇总结果 =========
print("\n===== K 折交叉验证结果 =====")
print("分类器平均 recall(1):", np.mean([rep['1']['recall'] for rep in cls_reports]))
print("分类器平均 f1(1):", np.mean([rep['1']['f1-score'] for rep in cls_reports]))
if reg_mae:
    print("回归器 MAE 平均:", np.mean(reg_mae))
    print("回归器 R² 平均:", np.mean(reg_r2))



===== Fold 1/5 =====
分类器结果：
              precision    recall  f1-score   support

           0       0.83      0.93      0.88        80
           1       0.14      0.06      0.09        16

    accuracy                           0.78        96
   macro avg       0.49      0.49      0.48        96
weighted avg       0.72      0.78      0.74        96

回归器 MAE: 0.0091, R²: -4.8121

===== Fold 2/5 =====
分类器结果：
              precision    recall  f1-score   support

           0       0.84      0.94      0.89        80
           1       0.17      0.07      0.10        15

    accuracy                           0.80        95
   macro avg       0.50      0.50      0.49        95
weighted avg       0.74      0.80      0.76        95

回归器 MAE: 0.0074, R²: -7.3111

===== Fold 3/5 =====
分类器结果：
              precision    recall  f1-score   support

           0       0.85      0.99      0.91        80
           1       0.50      0.07      0.12        15

    accuracy                         

In [None]:
# ===== 记录日志 =====
def append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note):
    repo_dir = 'E:\Dissertation\CASA0004'
    log_path = f"{repo_dir}/operation_log.md"

    # 写入日志
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"**任务名称**: {task_name}\n")
        f.write(f"**数据集**: {dataset}\n")
        f.write(f"**代码版本**: {code_version}\n")
        f.write(f"**输入目录**: {input_dir}\n")
        f.write(f"**输出目录**: {output_dir}\n")
        f.write(f"**状态**: {status}\n")
        f.write(f"**耗时**: {duration}\n")
        f.write(f"**备注**: {note}\n\n")

    print("✅ 日志写入完成")
status="finished"
append_log(task_name, dataset, code_version, input_dir, output_dir, status, duration, note)