In [2]:
import pandas as pd

data = pd.read_csv('data/SEER处理后数据.csv')
data.head(10)

Unnamed: 0,Sex,Race,Age,Year,Histologic Type,Grade,Stage Group,tumor size,mets at dx,Surg Prim Site,Survival months
0,0,2,72,2010,8500,3,8,80,0,80,12
1,0,1,52,2015,8522,2,7,15,0,22,72
2,0,1,82,2012,8500,2,3,12,0,22,111
3,0,1,57,2014,8500,2,3,19,0,45,93
4,0,3,67,2014,8140,2,12,50,0,0,90
5,0,1,72,2013,8500,2,3,5,0,23,98
6,0,1,85,2015,8500,3,13,72,0,51,7
7,0,1,72,2011,8261,2,11,15,0,40,122
8,0,1,82,2015,8550,3,8,28,0,33,52
9,1,1,67,2011,8246,2,14,12,40,30,120


In [3]:
def map_interval(x):
    if x <= 36:
        return 1
    elif 36 < x <= 60:
        return 2
    elif 60 < x <= 120:
        return 3
    else:
        return 4
data['Survival range'] = data['Survival months'].apply(map_interval)

In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler


# 标签
y = np.array(data['Survival range'])

# 在特征中去掉标签
X = data.drop('Survival months', axis = 1)
X = X.drop('Survival range', axis = 1)
# X = X.drop('Race', axis = 1)
# X = X.drop('Year', axis = 1)
# X = X.drop('Primary Site', axis = 1)
# X = pd.get_dummies(X)
# X.head()


# 名字单独保存一下，以备后患
X_names = X.columns

scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)

# 转换成合适的格式
X = np.array(X)

In [5]:
# 数据集切分
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25, random_state = 25)

print('训练集特征:', train_X.shape)
print('训练集标签:', train_y.shape)
print('测试集特征:', test_X.shape)
print('测试集标签:', test_y.shape)

训练集特征: (631352, 10)
训练集标签: (631352,)
测试集特征: (210451, 10)
测试集标签: (210451,)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier

custom_weights = {
    1: 1.6,  # 正常权重
    2: 5.0,  # 增强关注
    3: 1.0,  # 最大类别，不增强
    4: 3.0   # 极少类别，强增强
}

model = RandomForestClassifier(n_estimators=300, max_depth=30, min_samples_split=10, class_weight='balanced', random_state=25)
# model = LinearRegression()
# model = MLPClassifier(solver='adam', hidden_layer_sizes=(50, 50) , activation='tanh', random_state=25, max_iter=200)

model.fit(train_X, train_y)

In [7]:
# from sklearn import metrics
# from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

# 预测结果
pred_y = model.predict(test_X)
print(pred_y[0])
print(test_X[0])

print('Accuracy:{:.2f}%'.format(accuracy_score(test_y, pred_y)*100))
print(model.predict([[1,1,37,2012,8140,1,1,2,0,80]]))

3
[ 1.3445541  -0.44967929  1.3754686   0.25005342  0.01245311  0.9249041
 -1.14339074 -0.04268506 -0.33272219 -0.07065266]
Accuracy:70.00%
[1]


In [8]:
# # 假设你已经训练好了模型 rf，且 X_train 是 DataFrame
# importances = model.feature_importances_

# # 按重要度排序
# indices = np.argsort(importances)[::-1]
# sorted_names = X_names[indices]
# sorted_importances = importances[indices]

# # 输出排序结果
# for name, imp in zip(sorted_names, sorted_importances):
#     print(f"{name}: {imp:.4f}")

In [9]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    balanced_accuracy_score,
    cohen_kappa_score,
    matthews_corrcoef,
    roc_auc_score
)
import numpy as np

# 1. 准确率
acc = accuracy_score(test_y, pred_y)
print(f"Accuracy: {acc:.4f}")

# 2. 精确率 / 召回率 / F1
for avg in ['macro', 'micro', 'weighted']:
    p = precision_score(test_y, pred_y, average=avg, labels=[1,2,3,4],zero_division=0)
    r = recall_score(test_y, pred_y, average=avg, labels=[1,2,3,4],zero_division=0)
    f1 = f1_score(test_y, pred_y, average=avg, labels=[1,2,3,4],zero_division=0)
    print(f"{avg.capitalize():<8} Precision: {p:.4f}, Recall: {r:.4f}, F1: {f1:.4f}")

# 3. 详细报告
print("\nClassification Report:\n")
print(classification_report(test_y, pred_y, labels=[1,2,3,4],zero_division=0))

# 4. 混淆矩阵
cm = confusion_matrix(test_y, pred_y, labels=[1,2,3,4])
print("Confusion Matrix:\n", cm)

# 5. 平衡准确率（对不平衡数据更鲁棒）
bal_acc = balanced_accuracy_score(test_y, pred_y)
print(f"Balanced Accuracy: {bal_acc:.4f}")

# 6. Cohen's Kappa（测量标注者一致性）
kappa = cohen_kappa_score(test_y, pred_y)
print(f"Cohen's Kappa: {kappa:.4f}")

# 7. Matthews 相关系数（整体性能指标，-1 到 +1）
mcc = matthews_corrcoef(test_y, pred_y)
print(f"Matthews Correlation Coefficient: {mcc:.4f}")


Accuracy: 0.7000
Macro    Precision: 0.5724, Recall: 0.6123, F1: 0.5849
Micro    Precision: 0.7000, Recall: 0.7000, F1: 0.7000
Weighted Precision: 0.6952, Recall: 0.7000, F1: 0.6930

Classification Report:

              precision    recall  f1-score   support

           1       0.66      0.66      0.66     57224
           2       0.17      0.12      0.14     17729
           3       0.82      0.75      0.79    103895
           4       0.64      0.91      0.75     31603

    accuracy                           0.70    210451
   macro avg       0.57      0.61      0.58    210451
weighted avg       0.70      0.70      0.69    210451

Confusion Matrix:
 [[38014  3886 10342  4982]
 [ 6043  2076  6656  2954]
 [11748  5583 78354  8210]
 [ 1772   880    79 28872]]
Balanced Accuracy: 0.6123
Cohen's Kappa: 0.5486
Matthews Correlation Coefficient: 0.5513


In [10]:
import pickle #调用“腌制”库

model_filename = 'forest-model.pkl'#设定文件名
scaler_filename = 'forest-scaler.pkl'
pickle.dump(model, open(model_filename,'wb'))#对模型进行“腌制”
pickle.dump(scaler_x, open(scaler_filename, 'wb'))

load_model = pickle.load(open(model_filename,'rb'))#加载“腌制”好的模型
load_scaler = pickle.load(open(scaler_filename,'rb'))

new_x = [[0,1,72,2011,8261,2,11,15,0,40]]
new_scaled_x = load_scaler.transform(new_x)
print(model.predict(new_scaled_x))

[4]


