In [3]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from deap import base, creator, tools, algorithms

In [61]:
pip install deap


Collecting deap
  Downloading deap-1.4.1.tar.gz (1.1 MB)
                                              0.0/1.1 MB ? eta -:--:--
     -----------------                        0.5/1.1 MB 9.8 MB/s eta 0:00:01
     ---------------------------------------  1.1/1.1 MB 11.2 MB/s eta 0:00:01
     ---------------------------------------- 1.1/1.1 MB 9.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: deap
  Building wheel for deap (setup.py): started
  Building wheel for deap (setup.py): finished with status 'done'
  Created wheel for deap: filename=deap-1.4.1-cp311-cp311-win_amd64.whl size=108738 sha256=73998635fb14272f4eef0b15b78c99ab7cc52d8e618b371d908526d2a94e6c36
  Stored in directory: c:\users\木木\appdata\local\pip\cache\wheels\f8\64\b8\65eacfbff3024ae2e2beb22e691d5c8abb89fbd863b8049b5f
Successfully built deap
Installing collected packages: deap
Successfully installed deap-1.4.1
No


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
data = pd.read_csv('sph6004_assignment1_data.csv')
# 对 gender 列进行 One-Hot Encoding
gender_encoded = pd.get_dummies(data['gender'], prefix='gender')

# 对 race 列进行 One-Hot Encoding
race_encoded = pd.get_dummies(data['race'], prefix='race')

# 将 One-Hot 编码后的结果添加到原始 DataFrame 中
data_encoded = pd.concat([data, gender_encoded, race_encoded], axis=1)

# 删除原始的 gender 和 race 列
data_encoded.drop(columns=['gender', 'race'], inplace=True)

# 计算每个变量的缺失值比例
missing_percentage = (data_encoded.isnull().sum() / len(data_encoded)) * 100

# 找出缺失值比例大于70%的变量
variables_to_drop = missing_percentage[missing_percentage > 70].index

# 删除缺失值大于70%的变量
df = data_encoded.drop(variables_to_drop, axis=1)
df.head()

Unnamed: 0,id,aki,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,36570066,3,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,...,0,0,0,0,0,0,0,0,0,0
1,39307659,0,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,...,0,0,0,0,0,0,0,0,0,1
2,38743306,2,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,...,0,0,0,0,0,1,0,0,0,0
3,32339865,2,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,...,0,0,0,0,1,0,0,0,0,0
4,35526987,2,57.438861,57.0,100.0,82.387097,81.0,127.0,97.672131,47.0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
# 初始化KNNImputer对象
imputer = KNNImputer(n_neighbors=5)  # 可以根据需要调整n_neighbors的值

# 对DataFrame中的空白值进行填补
df_filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [12]:
df_filled.head()

Unnamed: 0,id,aki,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,36570066.0,3.0,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,39307659.0,0.0,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,38743306.0,2.0,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,32339865.0,2.0,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,35526987.0,2.0,57.438861,57.0,100.0,82.387097,81.0,127.0,97.672131,47.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
# 初始化 StandardScaler
scaler = StandardScaler()

# 对数据进行标准化
df_final = scaler.fit_transform(df_filled)

# 将标准化后的数据重新转换为 DataFrame
df_final = pd.DataFrame(df_final, columns=df_filled.columns)

df_final['aki'] = df_final['aki'].replace({0: 0, 1: 0, 2: 0, 3: 1})

df_feature = df_filled.iloc[:, 1:]

df_feature.head()

Unnamed: 0,aki,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,dbp_max,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,3.0,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.0,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,99.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.0,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,94.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2.0,57.438861,57.0,100.0,82.387097,81.0,127.0,97.672131,47.0,95.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [46]:
df_feature.head()

Unnamed: 0,aki,admission_age,heart_rate_min,heart_rate_max,heart_rate_mean,sbp_min,sbp_max,sbp_mean,dbp_min,dbp_max,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,1.0,79.953141,96.0,104.0,100.083333,103.0,126.0,116.136364,40.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,78.194169,72.0,134.0,97.263158,97.0,127.0,109.833333,56.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,65.602396,60.0,97.0,84.166667,95.0,143.0,112.153846,56.0,99.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,64.906629,59.0,87.0,71.461538,113.0,150.0,138.16,60.0,94.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,57.438861,57.0,100.0,82.387097,81.0,127.0,97.672131,47.0,95.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [47]:
#RFE特征提取
# 初始化逻辑回归模型
logreg = LogisticRegression()

# 初始化RFE对象，设置逻辑回归模型和要选择的特征数量
rfe = RFE(estimator=logreg, n_features_to_select=30)

# 对特征进行选择
rfe.fit(X, y)

# 获取选择的特征索引
selected_features_indices = rfe.support_

# 获取选择的特征
selected_features = X.columns[selected_features_indices]
selected_features

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Index(['aado2_calc_min', 'abs_eosinophils_min', 'abs_neutrophils_max',
       'admission_age', 'dbp_min', 'gcs_eyes', 'gender_M', 'glucose_max.1',
       'glucose_min.2', 'inr_max', 'inr_min', 'mbp_max', 'mbp_mean', 'mbp_min',
       'ph_max', 'ph_min', 'platelets_max', 'potassium_min.1',
       'race_AMERICAN INDIAN/ALASKA NATIVE', 'race_ASIAN - CHINESE',
       'race_ASIAN - KOREAN', 'race_ASIAN - SOUTH EAST ASIAN',
       'race_BLACK/AFRICAN', 'race_HISPANIC OR LATINO',
       'race_PATIENT DECLINED TO ANSWER', 'race_UNABLE TO OBTAIN',
       'race_UNKNOWN', 'race_WHITE - EASTERN EUROPEAN', 'so2_max',
       'weight_admit'],
      dtype='object')

In [48]:
#L1正则化
model = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
model.fit(X, y)
selected_features_1 = X.columns[model.coef_[0]!=0]
selected_features_1

Index(['aado2_calc_min', 'abs_neutrophils_max', 'admission_age', 'dbp_min',
       'gcs_eyes', 'gender_M', 'glucose_max.1', 'glucose_min.2', 'inr_max',
       'inr_min', 'mbp_max', 'mbp_min', 'ph_min', 'platelets_max',
       'potassium_min.1', 'so2_max', 'weight_admit'],
      dtype='object')

In [25]:
#遗传算法
df_sampled = df_feature.sample(n=3000, random_state=42)

# 创建特征矩阵 X 和目标变量 y
X = df_sampled.drop(columns=['aki'])
y = df_sampled['aki'].copy()

# 定义适应度函数
def evaluate(individual, X, y):
    selected_features_0 = [bool(i) for i in individual]
    X_selected = X.iloc[:, selected_features_0]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred),

# 定义遗传算法相关参数
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.choice, [0, 1])
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate, X=X, y=y)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# 调整遗传算法参数
population_size = 100
num_generations = 30  # 增加迭代次数
cxpb = 0.3  # 降低交叉概率
mutpb = 0.4  # 增加变异概率

# 创建种群
population = toolbox.population(n=population_size)

# 运行遗传算法
for generation in range(num_generations):
    offspring = algorithms.varAnd(population, toolbox, cxpb=cxpb, mutpb=mutpb)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for fit, ind in zip(fits, offspring):
        ind.fitness.values = fit
    population = toolbox.select(offspring, k=len(population))

# 获取最优个体
best_individual = tools.selBest(population, k=1)[0]
selected_features = [bool(i) for i in best_individual]
selected_features_names = X.columns[selected_features]
print("Selected features_2:", selected_features_names)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features_2: Index(['admission_age', 'sbp_max', 'dbp_min', 'mbp_min', 'mbp_max', 'mbp_mean',
       'resp_rate_min', 'resp_rate_mean', 'spo2_mean', 'glucose_min',
       'lactate_min', 'lactate_max', 'ph_min', 'ph_max', 'so2_min', 'so2_max',
       'pco2_max', 'aado2_calc_min', 'pao2fio2ratio_min', 'baseexcess_max',
       'totalco2_min', 'calcium_min', 'glucose_max.1', 'hematocrit_min.1',
       'hemoglobin_max.1', 'platelets_max', 'albumin_min', 'aniongap_min',
       'bicarbonate_min.1', 'bun_min', 'calcium_min.1', 'glucose_min.2',
       'sodium_max.1', 'potassium_min.1', 'potassium_max.1',
       'abs_basophils_min', 'abs_eosinophils_min', 'abs_lymphocytes_min',
       'abs_lymphocytes_max', 'abs_monocytes_min', 'abs_neutrophils_max',
       'inr_min', 'inr_max', 'pt_min', 'ptt_max', 'alp_max',
       'bilirubin_total_max', 'ck_cpk_min', 'ck_cpk_max', 'gcs_eyes',
       'weight_admit', 'gender_M', 'race_AMERICAN INDIAN/ALASKA NATIVE',
       'race_ASIAN - ASIAN INDIAN', 'r

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [69]:
print("Number of selected features:", len(selected_features_2))

Number of selected features: 143


In [30]:
selected_features_2_heri = ['admission_age', 'sbp_max', 'dbp_min', 'mbp_min', 'mbp_max', 'mbp_mean',
       'resp_rate_min', 'resp_rate_mean', 'spo2_mean', 'glucose_min',
       'lactate_min', 'lactate_max', 'ph_min', 'ph_max', 'so2_min', 'so2_max',
       'pco2_max', 'aado2_calc_min', 'pao2fio2ratio_min', 'baseexcess_max',
       'totalco2_min', 'calcium_min', 'glucose_max.1', 'hematocrit_min.1',
       'hemoglobin_max.1', 'platelets_max', 'albumin_min', 'aniongap_min',
       'bicarbonate_min.1', 'bun_min', 'calcium_min.1', 'glucose_min.2',
       'sodium_max.1', 'potassium_min.1', 'potassium_max.1',
       'abs_basophils_min', 'abs_eosinophils_min', 'abs_lymphocytes_min',
       'abs_lymphocytes_max', 'abs_monocytes_min', 'abs_neutrophils_max',
       'inr_min', 'inr_max', 'pt_min', 'ptt_max', 'alp_max',
       'bilirubin_total_max', 'ck_cpk_min', 'ck_cpk_max', 'gcs_eyes',
       'weight_admit', 'gender_M', 'race_AMERICAN INDIAN/ALASKA NATIVE',
       'race_ASIAN - ASIAN INDIAN', 'race_ASIAN - CHINESE',
       'race_ASIAN - KOREAN', 'race_ASIAN - SOUTH EAST ASIAN',
       'race_BLACK/AFRICAN', 'race_HISPANIC OR LATINO',
       'race_HISPANIC/LATINO - CENTRAL AMERICAN',
       'race_HISPANIC/LATINO - CUBAN', 'race_HISPANIC/LATINO - HONDURAN',
       'race_HISPANIC/LATINO - MEXICAN', 'race_HISPANIC/LATINO - SALVADORAN',
       'race_MULTIPLE RACE/ETHNICITY',
       'race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
       'race_PATIENT DECLINED TO ANSWER', 'race_SOUTH AMERICAN',
       'race_UNABLE TO OBTAIN', 'race_UNKNOWN', 'race_WHITE',
       'race_WHITE - EASTERN EUROPEAN', 'race_WHITE - RUSSIAN']
selected_features_3_genetic = ['admission_age', 'sbp_min', 'sbp_mean', 'dbp_min', 'dbp_max', 'mbp_min',
       'mbp_max', 'mbp_mean', 'temperature_min', 'temperature_max',
       'temperature_mean', 'spo2_max', 'glucose_min', 'ph_min', 'ph_max',
       'so2_max', 'aado2_calc_min', 'calcium_max', 'glucose_min.1',
       'glucose_max.1', 'potassium_min', 'platelets_max', 'albumin_max',
       'glucose_min.2', 'glucose_max.2', 'sodium_min.1', 'potassium_min.1',
       'abs_basophils_max', 'abs_eosinophils_min', 'abs_eosinophils_max',
       'abs_neutrophils_max', 'inr_min', 'inr_max', 'pt_max', 'ast_min',
       'gcs_min', 'gcs_verbal', 'gcs_eyes', 'weight_admit', 'gender_F',
       'gender_M', 'race_AMERICAN INDIAN/ALASKA NATIVE', 'race_ASIAN',
       'race_ASIAN - ASIAN INDIAN', 'race_ASIAN - CHINESE',
       'race_ASIAN - KOREAN', 'race_ASIAN - SOUTH EAST ASIAN',
       'race_BLACK/AFRICAN', 'race_BLACK/AFRICAN AMERICAN',
       'race_BLACK/CAPE VERDEAN', 'race_BLACK/CARIBBEAN ISLAND',
       'race_HISPANIC OR LATINO', 'race_HISPANIC/LATINO - CENTRAL AMERICAN',
       'race_HISPANIC/LATINO - COLUMBIAN', 'race_HISPANIC/LATINO - CUBAN',
       'race_HISPANIC/LATINO - DOMINICAN', 'race_HISPANIC/LATINO - GUATEMALAN',
       'race_HISPANIC/LATINO - HONDURAN', 'race_HISPANIC/LATINO - MEXICAN',
       'race_HISPANIC/LATINO - PUERTO RICAN',
       'race_HISPANIC/LATINO - SALVADORAN', 'race_MULTIPLE RACE/ETHNICITY',
       'race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'race_OTHER',
       'race_PATIENT DECLINED TO ANSWER', 'race_PORTUGUESE',
       'race_SOUTH AMERICAN', 'race_UNABLE TO OBTAIN', 'race_UNKNOWN',
       'race_WHITE - BRAZILIAN', 'race_WHITE - EASTERN EUROPEAN']

selected_features_1_L1 = ['admission_age', 'heart_rate_min', 'heart_rate_max', 'heart_rate_mean',
       'sbp_min', 'sbp_max', 'sbp_mean', 'dbp_min', 'dbp_max', 'dbp_mean',
       'mbp_min', 'mbp_max', 'mbp_mean', 'resp_rate_min', 'resp_rate_max',
       'resp_rate_mean', 'temperature_min', 'spo2_min', 'spo2_max',
       'spo2_mean', 'glucose_min', 'glucose_max', 'glucose_mean',
       'lactate_min', 'so2_min', 'so2_max', 'po2_min', 'po2_max', 'pco2_min',
       'pco2_max', 'aado2_calc_min', 'aado2_calc_max', 'pao2fio2ratio_min',
       'pao2fio2ratio_max', 'baseexcess_min', 'baseexcess_max', 'totalco2_min',
       'totalco2_max', 'glucose_min.1', 'glucose_max.1', 'potassium_max',
       'hematocrit_min.1', 'hematocrit_max.1', 'hemoglobin_min.1',
       'hemoglobin_max.1', 'platelets_min', 'platelets_max', 'wbc_min',
       'wbc_max', 'albumin_min', 'albumin_max', 'aniongap_min', 'aniongap_max',
       'bicarbonate_min.1', 'bicarbonate_max.1', 'bun_min', 'bun_max',
       'calcium_min.1', 'calcium_max.1', 'chloride_max.1', 'glucose_min.2',
       'glucose_max.2', 'sodium_min.1', 'sodium_max.1', 'potassium_max.1',
       'abs_lymphocytes_min', 'abs_neutrophils_min', 'abs_neutrophils_max',
       'pt_min', 'pt_max', 'ptt_min', 'ptt_max', 'alt_min', 'alt_max',
       'alp_min', 'alp_max', 'ast_min', 'ast_max', 'bilirubin_total_max',
       'ck_cpk_min', 'ck_cpk_max', 'gcs_min', 'gcs_motor', 'gcs_verbal',
       'gcs_eyes', 'height', 'weight_admit', 'gender_M']

selected_features_0_RFE = ['sbp_mean', 'dbp_min', 'dbp_mean', 'mbp_mean', 'resp_rate_mean',
       'temperature_min', 'spo2_mean', 'lactate_min', 'hemoglobin_min.1',
       'hemoglobin_max.1', 'albumin_min', 'albumin_max', 'aniongap_max',
       'bicarbonate_min.1', 'bun_max', 'calcium_min.1', 'chloride_max.1',
       'sodium_max.1', 'potassium_max.1', 'abs_neutrophils_min',
       'abs_neutrophils_max', 'inr_max', 'bilirubin_total_min',
       'bilirubin_total_max', 'gcs_verbal', 'gcs_eyes', 'gcs_unable', 'height',
       'weight_admit', 'gender_M']

# 将四个特征列表转换为集合
selected_features_2_set = set(selected_features_2_list)
selected_features_3_set = set(selected_features_3_list)
selected_features_1_set = set(selected_features_1_list)
selected_features_0_set = set(selected_features_0_list)

# 找到四个集合的交集
overlap_features = selected_features_2_set.intersection(selected_features_3_set)

# 打印交集
print("重叠特征：", overlap_features)


重叠特征： {'race_HISPANIC/LATINO - CUBAN', 'race_WHITE - EASTERN EUROPEAN', 'race_ASIAN - ASIAN INDIAN', 'race_SOUTH AMERICAN', 'ph_max', 'mbp_min', 'potassium_min.1', 'mbp_mean', 'inr_min', 'glucose_min', 'glucose_min.2', 'race_BLACK/AFRICAN', 'aado2_calc_min', 'dbp_min', 'gcs_eyes', 'race_UNKNOWN', 'race_ASIAN - SOUTH EAST ASIAN', 'race_ASIAN - CHINESE', 'gender_M', 'race_HISPANIC OR LATINO', 'race_HISPANIC/LATINO - HONDURAN', 'race_AMERICAN INDIAN/ALASKA NATIVE', 'race_ASIAN - KOREAN', 'race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'glucose_max.1', 'ph_min', 'abs_eosinophils_min', 'so2_max', 'race_HISPANIC/LATINO - MEXICAN', 'mbp_max', 'abs_neutrophils_max', 'weight_admit', 'race_HISPANIC/LATINO - CENTRAL AMERICAN', 'platelets_max', 'race_HISPANIC/LATINO - SALVADORAN', 'admission_age', 'race_UNABLE TO OBTAIN', 'inr_max', 'race_MULTIPLE RACE/ETHNICITY', 'race_PATIENT DECLINED TO ANSWER'}


In [31]:
len(overlap_features)

40

In [33]:
overlap_features_list = sorted(list(overlap_features))

In [43]:
X = df_feature[overlap_features_list]

# 创建目标变量 y
y = df_feature['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化逻辑回归模型
logreg = LogisticRegression()

# 训练模型
logreg.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = logreg.predict(X_test)

# 计算模型的准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

# 打印分类报告
print("分类报告:")
print(classification_report(y_test, y_pred))

模型准确率: 0.844854673998429
分类报告:
              precision    recall  f1-score   support

         0.0       0.85      0.99      0.92      8590
         1.0       0.57      0.04      0.07      1594

    accuracy                           0.84     10184
   macro avg       0.71      0.52      0.49     10184
weighted avg       0.80      0.84      0.78     10184



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
# 根据选择的特征创建特征矩阵 X
X1 = df_feature[selected_features]

# 创建目标变量 y
y1 = df_feature['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# 初始化逻辑回归模型
logreg = LogisticRegression()

# 训练模型
logreg.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = logreg.predict(X_test)

# 计算模型的准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)


模型准确率: 0.844854673998429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
# 特征矩阵
X = df_feature.drop(columns=['aki'])

# 目标变量
y = df_feature['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 建立决策树模型
clf = DecisionTreeClassifier()

# 使用基于决策树的特征选择方法
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# 获取选择的特征索引
selected_features_indices = selector.get_support()

# 根据选择的特征索引筛选特征
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# 训练模型
clf.fit(X_train_selected, y_train)

# 在测试集上进行预测
y_pred = clf.predict(X_test_selected)

# 计算模型准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型在测试集上的准确率:", accuracy)
# 打印分类报告
print("分类报告:")
print(classification_report(y_test, y_pred))

模型在测试集上的准确率: 0.7800471327572663
分类报告:
              precision    recall  f1-score   support

         0.0       0.87      0.86      0.87      8590
         1.0       0.31      0.33      0.32      1594

    accuracy                           0.78     10184
   macro avg       0.59      0.60      0.59     10184
weighted avg       0.79      0.78      0.78     10184



In [44]:
# 特征矩阵
X = df_feature.drop(columns=['aki'])

# 目标变量
y = df_feature['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 建立随机森林模型
clf = RandomForestClassifier()

# 使用基于随机森林的特征选择方法
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# 获取选择的特征索引
selected_features_indices = selector.get_support()

# 根据选择的特征索引筛选特征
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# 训练模型
clf.fit(X_train_selected, y_train)

# 在测试集上进行预测
y_pred = clf.predict(X_test_selected)

# 计算模型准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型在测试集上的准确率:", accuracy)

模型在测试集上的准确率: 0.8570306362922231


In [46]:
# 特征矩阵
X = df_feature.drop(columns=['aki'])

# 目标变量
y = df_feature['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 建立AdaBoost模型
clf = AdaBoostClassifier()

# 使用基于AdaBoost的特征选择方法
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# 获取选择的特征索引
selected_features_indices = selector.get_support()

# 根据选择的特征索引筛选特征
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# 训练模型
clf.fit(X_train_selected, y_train)

# 在测试集上进行预测
y_pred = clf.predict(X_test_selected)

# 计算模型准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型在测试集上的准确率:", accuracy)



模型在测试集上的准确率: 0.8562450903377847


In [48]:
# 特征矩阵
X = df_feature.drop(columns=['aki'])

# 目标变量
y = df_feature['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 建立 Gradient Boosting Tree 模型
clf = GradientBoostingClassifier()

# 使用基于 Gradient Boosting Tree 的特征选择方法
selector = SelectFromModel(estimator=clf)
selector.fit(X_train, y_train)

# 获取选择的特征索引
selected_features_indices = selector.get_support()

# 根据选择的特征索引筛选特征
X_train_selected = X_train.iloc[:, selected_features_indices]
X_test_selected = X_test.iloc[:, selected_features_indices]

# 训练模型
clf.fit(X_train_selected, y_train)

# 在测试集上进行预测
y_pred = clf.predict(X_test_selected)

# 计算模型准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型在测试集上的准确率:", accuracy)

模型在测试集上的准确率: 0.8619402985074627


In [75]:
# 随机抽取5000个样本
sampled_df = df_feature.sample(n=1000, random_state=42)

# 特征矩阵
X = sampled_df.drop(columns=['aki'])

# 目标变量
y = sampled_df['aki']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化特征列表和最佳特征集
selected_features = []
best_score = 0

# 创建随机森林分类器
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# 进行前向特征选择
for feature in X.columns:
    # 加入一个新的特征
    selected_features.append(feature)
    
    # 使用新的特征集合进行训练
    clf.fit(X_train[selected_features], y_train)
    
    # 在测试集上进行预测
    y_pred = clf.predict(X_test[selected_features])
    
    # 计算准确率
    score = accuracy_score(y_test, y_pred)
    
    # 如果准确率提升了，更新最佳特征集和最佳准确率
    if score > best_score:
        best_score = score
        best_features = selected_features.copy()
    else:
        # 如果准确率没有提升，则移除这个新加入的特征
        selected_features.remove(feature)

# 输出最佳特征集和准确率
print("最佳特征集：", best_features)
print("最佳准确率：", best_score)

最佳特征集： ['admission_age', 'heart_rate_min', 'heart_rate_mean', 'sbp_min', 'sbp_max', 'mbp_min', 'baseexcess_min', 'ptt_min', 'alp_min', 'race_ASIAN - ASIAN INDIAN']
最佳准确率： 0.815
