Hello World! Feel free to play around for testing.

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Step 1: read the two random-forest input tables from disk.
df_y2 = pd.read_csv('../data/train_test/rf_data_Y2.csv')
df_y3 = pd.read_csv('../data/train_test/rf_data_Y3.csv')

# 'DaysInHospital' is assumed to be the label column; 'MemberID' is left out of
# the feature matrix so that it does not take part in training.
y_y2 = df_y2['DaysInHospital']
X_y2 = df_y2.drop(columns=['DaysInHospital', 'MemberID'])

y_y3 = df_y3['DaysInHospital']
X_y3 = df_y3.drop(columns=['DaysInHospital', 'MemberID'])

# Step 2: k-fold cross-validation
def k_fold_cross_validation(X, y, k, model):
    """Estimate a model's mean squared error with shuffled k-fold CV.

    Row positions are shuffled with a fixed seed (42) and split into ``k``
    folds; ``model`` is refitted once per fold on the rows outside that fold
    and scored on the rows inside it.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Labels aligned with ``X``; also selected through ``.iloc``.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
            object is refitted in place for every fold.

    Returns:
        Tuple ``(mean, std)`` of the per-fold MSE values.

    Raises:
        ValueError: If ``k`` is outside ``2..len(X)``.
    """
    n_samples = len(X)
    if not 2 <= k <= n_samples:
        raise ValueError(f"k must be between 2 and len(X)={n_samples}, got {k}")

    # A private generator yields the same shuffle that np.random.seed(42)
    # followed by np.random.permutation would, without resetting the global
    # NumPy random state behind the caller's back.
    indices = np.random.RandomState(42).permutation(n_samples)
    # array_split hands the len(X) % k leftover rows to the first folds, so
    # every row is validated exactly once instead of being silently skipped.
    folds = np.array_split(indices, k)
    mse_scores = []

    for i, val_indices in enumerate(folds):
        train_indices = np.concatenate(folds[:i] + folds[i + 1:])

        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse_scores.append(mean_squared_error(y_val, y_pred))

    return np.mean(mse_scores), np.std(mse_scores)

# Step 3: nested cross-validation
def nested_cross_validation(X, y, outer_k=3, inner_k=3, param_grid=None, seed=42):
    """Run one nested cross-validation of a random-forest regressor.

    The outer loop estimates generalisation error; for each outer training
    split an inner k-fold CV (``k_fold_cross_validation``) picks the
    ``n_estimators`` value with the lowest mean MSE, and a forest with that
    setting is refitted on the whole outer training split.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Labels aligned with ``X``.
        outer_k: Number of outer folds; must satisfy ``2 <= outer_k <= len(X)``.
        inner_k: Number of inner folds used for hyper-parameter selection.
        param_grid: Dict with an ``'n_estimators'`` list. Only that key is
            read. Defaults to ``{'n_estimators': [10, 50, 100]}``.
        seed: Seed for the outer shuffle and for the forests. The default
            (42) is the value that used to be hard-coded; pass a different
            value per run to obtain genuinely different repetitions.

    Returns:
        Tuple ``(mean_mse, std_mse, best_params)`` where the first two
        summarise the outer-fold MSE values and ``best_params`` is the
        setting selected in the LAST outer fold (not a consensus over folds).

    Raises:
        ValueError: If the grid is empty or ``outer_k`` is out of range.
    """
    if param_grid is None:
        param_grid = {'n_estimators': [10, 50, 100]}  # only n_estimators is tuned here

    candidates = list(param_grid['n_estimators'])
    if not candidates:
        raise ValueError("param_grid['n_estimators'] must contain at least one value")

    n_samples = len(X)
    if not 2 <= outer_k <= n_samples:
        raise ValueError(f"outer_k must be between 2 and len(X)={n_samples}, got {outer_k}")

    # Local generator: same permutation as np.random.seed(seed) would give,
    # but the global NumPy random state is left untouched.
    indices = np.random.RandomState(seed).permutation(n_samples)
    # array_split distributes the len(X) % outer_k leftover rows so that every
    # row lands in exactly one outer validation fold.
    outer_folds = np.array_split(indices, outer_k)

    outer_mse_scores = []
    best_params = None

    for i, val_indices_outer in enumerate(outer_folds):
        train_indices_outer = np.concatenate(outer_folds[:i] + outer_folds[i + 1:])

        X_train_outer, X_val_outer = X.iloc[train_indices_outer], X.iloc[val_indices_outer]
        y_train_outer, y_val_outer = y.iloc[train_indices_outer], y.iloc[val_indices_outer]

        # Inner loop: choose the candidate with the lowest inner-CV mean MSE.
        best_mse = float('inf')
        best_params = None

        for n_estimators in candidates:
            model = RandomForestRegressor(n_estimators=n_estimators, random_state=seed, n_jobs=-1)
            mean_mse, _ = k_fold_cross_validation(X_train_outer, y_train_outer, inner_k, model)

            if mean_mse < best_mse:
                best_mse = mean_mse
                best_params = {'n_estimators': n_estimators}

        # Refit on the full outer training split and score on the held-out fold.
        final_model = RandomForestRegressor(**best_params, random_state=seed, n_jobs=-1)
        final_model.fit(X_train_outer, y_train_outer)

        y_pred_outer = final_model.predict(X_val_outer)
        outer_mse_scores.append(mean_squared_error(y_val_outer, y_pred_outer))

    return np.mean(outer_mse_scores), np.std(outer_mse_scores), best_params


# Step 4: repeat the nested CV several times and aggregate the results
def run_multiple_experiments(X, y, repetitions=20, outer_k=3, inner_k=3, param_grid=None):
    """Repeat ``nested_cross_validation`` and summarise the mean MSE per run.

    Each repetition uses its own seed (``42 + i``). With a single fixed seed
    every repetition would reproduce the same splits and forests, making the
    reported spread across repetitions exactly zero.

    Args:
        X: Feature table.
        y: Labels aligned with ``X``.
        repetitions: Number of nested-CV runs.
        outer_k: Outer fold count forwarded to ``nested_cross_validation``.
        inner_k: Inner fold count forwarded to ``nested_cross_validation``.
        param_grid: Hyper-parameter grid forwarded unchanged.

    Returns:
        Tuple ``(mean, std, best_param_list)``: mean and standard deviation of
        the per-repetition mean MSE, plus the ``best_params`` dict returned by
        each repetition, in run order.
    """
    mse_results = []
    best_param_list = []
    for i in range(repetitions):
        print(f"Running repetition {i+1}/{repetitions}")
        mean_mse, _, best_params = nested_cross_validation(
            X, y, outer_k=outer_k, inner_k=inner_k, param_grid=param_grid, seed=42 + i
        )
        mse_results.append(mean_mse)
        best_param_list.append(best_params)

    return np.mean(mse_results), np.std(mse_results), best_param_list

# Step 5: run the nested cross-validation experiment on the Year-2 data
param_grid = {'n_estimators': [50, 100, 200]}
mean_mse, std_mse, best_params_list = run_multiple_experiments(X_y2, y_y2, repetitions=5, param_grid=param_grid)

# Report the aggregated results
print(f"Mean MSE after 5 repetitions: {mean_mse}")
print(f"Standard deviation of MSE after 5 repetitions: {std_mse}")
print(f"Best Parameters: {best_params_list}")

# Step 6: refit with the chosen parameters on all Year-2 rows and predict Year 3
best_params = best_params_list[-1]  # take the parameters reported by the last repetition
final_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
final_model.fit(X_y2, y_y2)

y_pred_y3 = final_model.predict(X_y3)

# Step 7: evaluate the model on the Year-3 data
mse_y3 = mean_squared_error(y_y3, y_pred_y3)
print(f"Mean Squared Error on Year 3 data: {mse_y3}")



In [None]:
import pandas as pd
import numpy as np
# Load merged_data_Y2.csv; the bare `df` expression shows it as the cell output.
df = pd.read_csv('../data/processed/merged_data_Y2.csv')
df

In [None]:
# Drop the columns that are not used as predictors (MemberID) together with
# the label column itself.
X = df.drop(columns=['MemberID', 'DaysInHospital'])

# Use DaysInHospital as the label
y = df['DaysInHospital']
X

In [None]:
# Show the label column as the cell output.
y 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Split into training and test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Model before feature selection
# Train a random-forest model on all features
# NOTE(review): DaysInHospital is fitted with a regressor in other cells of
# this file, but is treated as class labels here — confirm this is intended.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model before feature selection
pre_selection_score = rf.score(X_test, y_test)
print(f"Model accuracy before feature selection: {pre_selection_score}")

# 2. Feature selection based on feature importance with SelectFromModel
selector = SelectFromModel(rf, threshold="mean")  # select features whose importance is above the mean
# NOTE(review): with the default prefit=False this fit presumably trains a
# second forest rather than reusing the fitted `rf` — confirm.
selector.fit(X_train, y_train)

# Look up the selected features
selected_features = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_features]

# Print the names of the selected features
print("Selected features:")
print(selected_feature_names)

# Reduce both splits to the selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# 3. Model after feature selection
# Retrain the model using only the selected features
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(X_train_selected, y_train)

# Evaluate the model after feature selection
post_selection_score = rf_selected.score(X_test_selected, y_test)
print(f"Model accuracy after feature selection: {post_selection_score}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 1: load the data
df = pd.read_csv('../data/processed/merged_data_Y2.csv')  # adjust the path as needed
X = df.drop(['DaysInHospital', 'MemberID'], axis=1)  # X holds every feature: label and MemberID columns removed
y = df['DaysInHospital']  # y is the target label, the DaysInHospital column

# Step 2: k-fold cross-validation
def k_fold_cross_validation(X, y, k, model):
    """Run shuffled k-fold cross-validation and summarise the fold MSEs.

    Row positions are shuffled with a fixed seed (42) so results are
    reproducible, then split into ``k`` folds; ``model`` is refitted once per
    fold on the rows outside that fold and scored on the rows inside it.

    Args:
        X: Feature data set; rows are selected positionally through ``.iloc``.
        y: Label data set aligned with ``X``.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        model: Model to train on every fold; needs ``fit`` and ``predict``.
            The same object is refitted in place for each fold.

    Returns:
        Tuple ``(mean, std)`` of the per-fold mean squared errors.

    Raises:
        ValueError: If ``k`` is outside ``2..len(X)``.
    """
    n_samples = len(X)
    if not 2 <= k <= n_samples:
        raise ValueError(f"k must be between 2 and len(X)={n_samples}, got {k}")

    # A private generator yields the same shuffle that np.random.seed(42)
    # followed by np.random.permutation would, without resetting the global
    # NumPy random state as a side effect.
    indices = np.random.RandomState(42).permutation(n_samples)
    # array_split hands the len(X) % k leftover rows to the first folds, so
    # every row is validated exactly once instead of being silently skipped.
    folds = np.array_split(indices, k)
    mse_scores = []  # one MSE per fold

    for i, val_indices in enumerate(folds):
        # Everything outside the current fold is training data.
        train_indices = np.concatenate(folds[:i] + folds[i + 1:])

        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse_scores.append(mean_squared_error(y_val, y_pred))

    return np.mean(mse_scores), np.std(mse_scores)

# Step 3: nested cross-validation
def nested_cross_validation(X, y, outer_k=5, inner_k=3, param_grid=None, seed=42):
    """Nested cross-validation for hyper-parameter tuning and model evaluation.

    For every outer training split, an inner k-fold CV
    (``k_fold_cross_validation``) scores each ``(n_estimators, max_depth)``
    combination; the combination with the lowest mean MSE is refitted on the
    whole outer training split and scored on the held-out outer fold.

    Args:
        X: Feature data set; rows are selected positionally through ``.iloc``.
        y: Label data set aligned with ``X``.
        outer_k: Number of outer folds; must satisfy ``2 <= outer_k <= len(X)``.
        inner_k: Number of inner folds used for hyper-parameter tuning.
        param_grid: Dict with ``'n_estimators'`` and ``'max_depth'`` lists.
            Defaults to ``{'n_estimators': [10, 50], 'max_depth': [5, 10, None]}``.
        seed: Seed for the outer shuffle and for the forests. The default
            (42) is the value that used to be hard-coded; pass a different
            value per run to obtain genuinely different repetitions.

    Returns:
        Tuple ``(mean, std)`` of the MSE over the outer validation folds.

    Raises:
        ValueError: If a grid entry is empty or ``outer_k`` is out of range.
    """
    if param_grid is None:
        # No grid supplied: fall back to the default one.
        param_grid = {'n_estimators': [10, 50], 'max_depth': [5, 10, None]}

    if len(param_grid['n_estimators']) == 0 or len(param_grid['max_depth']) == 0:
        raise ValueError("param_grid['n_estimators'] and param_grid['max_depth'] must be non-empty")

    n_samples = len(X)
    if not 2 <= outer_k <= n_samples:
        raise ValueError(f"outer_k must be between 2 and len(X)={n_samples}, got {outer_k}")

    # Local generator: same permutation as np.random.seed(seed) would give,
    # but the global NumPy random state is left untouched.
    indices = np.random.RandomState(seed).permutation(n_samples)
    # array_split distributes the len(X) % outer_k leftover rows so that every
    # row lands in exactly one outer validation fold.
    outer_folds = np.array_split(indices, outer_k)

    outer_mse_scores = []  # MSE of each outer validation fold

    for i, val_indices_outer in enumerate(outer_folds):
        train_indices_outer = np.concatenate(outer_folds[:i] + outer_folds[i + 1:])

        X_train_outer, X_val_outer = X.iloc[train_indices_outer], X.iloc[val_indices_outer]
        y_train_outer, y_val_outer = y.iloc[train_indices_outer], y.iloc[val_indices_outer]

        # Inner cross-validation: pick the best hyper-parameter combination.
        best_mse = float('inf')
        best_params = None

        for n_estimators in param_grid['n_estimators']:
            for max_depth in param_grid['max_depth']:
                model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=seed)

                mean_mse, _ = k_fold_cross_validation(X_train_outer, y_train_outer, inner_k, model)

                # Strict '<' keeps the first combination on ties.
                if mean_mse < best_mse:
                    best_mse = mean_mse
                    best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}

        # Refit on the full outer training split with the winning settings.
        final_model = RandomForestRegressor(**best_params, random_state=seed)
        final_model.fit(X_train_outer, y_train_outer)

        # Score on the held-out outer fold.
        y_pred_outer = final_model.predict(X_val_outer)
        outer_mse_scores.append(mean_squared_error(y_val_outer, y_pred_outer))

    return np.mean(outer_mse_scores), np.std(outer_mse_scores)


# Step 4: repeat the nested CV several times and aggregate the results
def run_multiple_experiments(X, y, repetitions=20, outer_k=5, inner_k=3, param_grid=None):
    """Run several nested-CV experiments and summarise their mean MSEs.

    Each repetition uses its own seed (``42 + i``). With a single fixed seed
    every repetition would reproduce the same splits and forests, making the
    reported spread across repetitions exactly zero.

    Args:
        X: Feature data set.
        y: Label data set aligned with ``X``.
        repetitions: Number of nested-CV runs.
        outer_k: Outer fold count forwarded to ``nested_cross_validation``.
        inner_k: Inner fold count forwarded to ``nested_cross_validation``.
        param_grid: Hyper-parameter grid forwarded unchanged; ``None`` selects
            the default grid of ``nested_cross_validation``.

    Returns:
        Tuple ``(mean, std)`` of the per-repetition mean MSE values.
    """
    mse_results = []  # mean MSE of each repetition
    for i in range(repetitions):
        print(f"Running repetition {i+1}/{repetitions}")
        mean_mse, _ = nested_cross_validation(
            X, y, outer_k=outer_k, inner_k=inner_k, param_grid=param_grid, seed=42 + i
        )
        mse_results.append(mean_mse)

    return np.mean(mse_results), np.std(mse_results)

# Step 5: run the nested cross-validation experiment with 20 repetitions
mean_mse, std_mse = run_multiple_experiments(X, y, repetitions=20)

# Report the results
print(f"Mean MSE after 20 repetitions: {mean_mse}")
print(f"Standard deviation of MSE after 20 repetitions: {std_mse}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Assumes df is the data set, DaysInHospital is the label column, and
# MemberID does not take part in training.
df = pd.read_csv('../data/processed/merged_data_Y2.csv')  # adjust the path as needed
X = df.drop(['DaysInHospital', 'MemberID'], axis=1)
y = df['DaysInHospital']

# Hand-written cross-validation scorer
def cross_val_score_manual(X, y, model, k=5):
    """Score ``model`` with shuffled k-fold CV and summarise the fold MSEs.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Labels aligned with ``X``.
        model: Estimator with ``fit`` and ``predict``; refitted for each fold.
        k: Number of folds passed to ``KFold`` (shuffled, ``random_state=42``).

    Returns:
        Tuple ``(mean, std)`` of the per-fold mean squared errors.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_errors = []

    for fit_rows, eval_rows in splitter.split(X):
        # Train on the rows outside the fold, then score the held-out rows.
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predictions = model.predict(X.iloc[eval_rows])
        fold_errors.append(mean_squared_error(y.iloc[eval_rows], predictions))

    return np.mean(fold_errors), np.std(fold_errors)

# Hyper-parameter tuning: walk through every combination in the grid
def grid_search_manual(X, y, param_grid, k=5):
    """Exhaustively search ``n_estimators`` x ``max_depth`` by cross-validation.

    Every combination is scored with ``cross_val_score_manual`` and its
    result is printed; the combination with the lowest mean MSE wins, and the
    earliest one is kept on ties.

    Args:
        X: Feature table.
        y: Labels aligned with ``X``.
        param_grid: Dict with ``'n_estimators'`` and ``'max_depth'`` lists.
        k: Number of CV folds used to score each combination.

    Returns:
        Tuple ``(best_params, best_score)``: the winning parameter dict and
        its mean MSE (``None`` and ``inf`` if nothing was evaluated).
    """
    combos = [
        (trees, depth)
        for trees in param_grid['n_estimators']
        for depth in param_grid['max_depth']
    ]

    winner, lowest_mse = None, float('inf')
    for n_estimators, max_depth in combos:
        candidate = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        mean_mse, std_mse = cross_val_score_manual(X, y, candidate, k)

        print(f"n_estimators: {n_estimators}, max_depth: {max_depth}, Mean MSE: {mean_mse}, Std MSE: {std_mse}")

        if mean_mse < lowest_mse:
            winner = {'n_estimators': n_estimators, 'max_depth': max_depth}
            lowest_mse = mean_mse

    return winner, lowest_mse

# Nested cross-validation
def nested_cross_val(X, y, param_grid, outer_k=5, inner_k=5, repetitions=20):
    """Repeat nested cross-validation and summarise the per-repetition MSE.

    In every repetition the data is split into ``outer_k`` shuffled folds.
    For each outer training split ``grid_search_manual`` picks the best
    hyper-parameters with an ``inner_k``-fold CV; a forest with those
    parameters is refitted on the outer training split and scored on the
    held-out fold.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Labels aligned with ``X``.
        param_grid: Grid forwarded to ``grid_search_manual``.
        outer_k: Number of outer folds.
        inner_k: Number of inner folds used for the grid search.
        repetitions: Number of repetitions of the whole procedure.

    Returns:
        Tuple ``(mean, std)`` over the per-repetition mean outer MSE values.
    """
    all_mse_scores = []

    for i in range(repetitions):
        # A KFold with a fixed integer random_state yields the same split on
        # every call to split(), so one shared splitter would make all
        # repetitions identical (std == 0). Give each repetition its own
        # seed; repetition 0 keeps the previous seed of 42.
        outer_kf = KFold(n_splits=outer_k, shuffle=True, random_state=42 + i)
        mse_scores = []

        for train_index, test_index in outer_kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Inner cross-validation on the training split to pick the hyper-parameters.
            best_params, _ = grid_search_manual(X_train, y_train, param_grid, inner_k)

            # Refit with the winning parameters and score on the held-out fold.
            best_model = RandomForestRegressor(**best_params, random_state=42)
            best_model.fit(X_train, y_train)
            y_pred = best_model.predict(X_test)

            mse_scores.append(mean_squared_error(y_test, y_pred))

        repetition_mse = np.mean(mse_scores)
        all_mse_scores.append(repetition_mse)
        print(f"Repetition {i+1}/{repetitions}: Mean MSE = {repetition_mse}")

    return np.mean(all_mse_scores), np.std(all_mse_scores)

# Define the hyper-parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30]
}

# Run the nested cross-validation with 20 repetitions
mean_mse, std_mse = nested_cross_val(X, y, param_grid, outer_k=5, inner_k=5, repetitions=20)

print(f"Nested Cross-Validation Mean MSE: {mean_mse} ± {std_mse}")
