xgb with **notes**

In [None]:
import pandas as pd
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Set the random seed for reproducibility
RANDOM_SEED = 3612

# Step 1: Load and inspect the data
print("Loading notes data...")
notes_df = pd.read_csv('notes.csv')

# Step 2: Text preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

notes_df['processed_text'] = notes_df['text'].apply(preprocess_text)

# Step 3: Extract TF-IDF features
print("Extracting TF-IDF features...")
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 3))  # Expand the n-gram range
text_features = tfidf.fit_transform(notes_df['processed_text']).toarray()

# Save the extracted features
text_features_df = pd.DataFrame(text_features, columns=tfidf.get_feature_names_out())
text_features_df['id'] = notes_df['id']
text_features_df.to_csv('notes_features_tfidf.csv', index=False)

# Step 4: Load training and testing data
print("Loading train and test data...")
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Merge the text features with train and test data
train_data = train_data.merge(text_features_df, on='id', how='left')
test_data = test_data.merge(text_features_df, on='id', how='left')

# Fill missing values with zero
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# Step 5: Prepare features and labels
feature_cols = [col for col in train_data.columns if col not in ['id', 'readmitted_within_30days', 'text', 'processed_text']]

# Check if all feature columns are numeric
non_numeric_cols = [col for col in feature_cols if train_data[col].dtype == 'object']
if non_numeric_cols:
    print(f"Non-numeric columns detected and excluded: {non_numeric_cols}")
    feature_cols = [col for col in feature_cols if col not in non_numeric_cols]

# Prepare training and testing data
X = train_data[feature_cols].values
y = train_data['readmitted_within_30days'].values
X_test = test_data[feature_cols].values


# Step 6: Cross-validation
print("Starting cross-validation...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

oof_preds = []
test_preds = []
auc_scores = []

params = {
    "objective": "binary:logistic",
    "learning_rate": 0.01,
    "max_depth": 6,
    "min_child_weight": 3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "alpha": 0.1,
    "lambda": 1.0,
    "eval_metric": "auc",
    "seed": RANDOM_SEED,
}

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"--- Fold {fold + 1} ---")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)
    
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=500,
        evals=[(dval, "validation")],
        early_stopping_rounds=30,
        verbose_eval=10
    )
    
    val_preds = model.predict(dval)
    test_preds_fold = model.predict(dtest)
    oof_preds.extend(val_preds)
    test_preds.append(test_preds_fold)
    
    auc = roc_auc_score(y_val, val_preds)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} Validation AUC: {auc:.4f}")

# Step 7: Average predictions for the test set
final_test_preds = sum(test_preds) / len(test_preds)

# Step 8: Save the prediction results
test_data['readmitted_within_30days_pred'] = final_test_preds
averaged_predictions = (
    test_data.groupby('id', as_index=False)['readmitted_within_30days_pred']
    .mean()
)
averaged_predictions.to_csv('test_predictions_tfidf_cv.csv', index=False)

print(f"Mean Validation AUC: {sum(auc_scores) / len(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
print("Averaged predictions saved to 'test_predictions_tfidf_cv.csv'.")

xgb with notes and MI method of feature selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from xgboost import DMatrix, train
import matplotlib.pyplot as plt

# 参数设置
RANDOM_SEED = 3612

def main():
    # 加载 notes 数据并提取 TF-IDF 特征
    print("Loading notes data and extracting TF-IDF features...")
    text_features_df = pd.read_csv('notes_features_tfidf.csv')
    tfidf_cols = [col for col in text_features_df.columns if col != 'id']

    # 加载 train 和 test 数据
    print("Loading train and test data...")
    train_data = pd.read_csv("train.csv")
    test_data = pd.read_csv("test.csv")

    # 合并 TF-IDF 特征
    print("Merging TF-IDF features with train and test data...")
    train_data = train_data.merge(text_features_df, on='id', how='left').fillna(0)
    test_data = test_data.merge(text_features_df, on='id', how='left').fillna(0)

    # 准备 TF-IDF 特征矩阵
    X = train_data[tfidf_cols].values
    y = train_data['readmitted_within_30days'].values
    X_test = test_data[tfidf_cols].values

    # 检查特征是否是数值型
    print("Validating feature matrix...")
    if not np.issubdtype(X.dtype, np.number):
        raise ValueError("Non-numeric data detected in features. Please check the preprocessing step.")

    # 1. 使用 Mutual Information 筛选特征
    print("Selecting features using Mutual Information...")
    mi_scores = mutual_info_classif(X, y, random_state=RANDOM_SEED)
    feature_importance = pd.DataFrame({'feature': tfidf_cols, 'mi_score': mi_scores})
    selected_features = feature_importance[feature_importance['mi_score'] > 0.08]['feature'].tolist()  # 阈值可调整
    print(f"Number of selected features: {len(selected_features)}")

    if len(selected_features) == 0:
        raise ValueError("No features selected after Mutual Information filtering. Adjust the threshold or preprocessing steps.")

    # 更新特征矩阵
    X = train_data[selected_features].values
    X_test = test_data[selected_features].values

    # 交叉验证
    print("Starting cross-validation...")
    kf = KFold(n_splits=6, shuffle=True, random_state=RANDOM_SEED)
    test_predictions = np.zeros(X_test.shape[0])
    fold_aucs = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"--- Fold {fold + 1} ---")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # 转换为 DMatrix 格式
        dtrain = DMatrix(X_train, label=y_train)
        dval = DMatrix(X_val, label=y_val)
        dtest = DMatrix(X_test)

        params = {
            "objective": "binary:logistic",
            "learning_rate": 0.007,
            "max_depth": 4,
            "min_child_weight": 3,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "alpha": 0.1,
            "lambda": 1,
            "eval_metric": "auc",
            "seed": RANDOM_SEED,
        }

        # 训练模型
        model = train(params, dtrain, num_boost_round=2000, evals=[(dval, "validation")],
                      early_stopping_rounds=50, verbose_eval=50)

        # 验证集AUC
        val_pred = model.predict(dval)
        val_auc = roc_auc_score(y_val, val_pred)
        fold_aucs.append(val_auc)
        print(f"Fold {fold + 1} Validation AUC: {val_auc:.4f}")

        # 累加测试集预测
        test_predictions += model.predict(dtest) / kf.n_splits

    # 打印交叉验证结果
    print(f"Mean Validation AUC: {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")

    # 保存测试集预测结果
    test_data['readmitted_within_30days_pred'] = test_predictions
    averaged_predictions = test_data[['id', 'readmitted_within_30days_pred']].groupby('id').mean()
    averaged_predictions.to_csv('test_predictions_optimized.csv')
    print("Averaged predictions saved to 'test_predictions_optimized.csv'.")

    # 可视化特征重要性
    feature_importance['mi_score'] = mi_scores
    feature_importance = feature_importance.sort_values(by='mi_score', ascending=False).head(20)
    plt.barh(feature_importance['feature'], feature_importance['mi_score'])
    plt.xlabel('Mutual Information Score')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()
    plt.show()

if __name__ == "__main__":
    main()


random forest with notes

In [1]:
import pandas as pd
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import numpy as np

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Set the random seed for reproducibility
RANDOM_SEED = 3612

# Step 1: Load and inspect the data
print("Loading notes data...")
notes_df = pd.read_csv('notes.csv')

# Step 2: Text preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

notes_df['processed_text'] = notes_df['text'].apply(preprocess_text)

# Step 3: Extract TF-IDF features
print("Extracting TF-IDF features...")
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 3))  # Expand the n-gram range
text_features = tfidf.fit_transform(notes_df['processed_text']).toarray()

# Save the extracted features
text_features_df = pd.DataFrame(text_features, columns=tfidf.get_feature_names_out())
text_features_df['id'] = notes_df['id']
text_features_df.to_csv('notes_features_tfidf.csv', index=False)

# Step 4: Load training and testing data
print("Loading train and test data...")
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Merge the text features with train and test data
train_data = train_data.merge(text_features_df, on='id', how='left')
test_data = test_data.merge(text_features_df, on='id', how='left')

# Fill missing values with zero
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# Step 5: Prepare features and labels
feature_cols = [col for col in train_data.columns if col not in ['id', 'readmitted_within_30days', 'text', 'processed_text']]

# Check if all feature columns are numeric
non_numeric_cols = [col for col in feature_cols if train_data[col].dtype == 'object']
if non_numeric_cols:
    print(f"Non-numeric columns detected and excluded: {non_numeric_cols}")
    feature_cols = [col for col in feature_cols if col not in non_numeric_cols]

# Prepare training and testing data
X = train_data[feature_cols].values
y = train_data['readmitted_within_30days'].values
X_test = test_data[feature_cols].values

# Step 6: Cross-validation
print("Starting cross-validation...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

oof_preds = []
test_preds = []
auc_scores = []

# Define Random Forest parameters
rf_params = {
    "n_estimators": 100,         # Number of trees in the forest
    "max_depth": 10,             # Maximum depth of the tree
    "min_samples_split": 2,      # Minimum number of samples required to split an internal node
    "min_samples_leaf": 1,       # Minimum number of samples required to be at a leaf node
    "max_features": 'sqrt',      # Number of features to consider when looking for the best split
    "random_state": RANDOM_SEED,
    "n_jobs": -1                  # Use all available cores
}

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"--- Fold {fold + 1} ---")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Initialize the Random Forest classifier
    rf = RandomForestClassifier(**rf_params)
    
    # Train the model
    rf.fit(X_train, y_train)
    
    # Predict probabilities for validation and test sets
    val_preds = rf.predict_proba(X_val)[:, 1]
    test_preds_fold = rf.predict_proba(X_test)[:, 1]
    
    # Store the predictions
    oof_preds.extend(val_preds)
    test_preds.append(test_preds_fold)
    
    # Calculate and store the AUC score
    auc = roc_auc_score(y_val, val_preds)
    auc_scores.append(auc)
    print(f"Fold {fold + 1} Validation AUC: {auc:.4f}")

# Step 7: Average predictions for the test set
final_test_preds = np.mean(test_preds, axis=0)

# Step 8: Save the prediction results
test_data['readmitted_within_30days_pred'] = final_test_preds
averaged_predictions = (
    test_data.groupby('id', as_index=False)['readmitted_within_30days_pred']
    .mean()
)
averaged_predictions.to_csv('test_predictions_tfidf_cv_rf.csv', index=False)

# Calculate and print the mean and standard deviation of the AUC scores
mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)
print(f"Mean Validation AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print("Averaged predictions saved to 'test_predictions_tfidf_cv_rf.csv'.")


[nltk_data] Downloading package stopwords to /Users/jeff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jeff/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading notes data...
Extracting TF-IDF features...
Loading train and test data...
Non-numeric columns detected and excluded: ['dicom_id', 'ViewPosition', 'image_path']
Starting cross-validation...
--- Fold 1 ---
Fold 1 Validation AUC: 0.9977
--- Fold 2 ---
Fold 2 Validation AUC: 0.9990
--- Fold 3 ---
Fold 3 Validation AUC: 0.9971
--- Fold 4 ---
Fold 4 Validation AUC: 0.9979
--- Fold 5 ---
Fold 5 Validation AUC: 0.9969
Mean Validation AUC: 0.9977 ± 0.0007
Averaged predictions saved to 'test_predictions_tfidf_cv_rf.csv'.
