In [80]:
import pandas as pd
data_path = "data/Training_Set.csv"
# Load the dataset
data = pd.read_csv("data/Training_Set.csv")

# Display the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,Patient_ID,Gender,Admission Year,Age At Admission,Length of Stay (Days),Primary Insurance,First Potassium Days From Admit,First Potassium Result,Last Potassium Days From Admit,Last Potassium Result,...,Hx_Vent,Hx_Cath,Hx_Renal_Failure,Hx_Pvd,Hx_Valve_Procedure,Hx_Dm,Hx_Ckd,Hx_Ihd,Hx_Aortic_Valve_Problem,Hx_Prior_Admit
0,1616,FEMALE,2020,89.03944,6.008976,TBD,0.150694,3.0,5.616078,4.0,...,False,False,False,False,False,False,False,False,False,False
1,5717,MALE,2020,69.42983,2.596738,TBD,0.1125,3.9,2.358333,4.1,...,False,False,False,False,False,True,False,False,False,True
2,5922,MALE,2019,67.465759,2.046528,TBD,,,,,...,False,False,False,False,False,True,True,False,False,True
3,2054,FEMALE,2019,61.347314,1.644444,TBD,0.033941,4.2,0.055349,3.9,...,False,False,True,False,False,True,True,False,False,True
4,5810,MALE,2019,83.347254,2.253531,TBD,0.278472,4.7,1.738194,3.8,...,False,False,False,False,False,True,False,False,False,True


In [81]:
# Reload the original dataset to analyze the original non-imputed and non-encoded form
data_original = pd.read_csv(data_path)

# Calculate the proportion of the most common value and NaN values for each column
info_summary = pd.DataFrame(index=data_original.columns, columns=['Most_Common_Value_Prop', 'NaN_Prop'])

for column in data_original.columns:
    most_common_value_prop = data_original[column].value_counts(normalize=True).iloc[0]
    nan_prop = data_original[column].isna().mean()
    info_summary.loc[column] = [most_common_value_prop, nan_prop]

# Identify columns where the most common value proportion is greater than 0.9 or NaN proportion is too high
columns_low_info = info_summary[(info_summary['Most_Common_Value_Prop'] > 0.9) | (info_summary['NaN_Prop'] > 0.9)]

columns_low_info


Unnamed: 0,Most_Common_Value_Prop,NaN_Prop
Primary Insurance,1.0,0.042611
Max Troponin I Result,0.20296,0.92002
Max Troponin I Days From Admit,0.014553,0.918668
Min Troponin I Result,0.247401,0.918668
Min Troponin I Days From Admit,0.008316,0.918668
First Troponin I Result,0.21822,0.920189
First Troponin I Days From Admit,0.010417,0.918837
Last Troponin I Result,0.233684,0.919682
Last Troponin I Days From Admit,0.008386,0.919344
Hx_Cabg,0.917146,0.0


In [82]:

# Columns to be removed based on the analysis
columns_to_remove = columns_low_info.index.tolist()

# Remove these columns from the original dataset
data_cleaned = data_original.drop(columns=columns_to_remove)

# Display the shape of the original and cleaned data to confirm the removal
original_shape = data_original.shape
cleaned_shape = data_cleaned.shape

original_shape, cleaned_shape

data = data_cleaned

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Impute missing values
# For numerical features, use median
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = SimpleImputer(strategy='median')
data[numeric_features] = numeric_transformer.fit_transform(data[numeric_features])

# For categorical features, use the most frequent value
categorical_features = data.select_dtypes(include=['object', 'bool']).columns
categorical_transformer = SimpleImputer(strategy='most_frequent')
data[categorical_features] = categorical_transformer.fit_transform(data[categorical_features])

# Encode categorical features
label_encoder = LabelEncoder()
for column in categorical_features:
    data[column] = label_encoder.fit_transform(data[column])

# Split the dataset into training and testing sets
# Assuming the last column is the target variable

X = data.drop(["1Yr_Death"],axis=1)  # Features
y = data["1Yr_Death"]  # Target variable
# X = X.drop(["Primary Insurance"],axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=42)

# Output the shape of the splits to confirm
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4139, 79), (1775, 79), (4139,), (1775,))

In [84]:
# Check if the target variable is for classification or regression
# We'll determine this based on the unique values in the target
unique_values_in_target = y.unique()
num_unique_values = len(unique_values_in_target)

# If the number of unique values is relatively small compared to the number of samples,
# and the target is binary or integer, we might be dealing with a classification problem.
# Otherwise, it's likely a regression problem.

# Let's print the information to decide
num_unique_values, unique_values_in_target[:10]  # Display up to 10 unique values to get a sense


(2, array([0, 1]))

In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training set
rf_classifier.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

accuracy


0.7509859154929578

In [86]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=100)
cat_model.fit(X_train, y_train, use_best_model=True)
accuracy = cat_model.score(X_test, y_test)
print(f'Accuracy: {accuracy}')

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.6649206	total: 10.2ms	remaining: 10.2s
100:	learn: 0.3821647	total: 788ms	remaining: 7.01s
200:	learn: 0.2690582	total: 1.52s	remaining: 6.04s
300:	learn: 0.1860208	total: 2.29s	remaining: 5.31s
400:	learn: 0.1354452	total: 3.03s	remaining: 4.53s
500:	learn: 0.1013923	total: 3.75s	remaining: 3.74s
600:	learn: 0.0766004	total: 4.55s	remaining: 3.02s
700:	learn: 0.0600304	total: 5.3s	remaining: 2.26s
800:	learn: 0.0489781	total: 6.06s	remaining: 1.51s
900:	learn: 0.0390859	total: 6.84s	remaining: 752ms
999:	learn: 0.0317900	total: 7.64s	remaining: 0us
Accuracy: 0.7543661971830986


In [67]:
# from catboost import CatBoostClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.impute import SimpleImputer
# import pandas as pd

# # 假设 data_cleaned 是您已经清理好的DataFrame

# # 填充缺失值
# numeric_features_cleaned = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
# categorical_features_cleaned = data_cleaned.select_dtypes(include=['object']).columns

# numeric_transformer_cleaned = SimpleImputer(strategy='median')
# data_cleaned[numeric_features_cleaned] = numeric_transformer_cleaned.fit_transform(data_cleaned[numeric_features_cleaned])

# categorical_transformer_cleaned = SimpleImputer(strategy='most_frequent', fill_value='missing')
# data_cleaned[categorical_features_cleaned] = categorical_transformer_cleaned.fit_transform(data_cleaned[categorical_features_cleaned])

# # 分割数据集
# X_cleaned = data_cleaned.drop(columns=['1Yr_Death'])  # 假定 '1Yr_Death' 是目标变量
# y_cleaned = data_cleaned['1Yr_Death'].astype(int)  # 确保目标变量是整型

# X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(
#     X_cleaned, y_cleaned, test_size=0.3, random_state=42)

# # 训练CatBoost分类器
# cat_features_indices = [X_cleaned.columns.get_loc(c) for c in categorical_features_cleaned if c in X_cleaned]

# cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, cat_features=cat_features_indices, verbose=100)
# cat_model.fit(X_train_cleaned, y_train_cleaned, eval_set=(X_test_cleaned, y_test_cleaned), use_best_model=True)

# # 评估模型
# accuracy = cat_model.score(X_test_cleaned, y_test_cleaned)
# print(f'Accuracy: {accuracy}')


In [88]:


# 假设您的模型保存在'model.pkl'文件中


KeyError: "['Hx_Cabg', 'Hx_Hypogly', 'Hx_Vent', 'Hx_Cath', 'Hx_Valve_Procedure', 'Hx_Ihd'] not in index"