In [7]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.impute import KNNImputer  # Import KNNImputer for missing value handling

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")  # Replace "train.csv" with your training file path
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")    # Replace "test.csv" with your test file path

# Data preprocessing (you may need to customize this based on your dataset)
# For simplicity, we'll encode categorical features using LabelEncoder
label_encoder = LabelEncoder()
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
for feature in categorical_features:
    train_data[feature] = label_encoder.fit_transform(train_data[feature])
    test_data[feature] = label_encoder.transform(test_data[feature])

# Split the training data into features and target variable
X = train_data.drop(columns=["RecordID", "hospital_death"])
y = train_data["hospital_death"]

# KNN impute missing values in both training and test data
knn_imputer = KNNImputer(n_neighbors=5)  # Set the number of neighbors
X_imputed = knn_imputer.fit_transform(X)
test_data_imputed = knn_imputer.transform(test_data.drop(columns=["RecordID"]))

# Feature selection using SelectKBest with 15 best features
selector = SelectKBest(score_func=f_classif, k=15)
X_top_15 = selector.fit_transform(X_imputed, y)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = train_data.columns[selected_feature_indices]

# Apply one-hot encoding to the selected features
X_top_15_encoded = pd.get_dummies(X[selected_feature_names], columns=selected_feature_names, drop_first=True)
test_data_top_15 = pd.get_dummies(test_data[selected_feature_names], columns=selected_feature_names, drop_first=True)

# Now you can proceed with fitting the model and making predictions
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=100,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
dt_model.fit(X_top_15_encoded, y)
dt_predictions = dt_model.predict_proba(test_data_top_15)

# Create submission DataFrames
submission_dt = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": dt_predictions[:, 1]})

# Save submission files to CSV
submission_dt.to_csv("submission_dt_with_imputation_and_feature_selection.csv", index=False)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- d1_potassium_max_3.62
- d1_potassium_max_3.86
- d1_potassium_max_4.01
- d1_potassium_max_4.06
- d1_potassium_max_4.12
- ...
Feature names seen at fit time, yet now missing:
- apache_4a_hospital_death_prob_0.99
- d1_potassium_max_3.55
- d1_potassium_max_3.67
- d1_potassium_max_3.68
- d1_potassium_max_3.78
- ...


In [10]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")  # Replace "train.csv" with your training file path
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")    # Replace "test.csv" with your test file path

# Data preprocessing
# For simplicity, we'll encode categorical features using LabelEncoder
label_encoder = LabelEncoder()
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
for feature in categorical_features:
    train_data[feature] = label_encoder.fit_transform(train_data[feature])
    test_data[feature] = label_encoder.transform(test_data[feature])

# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
X_categorical = encoder.fit_transform(train_data[categorical_features])
test_data_categorical = encoder.transform(test_data[categorical_features])

# Create DataFrame with column names for one-hot encoded features
encoded_feature_names = encoder.get_feature_names_out(input_features=categorical_features)

# Combine one-hot encoded categorical and non-categorical features
X = pd.concat([pd.DataFrame(X_categorical, columns=encoded_feature_names), train_data.drop(columns=["RecordID", "hospital_death"] + categorical_features)], axis=1)
test_data_processed = pd.concat([pd.DataFrame(test_data_categorical, columns=encoded_feature_names), test_data.drop(columns=["RecordID"] + categorical_features)], axis=1)

# Split the training data into features and target variable
y = train_data["hospital_death"]

# Train and make predictions using Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X, y)

# Get feature importances
importances = dt_model.feature_importances_

# Create a DataFrame to store feature names and their importances
feature_importance_df = pd.DataFrame({"Feature": X.columns, "Importance": importances})

# Sort features by importance (descending order)
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Print or save the feature importance DataFrame as needed
print(feature_importance_df)

# Make predictions using the trained model
dt_predictions = dt_model.predict_proba(test_data_processed)

# Create submission DataFrames
submission_dt = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": dt_predictions[:, 1]})

# Save submission files to CSV
submission_dt.to_csv("submission_dt3 - 14.csv", index=False)



                          Feature  Importance
1                     ethnicity_2    0.001343
54              gcs_verbal_apache    0.000924
19                     icu_type_5    0.000619
59              ventilated_apache    0.000464
23         apache_3j_bodysystem_2    0.000060
..                            ...         ...
87               d1_potassium_max         NaN
88  apache_4a_hospital_death_prob         NaN
89       apache_4a_icu_death_prob         NaN
90              immunosuppression         NaN
91    solid_tumor_with_metastasis         NaN

[92 rows x 2 columns]
