In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif

# Define paths to your train and test data
train_path = "D:/IBA/ML/Competition/train.csv"
test_path = "D:/IBA/ML/Competition/test.csv"

# Load the data
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Separate numerical and categorical columns
numerical_columns = train_data.select_dtypes(include=np.number).columns.tolist()
numerical_columns.remove('hospital_death')
categorical_columns = train_data.columns.difference(numerical_columns).tolist()
target_column = 'hospital_death'

# Handle missing values using KNN imputation for numerical columns
imputer = KNNImputer(n_neighbors=5)
imputer.fit(train_data[numerical_columns])  # Fit the imputer on the numerical columns
train_data_imputed = train_data.copy()
test_data_imputed = test_data.copy()
train_data_imputed[numerical_columns] = imputer.transform(train_data[numerical_columns])
test_data_imputed[numerical_columns] = imputer.transform(test_data[numerical_columns])

# Perform one-hot encoding for categorical features AFTER imputation
encoder = OneHotEncoder(drop='first', sparse=False)

# Fit the encoder on the categorical columns from both training and test data
encoder.fit(pd.concat([train_data_imputed[categorical_columns], test_data_imputed[categorical_columns]]))

# Transform the categorical columns for both training and test data
X_train_encoded = encoder.transform(train_data_imputed[categorical_columns])
X_test_encoded = encoder.transform(test_data_imputed[categorical_columns])

# Combine the encoded categorical features with the numerical features
X_train = np.hstack((X_train_encoded, train_data_imputed[numerical_columns]))
X_test = np.hstack((X_test_encoded, test_data_imputed[numerical_columns]))
y_train = train_data[target_column]

# Use SelectKBest to perform feature selection
k_best = SelectKBest(score_func=f_classif, k=15)  # You can adjust the number of features (k) as needed
X_train_selected = k_best.fit_transform(X_train, y_train)
X_test_selected = k_best.transform(X_test)

# Split the data for model training and validation
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_selected, y_train, test_size=0.2, random_state=42)

# Train a Gaussian Naive Bayes classifier with hyperparameter tuning
gnb = GaussianNB()
gnb.fit(X_train_final, y_train_final)

# Make predictions on the validation set
y_val_pred = gnb.predict_proba(X_val)[:, 1]

# Calculate ROC AUC score for validation
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f'Validation ROC AUC: {roc_auc:.4f}')

# Make predictions on the test data
test_predictions = gnb.predict_proba(X_test_selected)[:, 1]

# Create a submission file in the required format
submission_df = pd.DataFrame({'RecordID': test_data['RecordID'], 'hospital_death': test_predictions})

# Save the submission file
submission_path = "D:/IBA/ML/Competition/submission_nb_with_feature_selection.csv"
submission_df.to_csv(submission_path, index=False)

KeyError: "['hospital_death'] not in index"

In [10]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, chi2

# Load training and test data
train_data = pd.read_csv("D:/IBA/ML/Competition/train.csv")  # Replace "train.csv" with your training file path
test_data = pd.read_csv("D:/IBA/ML/Competition/test.csv")    # Replace "test.csv" with your test file path

# Data preprocessing
# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
categorical_features = ["ethnicity", "gender", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
X_categorical = encoder.fit_transform(train_data[categorical_features])
test_data_categorical = encoder.transform(test_data[categorical_features])

# KNN impute missing values in both training and test data
knn_imputer = KNNImputer(n_neighbors=7)
numerical_features = [col for col in train_data.columns if col not in ["RecordID", "hospital_death"] + categorical_features]
X_numerical = train_data[numerical_features].values
test_data_numerical = test_data[numerical_features].values
X_numerical_imputed = knn_imputer.fit_transform(X_numerical)
test_data_numerical_imputed = knn_imputer.transform(test_data_numerical)

# Create feature names for one-hot encoded columns
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Combine one-hot encoded categorical and KNN imputed numerical features
X = pd.DataFrame(X_categorical, columns=encoded_feature_names)
X[numerical_features] = pd.DataFrame(X_numerical_imputed, columns=numerical_features)

test_data_processed = pd.DataFrame(test_data_categorical, columns=encoded_feature_names)
test_data_processed[numerical_features] = pd.DataFrame(test_data_numerical_imputed, columns=numerical_features)

# Apply Min-Max scaling to ensure non-negative values
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data_processed)

# Select the top 10 features based on statistical significance (chi-squared)
selector = SelectKBest(chi2, k=10)
X_selected = selector.fit_transform(X_scaled, train_data["hospital_death"])
test_data_selected = selector.transform(test_data_scaled)

# Target variable
y = train_data["hospital_death"]

# Train Naive Bayes on the selected features
nb_model = GaussianNB()
nb_model.fit(X_selected, y)

# Make predictions using the trained model
nb_predictions = nb_model.predict_proba(test_data_selected)

# Create submission DataFrames
submission_nb = pd.DataFrame({"RecordID": test_data["RecordID"], "hospital_death": nb_predictions[:, 1]})

# Save submission files to CSV
submission_nb.to_csv("submission_nb2 - 13.csv", index=False)


