# In [8]:
# Step 1: Import the necessary libraries
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Step 2: Load and preprocess the training and test data
# Locations of the competition CSV files on the local machine.
train_path = "D:/IBA/ML/Competition/train.csv"
test_path = "D:/IBA/ML/Competition/test.csv"

# Read both files into DataFrames (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# NOTE: no columns are dropped in this step; ID-like columns (e.g., RecordID,
# hospital_id, icu_id) and missing values are left untouched here.

# Remember how many rows belong to the training set so the stacked frame can
# be cut back into its two parts after encoding.
n_train_rows = len(train_data)

# Stack train and test so both receive exactly the same dummy columns.
combined_data = pd.concat([train_data, test_data])

# Columns that are one-hot encoded (the first level of each is dropped).
categorical_features = [
    'ethnicity',
    'gender',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
    'apache_3j_bodysystem',
    'apache_2_bodysystem',
    'apache_2_diagnosis',
    'apache_3j_diagnosis',
]

combined_data = pd.get_dummies(
    combined_data,
    columns=categorical_features,
    drop_first=True,
)

# Cut the encoded frame back into train and test by row position.
train_data = combined_data.iloc[:n_train_rows]
test_data = combined_data.iloc[n_train_rows:]

# Define the features and target variable.
# RecordID is only a row identifier (it is written back out in the submission
# file), so it is kept out of the predictors: an arbitrary ID must not be
# scored by the feature selector or split on by the tree.
id_columns = ['RecordID']
feature_names = [
    column for column in train_data.columns
    if column != 'hospital_death' and column not in id_columns
]
X = train_data[feature_names]
y = train_data['hospital_death']

# Step 3: Train a Decision Tree classifier
# Split BEFORE fitting any preprocessing so the validation rows never
# influence the imputation means, the scaling ranges or the selected features.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing steps, all fitted on the training split only:
#   1. mean-impute missing values (other strategies: 'median', 'most_frequent');
#   2. rescale every feature to [0, 1] -- chi2 requires non-negative input, and
#      scaling keeps the ordering of negative values instead of flattening
#      them all to 0 (threshold splits of the tree are unaffected by this
#      monotone rescaling);
#   3. keep the 15 features with the highest chi-squared score.
imputer = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()
selector = SelectKBest(score_func=chi2, k=15)

X_train = selector.fit_transform(
    scaler.fit_transform(imputer.fit_transform(X_train_raw)), y_train
)


def _prepare_features(raw_features):
    """Run raw feature columns through the fitted imputer, scaler and selector.

    raw_features must contain the columns listed in feature_names. Returns the
    array of the 15 selected (imputed and scaled) features.
    """
    return selector.transform(scaler.transform(imputer.transform(raw_features)))


X_val = _prepare_features(X_val_raw)
# Selected features for every training row; kept under its previous name in
# case other notebook cells read X_new (not visible from this cell).
X_new = _prepare_features(X)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out split, using the same class-1 probabilities that
# are written to the submission.
val_auc = roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])
print("Validation ROC AUC: {:.4f}".format(val_auc))

# Step 4: Use the trained model to make predictions on the test data
X_test = test_data[feature_names]  # Use the same features used for training

# Apply the already-fitted preprocessing to the test rows (no re-fitting).
X_test_new = _prepare_features(X_test)

# Predict probabilities for class 1 (hospital_death)
test_predictions = clf.predict_proba(X_test_new)[:, 1]

# Step 5: Create a submission file in the required format
# One row per test record: its RecordID and the predicted probability.
submission_columns = {
    'RecordID': test_data['RecordID'],
    'hospital_death': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Step 6: Save the submission file
# The DataFrame index is not written to the CSV.
submission_path = "D:/IBA/ML/Competition/submission.csv"
submission_df.to_csv(submission_path, index=False)




