In [4]:
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier,StackingClassifier,BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import ParameterGrid



In [2]:
# Load your training dataset (replace 'train.csv' with your actual training data file)
train_data = pd.read_csv('/Users/talalkhan/Documents/Data Sets/train.csv')

# Load your test dataset (replace 'test.csv' with your actual test data file)
test_data = pd.read_csv('/Users/talalkhan/Documents/Data Sets/test.csv')

# Separate the features (X) and target variable (y) for training data
X_train = train_data.drop('hospital_death', axis=1)
y_train = train_data['hospital_death']

# Separate the features (X) and target variable (y) for test data
X_test = test_data


In [3]:

# Identify categorical columns
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Perform one-hot encoding for categorical columns for training data
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded_train = encoder.fit_transform(X_train[categorical_cols])

# Get the feature names after one-hot encoding
feature_names_encoded = encoder.get_feature_names_out(categorical_cols)
X_encoded_train_df = pd.DataFrame(X_encoded_train, columns=feature_names_encoded)
X_train.drop(categorical_cols, axis=1, inplace=True)
X_train = pd.concat([X_train, X_encoded_train_df], axis=1)

# Perform one-hot encoding for categorical columns for test data
X_encoded_test = encoder.transform(X_test[categorical_cols])
X_encoded_test_df = pd.DataFrame(X_encoded_test, columns=feature_names_encoded)
X_test.drop(categorical_cols, axis=1, inplace=True)
X_test = pd.concat([X_test, X_encoded_test_df], axis=1)




In [9]:


# Initialize the XGBoost classifier
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=330,
    max_depth=6,
    learning_rate= 0.03349791504065030,
    subsample=0.923491158880027,         # You can adjust this
    random_state=42,    # For reproducibility
    gamma=0.09694961288685062, 
    reg_lambda=0.02716045699471643,
    min_child_weight=4.166361834440882,
    colsample_bytree=0.672977599702712,
    colsample_bylevel= 0.6497642793976,
    scale_pos_weight= 1.10373899695754
)



NameError: name 'y_test' is not defined

In [11]:
model = CatBoostClassifier(iterations=1700, 
                         depth=6, 
                         learning_rate=0.01, 
                         loss_function='Logloss', 
                         eval_metric='AUC',
                         random_seed=42,)

In [5]:
base_classifier = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=330,
    max_depth=6,
    learning_rate= 0.03349791504065030,
    subsample=0.923491158880027,         # You can adjust this
    random_state=42,    # For reproducibility
    gamma=0.09694961288685062, 
    reg_lambda=0.02716045699471643,
    min_child_weight=4.166361834440882,
    colsample_bytree=0.672977599702712,
    colsample_bylevel= 0.6497642793976,
    scale_pos_weight= 1.10373899695754
)
#apply boosting
model = BaggingClassifier(base_classifier, n_estimators=500, random_state=42)


In [12]:
#CAT BOOST
# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'iterations': [1500,1700,2000],      # Number of iterations
    'depth': [6,9,10],                # Tree depth
    'learning_rate': [0.01,0.001],  # Learning rate
    'loss_function': ['Logloss'],       # Loss function
    'eval_metric': ['AUC'],             # Evaluation metric
}
'''#LIGHTGBM
param_grid = {
    'n_estimators': [200,300,500],    # Number of boosting rounds
    'learning_rate': [0.05, 0.1, 0.2],  # Learning rate
    'max_depth': [5, 7, 9],            # Maximum depth of trees
    'num_leaves': [31, 63, 127],       # Maximum number of leaves in one tree
    'min_child_samples': [10, 20, 30],  # Minimum number of data points in leaves
}'''


# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)

# Perform GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and their corresponding AUC score
best_params = grid_search.best_params_
best_auc = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best AUC Score:", best_auc)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
0:	total: 112ms	remaining: 2m 48s
1:	total: 146ms	remaining: 1m 49s
0:	total: 156ms	remaining: 3m 53s
0:	total: 107ms	remaining: 2m 40s
2:	total: 161ms	remaining: 1m 20s
1:	total: 124ms	remaining: 1m 32s
1:	total: 181ms	remaining: 2m 15s
2:	total: 139ms	remaining: 1m 9s
2:	total: 203ms	remaining: 1m 41s
3:	total: 164ms	remaining: 1m 1s
3:	total: 207ms	remaining: 1m 17s
0:	total: 90.7ms	remaining: 2m 34s
4:	total: 191ms	remaining: 57s
3:	total: 244ms	remaining: 1m 31s
0:	total: 89.2ms	remaining: 2m 13s
5:	total: 201ms	remaining: 50.1s
4:	total: 253ms	remaining: 1m 15s
1:	total: 131ms	remaining: 1m 51s
0:	total: 132ms	remaining: 3m 17s
0:	total: 111ms	remaining: 2m 46s
1:	total: 117ms	remaining: 1m 27s
4:	total: 285ms	remaining: 1m 25s
1:	total: 139ms	remaining: 1m 43s
5:	total: 290ms	remaining: 1m 12s
1:	total: 168ms	remaining: 2m 6s
6:	total: 251ms	remaining: 53.5s
5:	total: 306ms	remaining: 1m 16s
2:	total: 151ms	remaining: 

In [6]:
# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)
y_probabilities = model.predict_proba(X_test)[:, 1]


In [7]:
record_ids = test_data['RecordID']
results_df = pd.DataFrame({'RecordID': record_ids, 'PredictedValue': y_probabilities})
results_df.to_csv('submission92_25253.csv', index=False)
