IMPORTS

In [None]:
import optuna
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier,StackingClassifier,BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, RobustScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, make_scorer,mean_squared_error
from catboost import CatBoostClassifier, Pool, cv
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')



FILES READ

In [None]:
# Load the training and test tables from their local CSV exports.
train_csv = "/Users/talalkhan/Documents/Data Sets/Second Challange/train.csv"
test_csv = "/Users/talalkhan/Documents/Data Sets/Second Challange/test.csv"
df1 = pd.read_csv(train_csv)  # read from train.csv
df2 = pd.read_csv(test_csv)   # read from test.csv


Handling categorical data

In [None]:
# One-hot encode the categorical columns of both frames with pandas.
# NOTE(review): each frame is encoded on its own, so the two results only end up
# with the same dummy columns if both files contain the same category values --
# confirm before feeding both frames to one model.
df1 = pd.get_dummies(data=df1)
df2 = pd.get_dummies(data=df2)



HighScore OneHotEncode

In [None]:
# Training data: 'price_doc' is the target, every other column is a feature.
y_train = df1['price_doc']
X_train = df1.drop(columns='price_doc')

# Test data: used as-is. X_test is another name for df2, not a copy.
X_test = df2


In [None]:

# One-hot encode the object-dtype columns. The encoder is fitted on the training
# features only and the same fitted encoder is then applied to the test features.
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# NOTE(review): `sparse=` is spelled `sparse_output=` in newer scikit-learn releases;
# adjust to the installed version.
# NOTE(review): transform() raises if the test data holds a category that was never
# seen during fit -- decide whether handle_unknown='ignore' is wanted.
encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded_train = encoder.fit_transform(X_train[categorical_cols])

# Column labels produced by the encoder, shared by the train and test frames.
feature_names_encoded = encoder.get_feature_names_out(categorical_cols)

# pd.concat(axis=1) aligns rows by index label, so the encoded frame must carry the
# same index as the frame it is joined to; a default RangeIndex would misalign rows
# (and introduce NaNs) whenever the source index is not exactly 0..n-1.
X_encoded_train_df = pd.DataFrame(
    X_encoded_train, columns=feature_names_encoded, index=X_train.index
)
X_train = pd.concat([X_train.drop(columns=categorical_cols), X_encoded_train_df], axis=1)

# Same treatment for the test features. A non-in-place drop is used on purpose:
# X_test may be an alias of df2, and an in-place drop would silently remove the
# categorical columns from df2 as well.
X_encoded_test = encoder.transform(X_test[categorical_cols])
X_encoded_test_df = pd.DataFrame(
    X_encoded_test, columns=feature_names_encoded, index=X_test.index
)
X_test = pd.concat([X_test.drop(columns=categorical_cols), X_encoded_test_df], axis=1)


In [None]:
# Alternative way to handle categorical data (disabled): label-encode a fixed list
# of columns instead of one-hot encoding them. The code lives inside a string
# literal, so none of it runs.
# NOTE(review): the disabled code fits a separate LabelEncoder on df2 and stores it
# under the same dictionary key, overwriting the encoder fitted on df1.
'''# List of categorical columns to label encode
categorical_columns = ['ethnicity', 'gender', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem']

# Initialize a LabelEncoder for each categorical column
label_encoders = {}
#label_encoders2 = {}

for column in categorical_columns:
    le = LabelEncoder()
    df1[column] = le.fit_transform(df1[column])
    label_encoders[column] = le

for column in categorical_columns:
    le2 = LabelEncoder()
    df2[column] = le2.fit_transform(df2[column])
    label_encoders[column] = le2'''



Dropping low correlation columns 

In [None]:
# Remove the columns listed below (described by the notebook as low-correlation)
# from both frames, so train and test keep the same layout.
low_corr_columns = [
    'ventilated_apache',
    'apache_4a_hospital_death_prob',
    'icu_stay_type_readmit',
    'apache_3j_bodysystem_Gynecological',
    'apache_2_bodysystem_Undefined Diagnoses',
]
df1 = df1.drop(columns=low_corr_columns)
df2 = df2.drop(columns=low_corr_columns)


Handling NaN values

In [None]:
'''
#using KNNImputer to handle nan values
#df1
imr = KNNImputer(n_neighbors=2500, weights='uniform')
imr = imr.fit(df1.values)
imputed_data1 = imr.transform(df1.values)
#df2
imr = KNNImputer(n_neighbors=2500, weights='uniform')
imr = imr.fit(df2.values)
imputed_data2 = imr.transform(df2.values)
'''

# Active variant: replace NaNs with the per-column mean (SimpleImputer).
# The KNNImputer variant above is switched off by living inside a string literal.

# Training frame
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data1 = imr.fit_transform(df1.values)

# Test frame: a fresh imputer is fitted on the test values themselves
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data2 = imr.fit_transform(df2.values)

# Wrap the imputed arrays back into DataFrames carrying the original column labels
df1 = pd.DataFrame(data=imputed_data1, columns=df1.columns)
df2 = pd.DataFrame(data=imputed_data2, columns=df2.columns)


Scaling / Other minmaxing

In [None]:
# Min-max scale every column except the 'RecordID' identifier, then put the
# identifier back in front of the scaled columns.
scaler = MinMaxScaler()

# Training frame
cols_to_scale = df1.columns[df1.columns != 'RecordID']
temp = df1.loc[:, ['RecordID']]
scaled_values = scaler.fit_transform(df1[cols_to_scale])
df1 = pd.concat(
    [temp, pd.DataFrame(scaled_values, columns=scaler.get_feature_names_out())],
    axis=1,
    join='inner',
)

# Test frame: the same scaler object is re-fitted on the test values
cols_to_scale = df2.columns[df2.columns != 'RecordID']
temp = df2.loc[:, ['RecordID']]
scaled_values = scaler.fit_transform(df2[cols_to_scale])
df2 = pd.concat(
    [temp, pd.DataFrame(scaled_values, columns=scaler.get_feature_names_out())],
    axis=1,
    join='inner',
)


Robust Scaler

In [None]:
# Robust scaling of every column except the 'RecordID' identifier; the identifier
# is put back in front of the scaled columns afterwards.
scaler = RobustScaler()

# Training frame
cols_to_scale = df1.columns[df1.columns != 'RecordID']
temp = df1.loc[:, ['RecordID']]
scaled_values = scaler.fit_transform(df1[cols_to_scale])
df1 = pd.concat(
    [temp, pd.DataFrame(scaled_values, columns=scaler.get_feature_names_out())],
    axis=1,
    join='inner',
)

# Test frame: the same scaler object is re-fitted on the test values
cols_to_scale = df2.columns[df2.columns != 'RecordID']
temp = df2.loc[:, ['RecordID']]
scaled_values = scaler.fit_transform(df2[cols_to_scale])
df2 = pd.concat(
    [temp, pd.DataFrame(scaled_values, columns=scaler.get_feature_names_out())],
    axis=1,
    join='inner',
)

Standard Scaler

In [None]:
# Standardize the data (mean=0, std=1): the scaler is fitted on df1 and the same
# fitted scaler is then applied to df2.
# NOTE(review): df1 is passed whole, so any target / identifier column still in it
# is scaled too, and df2 must have exactly the same columns as df1 -- confirm.
scaler = StandardScaler()
df1 = scaler.fit_transform(df1)
df2 = scaler.transform(df2)

# Project both sets onto the principal components fitted on df1.
# NOTE(review): fit_transform/transform presumably return NumPy arrays, so df1 and
# df2 stop being DataFrames here, while later cells still call df1.head(),
# df1.columns and df2['row ID'] -- confirm before running this cell.
pca = PCA(0.95)  # Retain 95% of the variance
df1 = pca.fit_transform(df1)
df2 = pca.transform(df2)

In [None]:
# Preview the first rows of df1 as a sanity check on the preprocessing so far.
df1.head()

DataSplitting

In [None]:
# Features are all columns other than the 'hospital_death' label.
is_feature = df1.columns != 'hospital_death'
X = df1.loc[:, is_feature]
y = df1['hospital_death']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
# NOTE: this rebinds X_train / X_test / y_train, replacing any values set by
# earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


MODELS

Linear Regression

In [None]:
# Select a plain LinearRegression as the current model. `clf` is rebound by
# whichever model cell is executed last.
clf = LinearRegression()

Gradient Boosting


In [None]:
# Gradient-boosted trees with hand-picked hyperparameters.
gb_params = {
    'n_estimators': 200,       # number of boosting stages
    'learning_rate': 0.05,     # shrinkage applied to each stage
    'max_depth': 6,            # depth limit of the individual trees
    'min_samples_split': 10,   # samples needed before a node may split
    'min_samples_leaf': 5,     # samples required in every leaf
    'subsample': 0.8,          # fraction of rows used to fit each tree
    'random_state': 42,        # fixed seed for reproducibility
}
clf = GradientBoostingClassifier(**gb_params)



CatBoost

In [None]:
# CatBoost classifier trained on log-loss and monitored with AUC.
clf = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    iterations=1700,
    learning_rate=0.01,
    depth=6,
    random_seed=42,
)

Adaptive Boost

In [None]:
# Weak learner boosted by AdaBoost: a depth-limited decision tree.
base_estimator = DecisionTreeClassifier(max_depth=7, random_state=42)

#base_estimator = GaussianNB() not good result

#base_estimator = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', n_estimators=330, max_depth=6, learning_rate= 0.03349791504065030, subsample=0.923491158880027, gamma=0.09694961288685062, reg_lambda=0.02716045699471643, min_child_weight=4.166361834440882, colsample_bytree=0.672977599702712, colsample_bylevel= 0.6497642793976, scale_pos_weight= 1.10373899695754, random_state=42)


# AdaBoost ensemble built on the weak learner above.
# The base learner is passed positionally (as the BaggingClassifier cell already
# does): scikit-learn renamed the `base_estimator` keyword to `estimator`, but the
# first positional parameter is the base learner under either name, so this form
# does not depend on the installed version.
clf = AdaBoostClassifier(
    base_estimator,
    learning_rate=0.01,
    n_estimators=150,
    random_state=42,
)


BaggingClassifier (DT)

In [None]:
# Base learner for bagging. Earlier experiments are kept below for reference.
#base_classifier = DecisionTreeClassifier(max_depth=9, min_samples_leaf=500, min_samples_split=7,random_state=42)

#base_classifier = GaussianNB() not good !!

xgb_base_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=376,
    learning_rate=0.0334925,
    max_depth=6,
    min_child_weight=4.166362,
    gamma=0.097,
    reg_lambda=0.0271605,
    subsample=0.7967162407706075,
    colsample_bytree=0.673,
    colsample_bylevel=0.65,
    scale_pos_weight=1.103739,
    random_state=42,
)
base_classifier = xgb.XGBClassifier(**xgb_base_params)

# Bag 350 copies of the XGBoost base learner.
clf = BaggingClassifier(base_classifier, n_estimators=350, random_state=42)


XGB

In [None]:
# Stand-alone XGBoost binary classifier with the tuned hyperparameters.
clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=376,
    learning_rate=0.0334925,
    max_depth=6,
    min_child_weight=4.166362,
    gamma=0.097,
    reg_lambda=0.0271605,
    subsample=0.7967162407706075,
    colsample_bytree=0.673,
    colsample_bylevel=0.65,
    scale_pos_weight=1.103739,
    random_state=42,
)



Light GBM

In [None]:
# LightGBM binary classifier used as the current model.
lgbm_params = {
    'objective': 'binary',
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 7,
    'num_leaves': 175,
    'min_child_samples': 25,
    'random_state': 42,
}
clf = lgb.LGBMClassifier(**lgbm_params)

RandomForest

In [None]:
# Random forest of 1000 depth-limited trees as the current model.
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
    max_features='sqrt',
    max_depth=7,
    min_samples_leaf=5,
    min_samples_split=5,
)


Extremely Randomized Trees (ExtraTrees)

In [None]:
# Extra-trees ensemble as the current model.
extra_trees_params = {
    'n_estimators': 100,       # trees in the forest
    'max_depth': 10,           # depth limit per tree
    'min_samples_split': 9,    # samples needed before a node may split
    'min_samples_leaf': 50,    # samples required in every leaf
    # 'max_features': 'sqrt',  # left at the library default on purpose
    'random_state': 42,
}
clf = ExtraTreesClassifier(**extra_trees_params)


Voting Classifier

In [None]:
# Members of the voting ensemble: one CatBoost, one LightGBM and one XGBoost model.
classifier1 = CatBoostClassifier(
    iterations=1700,
    depth=6,
    learning_rate=0.01,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
)
classifier2 = lgb.LGBMClassifier(
    objective='binary',
    learning_rate=0.1,
    n_estimators=800,
    max_depth=7,
    num_leaves=175,
    min_child_samples=25,
    random_state=42,
)
classifier3 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=330,
    max_depth=6,
    learning_rate=0.03349791504065030,
    subsample=0.923491158880027,
    gamma=0.09694961288685062,
    reg_lambda=0.02716045699471643,
    min_child_weight=4.166361834440882,
    colsample_bytree=0.672977599702712,
    colsample_bylevel=0.6497642793976,
    scale_pos_weight=1.10373899695754,
    random_state=42,
)

# Soft voting combines the members' predicted probabilities
# ('hard' would take a majority vote of the predicted labels instead).
voting_members = [('clf1', classifier1), ('clf2', classifier2), ('clf3', classifier3)]
clf = VotingClassifier(estimators=voting_members, voting='soft')

#clf = BaggingClassifier(clf1, n_estimators=100, random_state=42)



Stacking

In [None]:
# Final (second-level) estimator: a shallow XGBoost model fed with the base
# learners' outputs.
meta_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=395,
    learning_rate=0.015,
    max_depth=2,
    min_child_weight=4,
    gamma=0.09,
    reg_lambda=0.02,
    subsample=0.8,
    colsample_bytree=0.6,
    colsample_bylevel=0.62,
    scale_pos_weight=1.1,
    random_state=42,
)

# First-level learners: CatBoost, LightGBM and a separately configured XGBoost.
cat_learner = CatBoostClassifier(
    objective='Logloss',
    n_estimators=476,
    learning_rate=0.050,
    depth=8,
    l2_leaf_reg=8.0841,
    subsample=0.791,
    random_state=42,
)
lgb_learner = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=395,
    learning_rate=0.071,
    max_depth=10,
    num_leaves=11,
    lambda_l2=9.957,
    subsample=0.79,
    random_state=42,
)
xgb_learner = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=385,
    learning_rate=0.0335,
    max_depth=6,
    min_child_weight=4.167,
    gamma=0.0969,
    reg_lambda=0.0271,
    subsample=0.798,
    colsample_bytree=0.673,
    colsample_bylevel=0.65,
    scale_pos_weight=1.10,
    random_state=42,
)
estimators = [
    ('cat', cat_learner),
    ('lgb', lgb_learner),
    ('xgb', xgb_learner),
]

# Stack the three learners under the XGBoost final estimator ...
clf1 = StackingClassifier(estimators=estimators, final_estimator=meta_model)

# ... and bag 100 copies of the whole stack as the current model.
clf = BaggingClassifier(clf1, n_estimators=100, random_state=42)



GridSearchCV

In [None]:
# Search grid for CatBoost (disabled: kept inside a string literal).
'''param_grid = {
    'iterations': [500,100,1500],      # Number of iterations
    'depth': [4,7,9],                # Tree depth
    'learning_rate': [0.01,0.001],  # Learning rate
    'loss_function': ['Logloss'],       # Loss function
    'eval_metric': ['AUC'],             # Evaluation metric
}'''
# Search grid for LightGBM (active).
# NOTE(review): the grid is applied to whatever `clf` currently is -- make sure the
# LightGBM cell was the last model cell executed.
param_grid = dict(
    n_estimators=[200, 300, 500],       # boosting rounds
    learning_rate=[0.05, 0.1, 0.2],     # step size
    max_depth=[5, 7, 9],                # tree depth limit
    num_leaves=[31, 63, 127],           # leaves per tree
    min_child_samples=[10, 20, 30],     # minimum samples per leaf
)

# Exhaustive 3-fold search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best configuration and its cross-validated AUC.
best_params = grid_search.best_params_
best_auc = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best AUC Score:", best_auc)

Optuna Search

In [None]:
# Disabled Optuna objective for XGBoost. It is kept inside a string literal, so it
# does not run; the ExtraTrees objective defined after it is the active one.
'''def objective(trial):
    # Load and preprocess your training data (X, y)

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define XGBoost parameters to search
    params = {
        'objective': 'binary:logistic',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'n_estimators': trial.suggest_loguniform('n_estimators', 100,5000),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.5),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
    }

    # Create and train the XGBoost model
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    model = xgb.train(params, dtrain, evals=[(dval, 'eval')], early_stopping_rounds=50, verbose_eval=False)

    # Make predictions on the validation set
    y_prob = model.predict(dval)

    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y_val, y_prob)

    return roc_auc
'''
def objective(trial):
    """Optuna objective: hold-out ROC AUC of an ExtraTreesClassifier.

    Hyperparameters are sampled from ``trial``; the model is fitted on the
    module-level ``X_train``/``y_train`` and its class-1 probabilities on
    ``X_test`` are scored against ``y_test``.

    Returns the ROC AUC, which the study is expected to maximize.
    """
    # NOTE(review): max_features is sampled from 10..100 -- confirm the feature
    # matrix has at least that many columns.
    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 5, 30),
        min_samples_split=trial.suggest_float('min_samples_split', 0.01, 1.0),
        min_samples_leaf=trial.suggest_float('min_samples_leaf', 0.01, 0.5),
        max_features=trial.suggest_int('max_features', 10, 100),
        # 'bootstrap' is intentionally not part of the search space
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of class 1
    class1_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, class1_proba)



In [None]:

# Run the hyperparameter search. The objective returns a ROC AUC, so the study
# maximizes it. The number of trials can be adjusted.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Best configuration found and the score it achieved
best_params = study.best_params
best_auc = study.best_value

for report_line in (
    f'Best ROC AUC: {best_auc:.4f}',
    f'Best Parameters: {best_params}',
):
    print(report_line)


FIT

In [None]:
# Train the currently selected model (`clf`) on the training split.
clf.fit(X_train, y_train)


In [None]:
# Rebuild an ExtraTreesClassifier from the best hyperparameters found by the
# search and train it on the training split.
# NOTE(review): best_params is set by both the GridSearchCV and the Optuna cells;
# make sure it currently holds ExtraTrees parameters.
best_clf = ExtraTreesClassifier(random_state=42, **best_params)
best_clf.fit(X_train, y_train)

# Optional evaluation, left disabled:
#y_pred = best_clf.predict(df2)
#accuracy = accuracy_score(y_val, y_pred)


PREDICTIONS

In [None]:
# Predictions for X_test
y_pred = clf.predict(X_test)

# Class-1 probabilities for df2 (column 1 of predict_proba)
df2_probabilities = clf.predict_proba(df2)
pred = df2_probabilities[:, 1]

In [None]:
# Predict on the hold-out features.
y_pred = clf.predict(X_test)

# Score the predictions against the hold-out targets. The metric compares true
# values with predicted values, so it must receive y_test, not the feature
# matrix X_test.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')



Use for OneHotEncoder

In [None]:
# Predictions for X_test, for the flow where X_test was built by the
# OneHotEncoder cell.
y_pred = clf.predict(X_test)

# Class-1 probabilities for the same rows (column 1 of predict_proba)
test_probabilities = clf.predict_proba(X_test)
pred = test_probabilities[:, 1]


ACCURACY CHECK

In [None]:
#pred = md_pred[:,1]
# Show the predicted probabilities rounded to 6 decimal places
print(pred.round(6))

# Accuracy and per-class report on the hold-out split. Both metrics compare the
# true labels with the predicted labels, so they must receive y_test rather than
# the feature matrix X_test.
score = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % score)

print(classification_report(y_test, y_pred))

ROC Curve

In [None]:
# Class-1 probabilities for df2. `pred` is kept bound to these because the CSV
# export cell reads it.
pred = clf.predict_proba(df2)[:, 1]

# The ROC curve needs scores for the same rows as y_test, i.e. the hold-out
# features X_test. The df2 probabilities belong to different rows and have no
# labels to compare against.
val_pred = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, val_pred)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

# ROC AUC computed directly from the scores. The result is stored under its own
# name so the imported roc_auc_score function is not overwritten; otherwise
# re-running this cell, or the Optuna objective, would fail.
roc_auc_value = roc_auc_score(y_test, val_pred)
print(f'ROC AUC Score: {roc_auc_value:.4f}')

CSV FILE GENERATION

In [None]:
# Assemble the submission table: the 'row ID' column taken from df2, paired with
# the values in `pred` under the 'price_doc' header.
submission_columns = {'row ID': df2['row ID'], 'price_doc': pred}
results_df = pd.DataFrame(submission_columns)

# Write the table to disk without the DataFrame index
results_df.to_csv('submission2_25253.csv', index=False)

