In [None]:
!pip install h2o
!pip install interpret
import zipfile
import pandas as pd
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
import xgboost as xgb
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [6]:
# Extract and Load Data
def extract_and_load(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        csv_filename = z.namelist()[0]
        with z.open(csv_filename) as f:
            df = pd.read_csv(f)
    return df

train_df = extract_and_load('/hmda_train_preprocessed.zip')
test_df = extract_and_load('/hmda_test_preprocessed.zip')

# Feature Selection
features = ['conforming', 'debt_to_income_ratio_std', 'debt_to_income_ratio_missing',
            'income_std', 'loan_amount_std', 'intro_rate_period_std', 'loan_to_value_ratio_std',
            'no_intro_rate_period_std', 'property_value_std', 'term_360']
target = 'high_priced'

X = train_df[features]
y = train_df[target]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize H2O
h2o.init()
h2o_train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
h2o_val = h2o.H2OFrame(pd.concat([X_val, y_val], axis=1))

# Train Elastic Net Logistic Regression with alpha tuning
best_glm_auc = 0
best_glm = None
for alpha in [0, 0.5, 1]:
    glm = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=alpha,
                                        remove_collinear_columns=True)
    glm.train(x=features, y=target, training_frame=h2o_train, validation_frame=h2o_val)
    glm_auc = glm.auc(valid=True)
    if glm_auc > best_glm_auc:
        best_glm_auc = glm_auc
        best_glm = glm

# Train XGBoost with hyperparameter tuning
xgb_params = {
    'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 4,
    'learning_rate': 0.05, 'min_child_weight': 5, 'subsample': 0.8, 'colsample_bytree': 0.8
}
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
bst = xgb.train(xgb_params, dtrain, num_boost_round=300, evals=[(dval, 'validation')],
                early_stopping_rounds=20)

# Train Explainable Boosting Machine (EBM) with interactions
ebm = ExplainableBoostingClassifier(interactions=10, outer_bags=10, n_jobs=-1)
ebm.fit(X_train, y_train)

# Evaluate models
phat_glm = best_glm.predict(h2o_val).as_data_frame()['p1']
phat_xgb = bst.predict(dval)
phat_ebm = ebm.predict_proba(X_val)[:, 1]

glm_auc = roc_auc_score(y_val, phat_glm)
xgb_auc = roc_auc_score(y_val, phat_xgb)
ebm_auc = roc_auc_score(y_val, phat_ebm)

print(f"GLM AUC: {glm_auc:.4f}, XGBoost AUC: {xgb_auc:.4f}, EBM AUC: {ebm_auc:.4f}")

# Predictions on Test Data
test_h2o = h2o.H2OFrame(test_df[features])
test_df['phat_glm'] = best_glm.predict(test_h2o).as_data_frame()['p1']
test_df['phat_xgb'] = bst.predict(xgb.DMatrix(test_df[features]))
test_df['phat_ebm'] = ebm.predict_proba(test_df[features])[:, 1]

# Save Submission Files
test_df[['phat_glm']].rename(columns={'phat_glm': 'phat'}).to_csv("/content/group4_glm.csv", index=False)
test_df[['phat_xgb']].rename(columns={'phat_xgb': 'phat'}).to_csv("/content/group4_xgb.csv", index=False)
test_df[['phat_ebm']].rename(columns={'phat_ebm': 'phat'}).to_csv("/content/group4_ebm.csv", index=False)

h2o.shutdown()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.26" 2025-01-21; OpenJDK Runtime Environment (build 11.0.26+4-post-Ubuntu-1ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.26+4-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.11/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpvmf_48qr
  JVM stdout: /tmp/tmpvmf_48qr/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpvmf_48qr/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,4 months and 24 days
H2O_cluster_name:,H2O_from_python_unknownUser_851w51
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
[0]	validation-auc:0.78161
[1]	validation-auc:0.79419
[2]	validation-auc:0.79798
[3]	validation-auc:0.79974
[4]	validation-auc:0.80195
[5]	validation-auc:0.80227
[6]	validation-auc:0.80414
[7]	validation-auc:0.80537
[8]	validation-auc:0.80641
[9]	validation-auc:0.80679
[10]	validation-auc:0.80732
[11]	validation-auc:0.80726
[12]	validation-auc:0.80757
[13]	validation-auc:0.80844
[14]	validation-auc:0.80872
[15]	validation-auc:0.80917
[16]	validation-auc:0.80905
[17]	validation-auc:0.80924
[18]	validation-auc:0.80917
[19]	validation-auc:0.80939
[20]	validation-auc:0.80993
[21]	validation-auc:0.81036
[22]	validation-auc:0.81068
[23]	validation-auc:0.81097
[24]	validation-auc:0.81111
[25]	validation-auc:0.81116
[26]	validation-auc:0.81




GLM AUC: 0.7621, XGBoost AUC: 0.8263, EBM AUC: 0.8251
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%





H2O session _sid_b601 closed.


  h2o.shutdown()


In [4]:
# # Extract and Load Data
# def extract_and_load(zip_path):
#     with zipfile.ZipFile(zip_path, 'r') as z:
#         csv_filename = z.namelist()[0]
#         with z.open(csv_filename) as f:
#             df = pd.read_csv(f)
#     return df

# train_df = extract_and_load('/hmda_train_preprocessed.zip')
# test_df = extract_and_load('/hmda_test_preprocessed.zip')

# # Feature Selection
# features = ['conforming', 'debt_to_income_ratio_std', 'debt_to_income_ratio_missing',
#             'income_std', 'loan_amount_std', 'intro_rate_period_std', 'loan_to_value_ratio_std',
#             'no_intro_rate_period_std', 'property_value_std', 'term_360']
# target = 'high_priced'

# X = train_df[features]
# y = train_df[target]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize H2O
# h2o.init()
# h2o_train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
# h2o_val = h2o.H2OFrame(pd.concat([X_val, y_val], axis=1))

# # Train Elastic Net Logistic Regression
# glm = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True)
# glm.train(x=features, y=target, training_frame=h2o_train, validation_frame=h2o_val)

# # Train XGBoost Model
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dval = xgb.DMatrix(X_val, label=y_val)
# params = {'objective': 'binary:logistic', 'eval_metric': 'auc', 'max_depth': 3, 'learning_rate': 0.1}
# bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'validation')], early_stopping_rounds=10)

# # Train Explainable Boosting Machine (EBM)
# ebm = ExplainableBoostingClassifier()
# ebm.fit(X_train, y_train)

# # Predictions on Test Data
# test_h2o = h2o.H2OFrame(test_df[features])
# test_df['phat_glm'] = glm.predict(test_h2o).as_data_frame()['p1']
# test_df['phat_xgb'] = bst.predict(xgb.DMatrix(test_df[features]))
# test_df['phat_ebm'] = ebm.predict_proba(test_df[features])[:, 1]

# # Save Submission Files
# test_df[['phat_glm']].rename(columns={'phat_glm': 'phat'}).to_csv("group4_glm.csv", index=False)
# test_df[['phat_xgb']].rename(columns={'phat_xgb': 'phat'}).to_csv("group4_xgb.csv", index=False)
# test_df[['phat_ebm']].rename(columns={'phat_ebm': 'phat'}).to_csv("group4_ebm.csv", index=False)

# h2o.shutdown()

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,4 mins 05 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,4 months and 24 days
H2O_cluster_name:,H2O_from_python_unknownUser_t3zhcq
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.164 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%
[0]	validation-auc:0.76718
[1]	validation-auc:0.76874
[2]	validation-auc:0.77373
[3]	validation-auc:0.77767
[4]	validation-auc:0.77761
[5]	validation-auc:0.78017
[6]	validation-auc:0.78210
[7]	validation-auc:0.78587
[8]	validation-auc:0.79003
[9]	validation-auc:0.79222
[10]	validation-auc:0.79308
[11]	validation-auc:0.79532
[12]	validation-auc:0.79788
[13]	validation-auc:0.79846
[14]	validation-auc:0.79856
[15]	validation-auc:0.80050
[16]	validation-auc:0.80136
[17]	validation-auc:0.80277
[18]	validation-auc:0.80340
[19]	validation-auc:0.80350
[20]	validation-auc:0.80447
[21]	validation-auc:0.80560
[22]	validation-auc:0.80664
[23]	validation-auc:0.80693
[24]	validation-auc:0.80751
[25]	validation-auc:0.80808
[26]	validation-auc:0.80851
[27]	validation-auc:0.80875
[28]	validation-auc:0.80924
[29]	validation-auc:0.80989
[30]	validation-auc:0.81015
[31]	validation-auc:0.81056
[32]	validation-auc:0.81077
[33]	validation-au




H2O session _sid_a632 closed.


  h2o.shutdown()
