In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

In [6]:
train_data=pd.read_csv("/content/Hazard_train.csv")
test_data=pd.read_csv("/content/Hazard_test_share.csv")

In [7]:
train_data["data"]="train"
test_data["data"]="test"

In [8]:
all_data=pd.concat([train_data,test_data],axis=0,sort=False)


In [9]:
if "data" not in all_data.columns:
  raise KeyError("'data'column is missing! Ensure it was addedto train and test sets.")
cat_cols=all_data.select_dtypes(["object"]).columns

if "data" not in cat_cols:
  cat_cols=cat_cols.append(pd.index(["data"]))

for col in cat_cols:
  k=all_data[col].value_counts()
  cats=k[k>=100].index
  for cat in cats:
    name=f"{col}_{cat}"
    all_data[name]=(all_data[col]==cat).astype(int)


In [10]:
all_data.drop(columns=[col for col in cat_cols if col !="data"],inplace=True,errors="ignore")


In [11]:
if "data" not in all_data.columns:
  raise KeyError("'data' column is misiing after preprocessing! Check encoding steps.")

In [12]:
x_train=all_data[all_data["data"]=="train"].drop(["Id","Hazard","data"],axis=1,errors="ignore")
y_train=all_data[all_data["data"]=="train"]["Hazard"]
x_test=all_data[all_data["data"]=="test"].drop(["Id","Hazard","data"],axis=1,errors="ignore")

In [13]:
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")

x_train shape: (40799, 109)
y_train shape: (40799,)
x_test shape: (10200, 109)


In [14]:
if x_train.empty or y_train.empty:
  raise ValueError("x_train or y_train is empty.check preprocessing steps!")


In [19]:
print(f"Train set size: {x_train.shape[0]}, Validation set size:")


Train set size: 40799, Validation set size:


In [20]:
model=HistGradientBoostingRegressor(loss="poisson",random_state=42)

In [21]:
X_train,X_val,Y_train,Y_val=train_test_split(x_train,y_train,test_size=0.2,random_state=42)


In [22]:
param_grid = {
    'max_iter': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [23]:
best_model=grid_search.best_estimator_

In [24]:
val_preds=best_model.predict(X_val)

In [25]:
mae=mean_absolute_error(Y_val,val_preds)
print(f"Validation MAE:{mae}")

Validation MAE:2.7181300829763426


In [26]:
test_preds=best_model.predict(x_test)

In [27]:
submission=pd.DataFrame({"Hazard":test_preds})
submission.to_csv("PAVAN_submission_v1.csv",index=False)
print("Submission file saved as PAVAN_submission_v1.csv")

Submission file saved as PAVAN_submission_v1.csv
