In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Import Dataset

In [10]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
polish_companies_bankruptcy = fetch_ucirepo(id=365) 
  
# data (as pandas dataframes) 
X = polish_companies_bankruptcy.data.features 
y = polish_companies_bankruptcy.data.targets 
  


## 2.1 Data Quality and Suitability


In [11]:
# metadata 
#print(polish_companies_bankruptcy.metadata) 
  
# variable information 
print(polish_companies_bankruptcy.variables) 

print(X.info)

#print(X.describe(include='all'))

     name     role        type demographic description units missing_values
0    year  Feature     Integer        None        None  None             no
1      A1  Feature  Continuous        None        None  None             no
2      A2  Feature  Continuous        None        None  None             no
3      A3  Feature  Continuous        None        None  None             no
4      A4  Feature  Continuous        None        None  None            yes
..    ...      ...         ...         ...         ...   ...            ...
61    A61  Feature  Continuous        None        None  None            yes
62    A62  Feature  Continuous        None        None  None             no
63    A63  Feature  Continuous        None        None  None            yes
64    A64  Feature  Continuous        None        None  None            yes
65  class   Target     Integer        None        None  None             no

[66 rows x 7 columns]
<bound method DataFrame.info of        year        A1       A2   

In [12]:

np.unique(y.values)

array([0, 1], dtype=int64)

## Missing Values

In [13]:
#mean impute

X_imputed = X.apply(lambda col: col.fillna(col.mean()), axis=0)
has_missing = X_imputed.isnull().any().any()

print("Are there any missing values?", has_missing)

Are there any missing values? False


 Due to there being a large amount of features I will first run a model to deterime the most important features then do a post hoc univariant analysis on said features

## Model Buidling

In [14]:
# Split data into train, validation, and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=1)
# 0.25 x 0.8 = 0.2, so 60% train, 20% validation, 20% test

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: (26043, 65)
Validation data shape: (8681, 65)
Test data shape: (8681, 65)


In [15]:
# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [16]:
# Set XGBoost parameters
params = {
    "objective": "reg:squarederror",  # For regression tasks
    "eval_metric": "rmse",
    "learning_rate": 0.1,
    "max_depth": 6,
    "n_estimators": 100,
    "seed": 42
}

# Train the model with validation set
eval_set = [(dtrain, "train"), (dval, "validation")]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=eval_set,
    early_stopping_rounds=10,
    verbose_eval=True
)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE) on Test Data: {rmse}")

[0]	train-rmse:0.20550	validation-rmse:0.20814
[1]	train-rmse:0.19791	validation-rmse:0.20089
[2]	train-rmse:0.19099	validation-rmse:0.19461
[3]	train-rmse:0.18500	validation-rmse:0.18928
[4]	train-rmse:0.17957	validation-rmse:0.18447
[5]	train-rmse:0.17483	validation-rmse:0.18061
[6]	train-rmse:0.17079	validation-rmse:0.17722
[7]	train-rmse:0.16750	validation-rmse:0.17469
[8]	train-rmse:0.16380	validation-rmse:0.17173
[9]	train-rmse:0.16077	validation-rmse:0.16941
[10]	train-rmse:0.15845	validation-rmse:0.16773
[11]	train-rmse:0.15648	validation-rmse:0.16632
[12]	train-rmse:0.15446	validation-rmse:0.16507
[13]	train-rmse:0.15276	validation-rmse:0.16379


Parameters: { "n_estimators" } are not used.



[14]	train-rmse:0.15037	validation-rmse:0.16169
[15]	train-rmse:0.14904	validation-rmse:0.16112
[16]	train-rmse:0.14782	validation-rmse:0.16042
[17]	train-rmse:0.14576	validation-rmse:0.15881
[18]	train-rmse:0.14426	validation-rmse:0.15764
[19]	train-rmse:0.14344	validation-rmse:0.15721
[20]	train-rmse:0.14164	validation-rmse:0.15599
[21]	train-rmse:0.14097	validation-rmse:0.15562
[22]	train-rmse:0.13995	validation-rmse:0.15474
[23]	train-rmse:0.13852	validation-rmse:0.15365
[24]	train-rmse:0.13788	validation-rmse:0.15345
[25]	train-rmse:0.13684	validation-rmse:0.15311
[26]	train-rmse:0.13534	validation-rmse:0.15209
[27]	train-rmse:0.13485	validation-rmse:0.15180
[28]	train-rmse:0.13437	validation-rmse:0.15159
[29]	train-rmse:0.13379	validation-rmse:0.15118
[30]	train-rmse:0.13314	validation-rmse:0.15076
[31]	train-rmse:0.13258	validation-rmse:0.15043
[32]	train-rmse:0.13119	validation-rmse:0.14946
[33]	train-rmse:0.13062	validation-rmse:0.14931
[34]	train-rmse:0.13004	validation-rmse: