In [41]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/a2/48/d5da8591891327b0faf08179d420fba3893c6134bdd449497c5329e4cb01/xgboost-2.1.0-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 93.9 kB/s eta 0:22:10
   ---------------------------------------- 0.0/124.9 MB 93.9 kB/s eta 0:22:10
   ---------------------------------------- 0.0/124.9 MB 140.9 kB/s eta 0:14:47
   ---------------------------------------- 0.1/124.9 MB 218.8 kB/s eta 0:09:31
   ---------------------------------------- 0.1/124.9 MB 218.6 kB/s eta 0:09:31
   ---------------------------------

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [73]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [74]:
numerical_features = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
categorical_features = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']

In [75]:
print(train.isnull().sum())

id                  0
N_Days              0
Drug             6634
Age                 0
Sex                 0
Ascites          6629
Hepatomegaly     6635
Spiders          6638
Edema               0
Bilirubin           0
Cholesterol      8526
Albumin             0
Copper           6715
Alk_Phos         6635
SGOT             6637
Tryglicerides    8569
Platelets         563
Prothrombin        20
Stage               0
Status              0
dtype: int64


In [76]:
train["Prothrombin"].unique()

array([10.2,  9.9, 10.5, 10.4, 11.2, 12. ,  9.5, 11.1, 10.6, 10.1, 11.5,
       10. , 10.8, 10.3, 11. , 11.4,  9.3, 10.9,  9.8,  9.7,  9.6, 13.6,
        nan, 11.3, 12.1, 11.9, 11.6, 11.7, 12.3, 13. , 10.7, 17.1, 12.7,
       11.8, 12.4, 13.8,  9.2, 13.3,  9. ,  9.4, 13.2, 12.9, 12.6, 12.2,
       17.4, 13.1, 18. , 14.1,  9.1, 15.2, 15. , 12.8])

In [77]:
# Preprocessing pipeline for numerical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Preprocessing pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [78]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Encode the target labels
label_encoder = LabelEncoder()
train['Status'] = label_encoder.fit_transform(train['Status'])

In [79]:
# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])

In [80]:
# Split the data
X = train.drop(columns=['id', 'Status'])
y = train['Status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
model.fit(X_train, y_train)
# Predict and evaluate
val_preds = model.predict_proba(X_val)

Parameters: { "use_label_encoder" } are not used.



In [82]:
# Convert predictions to a DataFrame
preds_df = pd.DataFrame(val_preds, columns=model.classes_)

In [86]:
# Apply transformations to avoid extremes
epsilon = 1e-15
preds_df = preds_df.clip(lower=epsilon, upper=1-epsilon)



In [87]:
val_log_loss = log_loss(y_val, preds_df)
print(f'Validation Log Loss: {val_log_loss}')

Validation Log Loss: 0.37587549859560015


In [91]:
# Process and predict probabilities on the test set
# test_features = test.drop(columns=['id'])
test_preds = model.predict_proba(test)

In [92]:
#  submission file format
submission = pd.DataFrame(test_preds, columns=label_encoder.classes_)
submission.insert(0, 'id', test['id'])
epsilon = 1e-15# Clipping probabilities to avoid extreme values
submission[label_encoder.classes_] = submission[label_encoder.classes_].clip(epsilon, 1 - epsilon)
# Normalize probabilities
submission[label_encoder.classes_] = submission[label_encoder.classes_].div(submission[label_encoder.classes_].sum(axis=1), axis=0)


if len(submission) == 10000 and list(submission.columns) == ['id'] + list(label_encoder.classes_):
    print("Submission file format is correct.")
else:
    print("Submission file format is incorrect. Please check the number of rows and columns.")

submission.to_csv('submissionnew.csv', index=False)

Submission file format is correct.
