In [2]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 1.1 MB/s eta 0:01:55
   ---------------------------------------- 0.1/124.9 MB 1.3 MB/s eta 0:01:36
   ---------------------------------------- 0.2/124.9 MB 1.0 MB/s eta 0:02:03
   ---------------------------------------- 0.2/124.9 MB 1.1 MB/s eta 0:01:49
   ---------------------------------------- 0.3/124.9 MB 1.2 MB/s eta 0:01:42
   ---------------------------------------- 0.4/124.9 MB 1.2 MB/s eta 0:01:41
   ---------------------------------------- 0.5/124.9 MB 1.3 MB/s eta 0:01:40
   ---------------------------------------- 0.5/124.9 MB 1.3 MB/s eta 0:01:37
   ---------------------------------------- 0.6/124.9 MB 1.2 MB/s eta 0:01:42
   ---------------------------------------- 0.6/124.9 MB 1.2 MB/s eta 0:01:40
 


[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

# Read the training and test tables from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Feature columns, grouped by the preprocessing branch each one is sent through
numerical_features = [
    'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
    'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
]
categorical_features = [
    'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage',
]

# Numeric branch: fill missing values with the column median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (unknown categories are ignored instead of raising)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])
# Encode the target labels as integers. `label_encoder.classes_` keeps the
# original label names, which are reused as the submission column names.
# NOTE: this overwrites train['Status'] in place with the encoded values.
label_encoder = LabelEncoder()
train['Status'] = label_encoder.fit_transform(train['Status'])

# Define the model: preprocessing followed by an XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, so it only produced a warning.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='mlogloss', random_state=42)),
])

# Split the data: features are every column except the row id and the target
X = train.drop(columns=['id', 'Status'])
y = train['Status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing + classifier pipeline on the training split only
model.fit(X_train, y_train)

# Predict and evaluate on the held-out split.
# `labels` is passed explicitly so the probability columns are matched to the
# classes the model was fitted on; without it, log_loss raises if the
# validation split happens to contain fewer classes than the model predicts.
val_preds = model.predict_proba(X_val)
val_log_loss = log_loss(y_val, val_preds, labels=model.classes_)
print(f'Validation Log Loss: {val_log_loss}')

# Score the test rows (the id column is not a model feature)
test_features = test.drop(columns=['id'])
test_preds = model.predict_proba(test_features)

# Submission layout: id first, then one probability column per original label
class_columns = label_encoder.classes_
submission = pd.DataFrame(test_preds, columns=class_columns)
submission.insert(0, 'id', test['id'])

# Clip probabilities away from the extremes, then rescale each row to sum to 1
epsilon = 1e-15
clipped_probs = submission[class_columns].clip(epsilon, 1 - epsilon)
submission[class_columns] = clipped_probs
submission[class_columns] = clipped_probs.div(clipped_probs.sum(axis=1), axis=0)


# Sanity-check the submission layout (row count and column order) before saving
expected_columns = ['id'] + list(label_encoder.classes_)
row_count_ok = len(submission) == 10000
columns_ok = list(submission.columns) == expected_columns
if not (row_count_ok and columns_ok):
    print("Submission file format is incorrect. Please check the number of rows and columns.")
else:
    print("Submission file format is correct.")

# Write the submission without the DataFrame index column
submission.to_csv('submissionnew.csv', index=False)


Parameters: { "use_label_encoder" } are not used.



Validation Log Loss: 0.37587549859560015
Submission file format is correct.
