In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cirrhosis-patient-survival-prediction/cirrhosis.csv
/kaggle/input/playground-series-s3e26/sample_submission.csv
/kaggle/input/playground-series-s3e26/train.csv
/kaggle/input/playground-series-s3e26/test.csv


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelEncoder
from sklearn.decomposition import FastICA  # Import ICA
import lightgbm as lgb
from sklearn.metrics import log_loss
from pathlib import Path

In [3]:
# Set path
path = Path('/kaggle/input/playground-series-s3e26')
original = pd.read_csv('/kaggle/input/cirrhosis-patient-survival-prediction/cirrhosis.csv')
data = pd.read_csv(str(path)+'/train.csv')
test_df = pd.read_csv(str(path)+'/test.csv')


In [4]:
# Concatenate data
data = pd.concat([original.drop('ID', axis=1), data.drop('id', axis=1)])



In [5]:
# Engineer Numerical Features
def engineer_numerical_features(df):
    # Calculate the age at the time of outcome event
    df['Age_at_Outcome'] = df['Age'] + df['N_Days'] / 365.25
    
    # Feature engineering for other numerical columns if needed
    
    return df

data = engineer_numerical_features(data)
test_df = engineer_numerical_features(test_df)



In [6]:
# ICA Class Features for new numerical features
def get_ica_class_features(feat, n):
    ica = FastICA(n_components=n, random_state=42)
    ica_result = ica.fit_transform(data[feat].fillna(data[feat].median()).values.reshape(-1, 1))
    
    data[f'{feat}_class'] = np.argmax(ica_result, axis=1)
    
    ica_result_test = ica.transform(test_df[feat].fillna(test_df[feat].median()).values.reshape(-1, 1))
    test_df[f'{feat}_class'] = np.argmax(ica_result_test, axis=1)



In [7]:
# Applying ICA with reduced components

get_ica_class_features('Bilirubin', 4)
get_ica_class_features('Albumin', 4)
get_ica_class_features('Platelets', 3)
get_ica_class_features('Prothrombin', 3)
get_ica_class_features('Stage', 3)
get_ica_class_features('Cholesterol', 3)
get_ica_class_features('Age', 4)
get_ica_class_features('Copper', 3)



In [8]:
# Define columns
numerical_columns = [col for col in data.columns if data[col].dtype in ['int64', 'float64'] and col != 'id']
categorical_columns = [col for col in data.columns if data[col].dtype == 'object' and col != 'Status']


In [9]:
# Data preprocessing 
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_preprocessor, numerical_columns),
    ("categorical", categorical_preprocessor, categorical_columns)
])

le = LabelEncoder()
lgb_params = {'max_depth': 15, 'min_child_samples': 13, 'learning_rate': 0.05285597081335651, 'n_estimators': 284,
              'min_child_weight': 5, 'colsample_bytree': 0.10012816493265511, 'reg_alpha': 0.8767668608061822,
              'reg_lambda': 0.8705834466355764}
target_col = ['Status']
drop_col = ['id']

train_cols = numerical_columns + categorical_columns
model_dict = {}
log_scores = []

test_predict_list = list()

In [10]:
# Training models
for i in range(10):
    rkf = RepeatedKFold(n_splits=12, n_repeats=1, random_state=42 + i)
    oof_valid_probs = np.zeros((data.shape[0], 3))
    
    for fold, (train_idx, valid_idx) in enumerate(rkf.split(data)):
        X_train, y_train = data.iloc[train_idx][train_cols], data.iloc[train_idx][target_col].values.ravel()
        X_valid, y_valid = data.iloc[valid_idx][train_cols], data.iloc[valid_idx][target_col].values.ravel()

        X_train = preprocessor.fit_transform(X_train)
        X_valid = preprocessor.transform(X_valid)

        y_train = le.fit_transform(y_train)
        y_valid = le.transform(y_valid)

        model = lgb.LGBMClassifier(**lgb_params)
        early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False)
        verbose_callback = lgb.log_evaluation(150)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                  callbacks=[early_stopping_callback, verbose_callback], eval_metric='multi_logloss')
        valid_probs = model.predict_proba(X_valid)
        oof_valid_probs[valid_idx] = valid_probs

        test = preprocessor.transform(engineer_numerical_features(test_df).drop('id', axis=1))

        test_probs = model.predict_proba(test)
        test_predict_list.append(test_probs)
        score = log_loss(y_valid, valid_probs)
        model_dict[f'{fold}_{i}'] = model

    oof_log_score = log_loss(data[target_col], oof_valid_probs)
    log_scores.append(oof_log_score)

[150]	valid_0's multi_logloss: 0.429702
[150]	valid_0's multi_logloss: 0.431574
[150]	valid_0's multi_logloss: 0.414229
[150]	valid_0's multi_logloss: 0.406261
[150]	valid_0's multi_logloss: 0.436352
[150]	valid_0's multi_logloss: 0.396809
[150]	valid_0's multi_logloss: 0.442593
[150]	valid_0's multi_logloss: 0.437203
[150]	valid_0's multi_logloss: 0.438524
[150]	valid_0's multi_logloss: 0.413714
[150]	valid_0's multi_logloss: 0.408331
[150]	valid_0's multi_logloss: 0.386799
[150]	valid_0's multi_logloss: 0.416702
[150]	valid_0's multi_logloss: 0.437109
[150]	valid_0's multi_logloss: 0.418752
[150]	valid_0's multi_logloss: 0.371486
[150]	valid_0's multi_logloss: 0.438774
[150]	valid_0's multi_logloss: 0.432256
[150]	valid_0's multi_logloss: 0.395999
[150]	valid_0's multi_logloss: 0.402376
[150]	valid_0's multi_logloss: 0.451433
[150]	valid_0's multi_logloss: 0.418736
[150]	valid_0's multi_logloss: 0.455489
[150]	valid_0's multi_logloss: 0.410922
[150]	valid_0's multi_logloss: 0.407553


In [11]:
final_probs = np.mean(test_predict_list, axis=0)


In [12]:
submit = pd.DataFrame({'id': test_df['id'],
                       'Status_C': final_probs[:, 0],
                       'Status_CL': final_probs[:, 1],
                       'Status_D': final_probs[:, 2]})
submit.to_csv('submission.csv', index=False)
submit

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.352502,0.021366,0.626132
1,7906,0.529084,0.188316,0.282600
2,7907,0.041441,0.015010,0.943549
3,7908,0.976259,0.003244,0.020497
4,7909,0.852901,0.053024,0.094075
...,...,...,...,...
5266,13171,0.885462,0.057327,0.057212
5267,13172,0.957784,0.008039,0.034177
5268,13173,0.918988,0.021093,0.059919
5269,13174,0.978573,0.004613,0.016813
