In [56]:
# import os

# if os.path.exists("submission.csv"):
#     os.remove("submission.csv")

# if os.path.exists("best_model.pth"):
#     os.remove("best_model.pth")

# Imports

In [57]:
import pandas as pd
import numpy as np

from scipy.stats import spearmanr
from scipy import stats
from scipy.stats import boxcox

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets

In [58]:
# Read the competition train/test CSVs from the Kaggle input mount.
# `test_df` keeps its `id` column; the submission cell reads it back.
train_dataset = pd.read_csv("/kaggle/input/nti-r-1-beyond/train.csv")
test_df = pd.read_csv("/kaggle/input/nti-r-1-beyond/test.csv")

# Preprocessing Functions

**One Hot Encoding**

In [59]:
def one_hot_encoding(dataset, cols):
    """Replace categorical columns with 0/1 indicator columns.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame holding the columns to encode. It is not modified.
    cols : str or iterable of str
        Column name(s) to encode. A single name may be passed as a plain
        string.

    Returns
    -------
    pd.DataFrame
        New frame in which every column of ``cols`` is dropped and its
        ``<col>_<value>`` integer indicator columns are appended at the end.
    """
    # A bare string would otherwise be iterated character by character
    # ("Drug" -> "D", "r", ...) and fail on the first lookup.
    if isinstance(cols, str):
        cols = [cols]

    for col in cols:
        dummies = pd.get_dummies(dataset[col], prefix=col).astype(int)
        dataset = pd.concat([dataset.drop(columns=[col]), dummies], axis=1)

    return dataset

**Label Encoding**

In [60]:
def label_encoding(train_df, val_df, target_col):
    """Integer-encode the target column of both frames with one shared encoder.

    The encoder is fitted on ``train_df[target_col]`` and then reused for
    ``val_df[target_col]``. Both frames are modified in place and returned
    as ``(train_df, val_df)``.
    """
    encoder = LabelEncoder()

    # Fit on the training labels and overwrite them with their integer codes
    train_codes = encoder.fit_transform(train_df[target_col])
    train_df[target_col] = train_codes.astype(int)

    # Validation labels reuse the mapping learned from train
    val_codes = encoder.transform(val_df[target_col])
    val_df[target_col] = val_codes.astype(int)

    return train_df, val_df

**Check the distribution and outliers**

In [61]:
def outliers_check(dataset, col):
    """Print summary statistics and IQR outliers for one column, then plot it.

    Prints ``describe()``, the 1.5*IQR bounds, the sorted values outside
    those bounds and the column skewness, then shows a histogram with KDE
    and a boxplot. Returns nothing and does not modify ``dataset``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame containing ``col``.
    col : str
        Name of the numeric column to inspect.
    """
    print(f"--- Summary for {col} ---")
    print(dataset[col].describe())

    # Detect outliers using IQR
    Q1 = dataset[col].quantile(0.25)
    Q3 = dataset[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Boolean masks, one per tail; `outliers` holds the matching rows
    lower_outliers = dataset[col] < lower_bound
    upper_outliers = dataset[col] > upper_bound
    outliers = dataset[lower_outliers | upper_outliers]

    # Print thresholds and counts
    print(f"\nLower bound: {lower_bound:.3f}")
    print(f"Upper bound: {upper_bound:.3f}")

    # View Outliers
    print(f"\nTotal number of outliers: {outliers.shape[0]}")
    lower_vals = np.sort(dataset.loc[lower_outliers, col].values)
    upper_vals = np.sort(dataset.loc[upper_outliers, col].values)

    # The value lists are only printed when at least one outlier exists
    if not outliers.empty:
        print(f"Lower Outliers ({len(lower_vals)}): {lower_vals}")
        print(f"Upper Outliers ({len(upper_vals)}): {upper_vals}")

    # Skewness check
    print(f"Skewness ({col}): {dataset[col].skew()}\n")

    # Plot distribution
    plt.figure(figsize=(10, 4))
    sns.histplot(dataset[col], bins=30, kde=True, color='skyblue')
    plt.title(f'{col} Distribution')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()

    # Plot boxplot
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=dataset[col], color='lightgreen')
    plt.title(f'{col} Boxplot')
    plt.show()


**Log Transformation**

In [62]:
def log_transform(train_df, val_df, test_df, cols):
    """Apply ``log1p`` to the given columns of the train/val/test frames.

    For every column a shift is derived from the TRAINING minimum only:
    ``abs(min) + 1`` when that minimum is <= 0, otherwise 0. The same shift
    is then added to all three frames before ``np.log1p``.

    The inputs are left untouched; transformed copies are returned. This
    keeps the calling cell idempotent, since re-running it on mutated frames
    would apply the logarithm twice.

    Parameters
    ----------
    train_df, val_df, test_df : pd.DataFrame
        Frames that all contain the columns in ``cols``.
    cols : str or iterable of str
        Column name(s) to transform. Repeated names are transformed once.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train, val, test)`` copies with the transformed columns.

    Notes
    -----
    Validation/test values far below the training minimum can still fall
    outside the domain of ``log1p`` and come out as NaN/-inf.
    """
    if isinstance(cols, str):
        cols = [cols]
    # Drop repeated names (order preserved) so no column is logged twice
    cols = list(dict.fromkeys(cols))

    shifts = {}
    for col in cols:
        train_min = train_df[col].min()
        shifts[col] = abs(train_min) + 1 if train_min <= 0 else 0

    transformed = []
    for frame in (train_df, val_df, test_df):
        frame = frame.copy()
        for col in cols:
            frame[col] = np.log1p(frame[col] + shifts[col])
        transformed.append(frame)

    return tuple(transformed)

**Boxcox train transformation**

In [63]:
def boxcox_train(dataset, cols):
    """Fit a Box-Cox transform per column and return the transformed copy.

    Each column is shifted by ``abs(min) + 1`` when its minimum is <= 0
    (Box-Cox needs strictly positive data), then ``scipy.stats.boxcox``
    estimates lambda and transforms the column.

    The input frame is not modified. Repeated names in ``cols`` are fitted
    only once; refitting an already-transformed column would silently
    produce a different lambda and a double transform.

    Parameters
    ----------
    dataset : pd.DataFrame
        Training frame containing ``cols``.
    cols : str or iterable of str
        Column name(s) to transform.

    Returns
    -------
    (pd.DataFrame, dict, dict)
        Transformed copy, ``{col: lambda}`` and ``{col: shift}``, for reuse
        with ``boxcox_apply`` on validation/test data.
    """
    if isinstance(cols, str):
        cols = [cols]

    transformed_df = dataset.copy()
    fitted_lambdas = {}
    shifts = {}

    # dict.fromkeys drops duplicates while keeping the caller's order
    for col in dict.fromkeys(cols):
        min_val = dataset[col].min()
        shift = abs(min_val) + 1 if min_val <= 0 else 0

        transformed, fitted_lambda = boxcox((dataset[col] + shift).to_numpy())
        transformed_df[col] = transformed

        fitted_lambdas[col] = fitted_lambda
        shifts[col] = shift

    return transformed_df, fitted_lambdas, shifts

**Boxcox test/val transformation**

In [64]:
def boxcox_apply(dataset, cols, fitted_lambdas, shifts):
    """Apply previously fitted Box-Cox parameters to another frame.

    Parameters
    ----------
    dataset : pd.DataFrame
        Validation/test frame containing ``cols``. It is not modified.
    cols : str or iterable of str
        Column name(s) to transform. Repeated names are transformed once.
    fitted_lambdas : dict
        ``{col: lambda}`` as returned by ``boxcox_train``.
    shifts : dict
        ``{col: shift}`` as returned by ``boxcox_train``.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataset`` with the transformed columns. Values that are
        not strictly positive after the shift yield NaN/-inf, as in SciPy.
    """
    if isinstance(cols, str):
        cols = [cols]

    result = dataset.copy()
    # De-duplicate so a repeated column name is not transformed twice
    for col in dict.fromkeys(cols):
        shifted = (dataset[col] + shifts[col]).to_numpy()
        result[col] = boxcox(shifted, lmbda=fitted_lambdas[col])
    return result


**Compare skewness between two similar columns**

In [65]:
def compare_skewness(dataset, col, new_col):
    """Show side-by-side 50-bin histograms of two columns of ``dataset``.

    ``col`` is drawn in the left panel (titled "Original <col>") and
    ``new_col`` in the right panel. Returns nothing.
    """
    plt.figure(figsize=(10, 5))
    # Left panel: the original column
    plt.subplot(1, 2, 1)
    dataset[col].hist(bins=50)
    plt.title(f"Original {col}")

    # Right panel: the column being compared against it
    plt.subplot(1, 2, 2)
    dataset[new_col].hist(bins=50)
    plt.title(new_col)
    plt.show()


**Iterative Imputing**

In [66]:
def fit_predictive_imputer(df, random_state=42):
    """Fit imputers on ``df`` and return the imputed copy together with them.

    Numeric columns are imputed jointly by one ``IterativeImputer`` backed by
    a random forest. Every object-dtype column gets its own imputer that is
    fitted on that column's integer category codes alone.

    Returns
    -------
    (pd.DataFrame, IterativeImputer, dict)
        Imputed copy of ``df`` (object columns come back as Categorical),
        the numeric imputer, and ``{column: imputer}`` for the object columns.

    NOTE(review): in the categorical path missing values are encoded as -1
    (not NaN) before the imputer sees them, and the codes are later clipped
    to ``[0, n_categories - 1]``. Presumably the imputer therefore leaves the
    -1 untouched and missing entries end up as the first category - TODO
    confirm and consider a mode/most-frequent imputation instead.
    """
    df_imputed = df.copy()
    numeric_cols = df_imputed.select_dtypes(include='number').columns
    cat_cols = df_imputed.select_dtypes(include='object').columns

    # Numeric imputer
    num_imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=-1),
    max_iter=10,
    random_state=random_state
    )

    # Note: when there are no numeric columns the imputer is returned unfitted
    if len(numeric_cols) > 0:
        df_imputed[numeric_cols] = num_imputer.fit_transform(df_imputed[numeric_cols])

    # Categorical imputers
    cat_imputer_dict = {}
    for col in cat_cols:
        df_imputed[col] = df_imputed[col].astype('category')
        df_imputed[col] = df_imputed[col].cat.codes  # NaNs -> -1

        cat_imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=-1),
        max_iter=10,
        random_state=random_state
        )

        # The imputer only ever sees this single column of codes
        df_imputed[[col]] = cat_imputer.fit_transform(df_imputed[[col]])
        cat_imputer_dict[col] = cat_imputer

        # Convert back to original categories
        codes = df_imputed[col].round().astype(int)
        categories = df[col].astype('category').cat.categories
        # Clipping forces any out-of-range code (including -1) onto a valid category
        codes = codes.clip(0, len(categories)-1)
        df_imputed[col] = pd.Categorical.from_codes(codes, categories=categories)

    return df_imputed, num_imputer, cat_imputer_dict

In [67]:
def transform_predictive_imputer(df, num_imputer, cat_imputer_dict):
    """Impute ``df`` with imputers previously fitted by ``fit_predictive_imputer``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute; it is copied, not modified.
    num_imputer : IterativeImputer
        Fitted imputer applied to all numeric columns at once.
    cat_imputer_dict : dict
        ``{column: fitted imputer}``; must contain every object-dtype column
        of ``df`` (a missing key raises ``KeyError``).

    Returns
    -------
    pd.DataFrame
        Imputed copy; object columns are returned as Categorical.

    NOTE(review): category codes and the ``categories`` used to decode them
    are derived from ``df`` itself, not from the training frame, so the same
    code may presumably mean different labels in train and here when the set
    of observed values differs - TODO confirm. As in the fit function, missing
    values are encoded as -1 and then clipped to 0.
    """
    df_imputed = df.copy()
    numeric_cols = df_imputed.select_dtypes(include='number').columns
    cat_cols = df_imputed.select_dtypes(include='object').columns

    # Numeric
    if len(numeric_cols) > 0:
        df_imputed[numeric_cols] = num_imputer.transform(df_imputed[numeric_cols])

    # Categorical
    for col in cat_cols:
        df_imputed[col] = df_imputed[col].astype('category').cat.codes
        cat_imputer = cat_imputer_dict[col]
        df_imputed[[col]] = cat_imputer.transform(df_imputed[[col]])
        # Convert back to original categories
        codes = df_imputed[col].round().astype(int)
        categories = df[col].astype('category').cat.categories
        codes = codes.clip(0, len(categories)-1)
        df_imputed[col] = pd.Categorical.from_codes(codes, categories=categories)

    return df_imputed

# Features Analysis

**Heatmap**

In [68]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Assume corr_matrix is your full correlation matrix from previous step
# plt.figure(figsize=(12, 10))

# # Use mask for upper triangle (optional)
# mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# # Plot heatmap
# sns.heatmap(
#     corr_matrix.astype(float),
#     annot=True,
#     fmt=".2f",
#     cmap="coolwarm",
#     mask=mask,
#     cbar_kws={'label': 'Correlation'},
#     square=True
# )

# plt.title("Full Correlation Matrix (Numeric + Categorical)")
# plt.tight_layout()
# plt.show()


**Drug**

In [69]:
def Drug_Analysis(dataset):
    """Fill missing Drug values with "Unknown" and one-hot encode the column.

    The ``Drug`` column of the passed frame is filled in place; the returned
    frame is a new object with ``Drug`` replaced by ``Drug_<value>`` columns.
    """
    # Plain assignment instead of chained `fillna(..., inplace=True)`, which
    # pandas deprecates because it may act on a temporary copy.
    dataset['Drug'] = dataset['Drug'].fillna("Unknown")
    # print(dataset['Drug'].value_counts(dropna=False))
    # print(pd.crosstab(dataset['Drug'], dataset['Status'], normalize='index')) # Check Corr with target

    # one_hot_encoding iterates over `cols`, so pass a list: a bare string
    # would be walked character by character.
    dataset = one_hot_encoding(dataset, ["Drug"])
    return dataset

**Age**

In [70]:
def Age_Analysis(dataset):
    """Rescale Age, cap it, and pass it through a Box-Cox helper.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook (only `boxcox_train` / `boxcox_apply` are), so calling this
    function on a fresh kernel presumably raises NameError - confirm whether
    the helper was deleted or lives elsewhere.
    """
    dataset['Age'] = dataset['Age'] / 365  # presumably days -> years; confirm the unit of the raw column
    dataset['Age'] = dataset['Age'].clip(upper=84.405)  # hard-coded cap; where 84.405 comes from is not shown here
    dataset, colm, _ = boxcox_transform(dataset, "Age", False)
    # outliers_check(dataset, "Age")
    return dataset

**Sex**

In [71]:
def Sex_Analysis(dataset):
    """One-hot encode the Sex column and return the resulting new frame."""
    # print(dataset["Sex"].value_counts())

    # one_hot_encoding iterates over `cols`, so pass a list: a bare string
    # would be walked character by character.
    dataset = one_hot_encoding(dataset, ["Sex"])
    return dataset

**Ascites**

In [72]:
def Ascites_Analysis(dataset):
    """Fill missing Ascites values with "Unknown" and one-hot encode the column.

    The ``Ascites`` column of the passed frame is filled in place; the
    returned frame is a new object with ``Ascites_<value>`` columns.
    """
    # Plain assignment instead of chained `fillna(..., inplace=True)`, which
    # pandas deprecates because it may act on a temporary copy.
    dataset['Ascites'] = dataset['Ascites'].fillna("Unknown")
    # print(dataset['Ascites'].value_counts(dropna=False))
    # print(pd.crosstab(dataset['Ascites'], dataset['Status'], normalize='index')) # Check Corr with target

    # Pass a list: a bare string would be iterated character by character.
    dataset = one_hot_encoding(dataset, ["Ascites"])
    return dataset

**Hepatomegaly**

In [73]:
def Hepatomegaly_Analysis(dataset):
    """Replace the stray 'S' value with the column mode, then one-hot encode.

    The ``Hepatomegaly`` column of the passed frame is overwritten in place;
    the returned frame is a new object with ``Hepatomegaly_<value>`` columns.
    """
    # print(dataset['Hepatomegaly'].value_counts(dropna=False))
    most_common = dataset['Hepatomegaly'].mode()[0]
    dataset['Hepatomegaly'] = dataset['Hepatomegaly'].replace('S', most_common)
    # print(pd.crosstab(dataset['Hepatomegaly'], dataset['Status'], normalize='index')) # Check Corr with target

    # Pass a list: a bare string would be iterated character by character.
    dataset = one_hot_encoding(dataset, ["Hepatomegaly"])
    return dataset


**Spiders**

In [74]:
def Spiders_Analysis(dataset):
    """Fill missing Spiders values with "Unknown" and one-hot encode the column.

    The ``Spiders`` column of the passed frame is filled in place; the
    returned frame is a new object with ``Spiders_<value>`` columns.
    """
    # Plain assignment instead of chained `fillna(..., inplace=True)`, which
    # pandas deprecates because it may act on a temporary copy.
    dataset['Spiders'] = dataset['Spiders'].fillna("Unknown")
    # print(dataset['Spiders'].value_counts(dropna=False))

    # Check Corr with target
    # print(pd.crosstab(dataset['Spiders'], dataset['Status'], normalize='index'))

    # Pass a list: a bare string would be iterated character by character.
    dataset = one_hot_encoding(dataset, ["Spiders"])
    return dataset


**Edema**

In [75]:
def Edema_Analysis(dataset):
    """Encode the Edema column in place as integers: N -> 0, S -> 1, Y -> 2.

    Values outside the mapping become NaN. The same frame object is returned.
    """
    # Exploratory checks kept for reference:
    # print(dataset['Edema'].value_counts())
    # print(pd.crosstab(dataset['Edema'], dataset['Status'], normalize='index'))
    codes = dict(zip(("N", "S", "Y"), range(3)))
    encoded = dataset['Edema'].map(codes)
    dataset['Edema'] = encoded
    return dataset

**Bilirubin**

In [76]:
def Bilirubin_Analysis(dataset):
    """Pass the Bilirubin column through a Box-Cox helper and return the frame.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook, so this presumably raises NameError on a fresh kernel -
    confirm where the helper is supposed to come from.
    """
    # outliers_check(dataset, "Bilirubin")
    dataset, new_col, _ = boxcox_transform(dataset, "Bilirubin", False)
    # outliers_check(dataset, new_col)
    return dataset

**Cholesterol**

In [77]:
def Cholesterol_Analysis(dataset):
    """Add a Cholesterol missing-value flag, then call a Box-Cox helper.

    ``Cholesterol_missing`` (0/1) is written to the passed frame in place.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook, so this presumably raises NameError on a fresh kernel -
    confirm where the helper is supposed to come from.
    """
    dataset['Cholesterol_missing'] = dataset['Cholesterol'].isna().astype(int) # Missing flag
    # outliers_check(dataset, "Cholesterol")

    # # Impute with KNN
    # if fit:  # Training set
    #     imputer = KNNImputer(n_neighbors=5)
    #     dataset[['Cholesterol']] = imputer.fit_transform(dataset[['Cholesterol']])
    # else:    # Test set
    #     dataset[['Cholesterol']] = imputer.transform(dataset[['Cholesterol']])

    dataset, new_col, _ = boxcox_transform(dataset, "Cholesterol", False)
    # outliers_check(dataset, new_col)
    return dataset

**Albumin**

In [78]:
def Albumin_Analysis(dataset):
    """Pass the Albumin column through a Box-Cox helper and return the frame.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook, so this presumably raises NameError on a fresh kernel -
    confirm where the helper is supposed to come from.
    """
    # outliers_check(dataset, "Albumin")
    dataset, new_col, _ = boxcox_transform(dataset, "Albumin", False)
    # outliers_check(dataset, new_col)
    return dataset

**Copper**

In [79]:
def Copper_Analysis(dataset, target="Status"):
    """Add a Copper missing-value flag, then call a Box-Cox helper.

    ``Copper_missing`` (0/1) is written to the passed frame in place.
    ``target`` is only referenced by the commented-out association test.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook, so this presumably raises NameError on a fresh kernel -
    confirm where the helper is supposed to come from.
    """
    dataset['Copper_missing'] = dataset['Copper'].isna().astype(int)  # Missing flag

    # Impute Copper with KNN
    # if fit:  # training set
    #     imputer = KNNImputer(n_neighbors=5)
    #     dataset[['Copper']] = imputer.fit_transform(dataset[['Copper']])
    # else:    # test set
    #     dataset[['Copper']] = imputer.transform(dataset[['Copper']])

    # Check relation with target
    # groups = [dataset.loc[dataset[target] == t, 'Copper'].dropna() for t in dataset[target].unique()]
    # stat, p = stats.kruskal(*groups)
    # print(f"\nKruskal-Wallis test for Copper vs {target}: stat={stat:.3f}, p-value={p:.4f}")

    # Effect size (η²)
    # n_total = sum(len(g) for g in groups)
    # eta_sq = stat / (n_total - 1)
    # print(f"Effect size η²: {eta_sq:.4f}")

    # Outlier checks
    # outliers_check(dataset, "Copper")
    dataset, new_col, _ = boxcox_transform(dataset, "Copper", False)
    # outliers_check(dataset, new_col)

    return dataset

**Alk_Phos**

In [80]:
def AlkPhos_Analysis(dataset, target="Status"):
    """Add an Alk_Phos missing-value flag, then call a Box-Cox helper.

    ``AlkPhos_missing`` (0/1) is written to the passed frame in place.
    Unlike the sibling ``*_Analysis`` functions, this one returns a tuple
    ``(dataset, col)`` rather than the frame alone.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook, so this presumably raises NameError on a fresh kernel -
    confirm where the helper is supposed to come from.
    """
    dataset['AlkPhos_missing'] = dataset['Alk_Phos'].isna().astype(int) # Missing flag

    # KNN imputation
    # if fit:  # training set
    #     imputer = KNNImputer(n_neighbors=5)
    #     dataset[['Alk_Phos']] = imputer.fit_transform(dataset[['Alk_Phos']])
    # else:  # test set
    #     dataset[['Alk_Phos']] = imputer.transform(dataset[['Alk_Phos']])

    # outliers_check(dataset, "Alk_Phos")
    dataset, col, _ = boxcox_transform(dataset, "Alk_Phos", False)
    # outliers_check(dataset, col)

    # Relation with target
    # groups = [dataset.loc[dataset[target]==t, col] for t in dataset[target].unique()]
    # stat, p = stats.kruskal(*groups)
    # print(f"Kruskal-Wallis test for {col} vs {target}: stat={stat:.3f}, p={p:.4f}")

    return dataset, col

In [81]:
def Alk_Phos_Corr(dataset):
    """Explore how Alk_Phos relates to Stage and Status.

    Prints the missing count and per-group ``describe()`` tables, draws
    boxplots by Stage and by Status, and prints Kruskal-Wallis tests for
    both groupings plus a Spearman correlation with Stage.

    Side effects: ``Stage`` is coerced to numeric and ``Status`` is cast to
    category on the passed frame, in place. Returns nothing.
    """
    # Ensure types
    dataset['Stage'] = pd.to_numeric(dataset['Stage'], errors='coerce')
    dataset['Status'] = dataset['Status'].astype('category')

    feat = 'Alk_Phos'

    print('Missing count:', dataset[feat].isna().sum())
    print('By Stage describe:\n', dataset.groupby('Stage')[feat].describe())
    print('By Status describe:\n', dataset.groupby('Status')[feat].describe())

    # Boxplots across Stage
    # NOTE(review): DataFrame.boxplot(by=...) is called without `ax`, so it
    # presumably opens its own figure and the plt.figure() above it stays
    # empty - confirm and pass an explicit Axes if so.
    plt.figure(figsize=(8, 5))
    dataset.boxplot(column=feat, by='Stage')
    plt.title(f'{feat} by Stage')
    plt.suptitle('')
    plt.xlabel('Stage')
    plt.ylabel(feat)
    plt.show()

    # Boxplots across Status
    plt.figure(figsize=(6, 5))
    dataset.boxplot(column=feat, by='Status')
    plt.title(f'{feat} by Status')
    plt.suptitle('')
    plt.xlabel('Status')
    plt.ylabel(feat)
    plt.show()

    # Nonparametric tests (multi-group)
    # Kruskal-Wallis: Stage (treat as groups)
    stage_groups = [g[feat].dropna().values for _, g in dataset.groupby('Stage')]
    kw_stat, kw_p = stats.kruskal(*stage_groups)
    print(f'Kruskal–Wallis {feat} ~ Stage: stat={kw_stat:.3f}, p={kw_p:.4g}')

    # If Stage is ordinal, also check monotonic trend via Spearman
    spr_r, spr_p = stats.spearmanr(dataset['Stage'], dataset[feat], nan_policy='omit')
    print(f'Spearman(Stage, {feat}): r={spr_r:.3f}, p={spr_p:.4g}')

    # Kruskal–Wallis for Status (categorical)
    status_groups = [g[feat].dropna().values for _, g in dataset.groupby('Status')]
    kw_stat_s, kw_p_s = stats.kruskal(*status_groups)
    print(f'Kruskal–Wallis {feat} ~ Status: stat={kw_stat_s:.3f}, p={kw_p_s:.4g}')

In [82]:
def Alk_Phos_Boxcox_log(dataset):
    """Compare Box-Cox and log1p transforms of Alk_Phos.

    Adds ``Alk_Phos_boxcox`` and ``Alk_Phos_log1p`` columns to the passed
    frame in place, prints the fitted lambda and the skewness of the raw and
    transformed values, and draws a boxplot by Stage for each new column.
    Returns nothing.
    """
    # Make safe positive array for Box-Cox
    # NOTE(review): only NaNs are dropped here; no shift is applied, so this
    # presumably relies on Alk_Phos already being strictly positive - confirm.
    x = dataset['Alk_Phos'].dropna().values

    # Box-Cox
    x_bc, lam = stats.boxcox(x)
    # Write results back only to the rows that had a value
    dataset.loc[dataset['Alk_Phos'].notna(), 'Alk_Phos_boxcox'] = x_bc
    print(f'Box-Cox lambda for Alk_Phos: {lam:.4f}')

    # Log1p
    dataset['Alk_Phos_log1p'] = np.log1p(dataset['Alk_Phos'])

    # Compare skewness
    sk_raw = stats.skew(dataset['Alk_Phos'].dropna())
    sk_bc = stats.skew(dataset['Alk_Phos_boxcox'].dropna())
    sk_log = stats.skew(dataset['Alk_Phos_log1p'].dropna())
    print(f'Skew raw={sk_raw:.3f}, boxcox={sk_bc:.3f}, log1p={sk_log:.3f}')

    # box plots for transformed features
    for col in ['Alk_Phos_boxcox', 'Alk_Phos_log1p']:
        plt.figure(figsize=(8, 5))
        dataset.boxplot(column=col, by='Stage')
        plt.title(f'{col} by Stage')
        plt.suptitle('')
        plt.xlabel('Stage')
        plt.ylabel(col)
        plt.show()

In [83]:
def Alk_Phos_outliers(dataset, col):
    """Report how many extreme values of ``col`` fall into late stages.

    "Extreme" means at or above the column's 99th percentile; "late" means
    Stage at or above its own 75th percentile. Prints the contingency table
    of the two flags and the share of extreme rows that are late-stage.
    Returns nothing.
    """
    values = dataset[col]
    stage = dataset['Stage']

    # Row-wise flags built from the two quantile thresholds
    is_extreme = values >= values.quantile(0.99)
    is_late = stage >= stage.quantile(0.75)

    contingency = pd.crosstab(is_extreme, is_late, dropna=False)
    print('Extreme vs Late-stage contingency:\n', contingency)

    # Guard against division by zero when nothing is flagged as extreme
    n_extreme = is_extreme.sum()
    if n_extreme:
        late_share = (is_extreme & is_late).sum() / n_extreme
    else:
        late_share = 0
    print(f'Share of extreme values in late stage: {late_share:.1%}')


**SGOT**

In [84]:
def SGOT_Analysis(dataset, target="Status"):
    """Flag and impute missing SGOT values, then call a log-transform helper.

    ``SGOT_missing`` (0/1) is added and ``SGOT`` is overwritten on the passed
    frame: missing values are filled with the median of their Stage group.
    ``target`` is only referenced by commented-out code.

    NOTE(review): the call below passes three arguments and unpacks two
    results, but `log_transform` as defined in this notebook takes
    (train_df, val_df, test_df, cols) and returns three frames - this
    presumably raises TypeError; confirm which helper was intended.
    """
    dataset['SGOT_missing'] = dataset['SGOT'].isna().astype(int) # Missing flag

    # Median imputation grouped by Stage
    dataset['SGOT'] = dataset.groupby('Stage')['SGOT'].transform(
        lambda x: x.fillna(x.median())
    )

    # Outliers before transform
    # outliers_check(dataset, "SGOT")

    # Transformations
    dataset, log_col = log_transform(dataset, "SGOT", False)
    # dataset, boxcox_col, _ = boxcox_transform(dataset, "SGOT", False)

    # Association tests for both versions
    # for col in [log_col, boxcox_col]:
    #     groups = [dataset.loc[dataset[target]==t, col] for t in dataset[target].unique()]
    #     stat, p = stats.kruskal(*groups)
    #     print(f"Kruskal-Wallis for {col} vs {target}: stat={stat:.3f}, p={p:.4e}")
    #     outliers_check(dataset, col)

    return dataset

**Tryglicerides**

In [85]:
def Tryglicerides_Analysis(dataset, target="Status"):
    """Flag and impute missing Tryglicerides values, then call a log helper.

    ``Tryglicerides_missing`` (0/1) is added and ``Tryglicerides`` is
    overwritten on the passed frame: missing values are filled with the
    median of their Stage group. ``target`` is only referenced by
    commented-out code.

    NOTE(review): the call below passes three arguments and unpacks two
    results, but `log_transform` as defined in this notebook takes
    (train_df, val_df, test_df, cols) and returns three frames - this
    presumably raises TypeError; confirm which helper was intended.
    """
    dataset['Tryglicerides_missing'] = dataset['Tryglicerides'].isna().astype(int) # Missing flag

    # Median imputation grouped by Stage
    dataset['Tryglicerides'] = dataset.groupby('Stage')['Tryglicerides'].transform(
        lambda x: x.fillna(x.median())
    )
    # outliers_check(dataset, "Tryglicerides")

    # Transformation
    dataset, log_col = log_transform(dataset, "Tryglicerides", False)

    # Association with target
    # groups = [dataset.loc[dataset[target] == t, log_col] for t in dataset[target].unique()]
    # stat, p = stats.kruskal(*groups)
    # print(f"Kruskal-Wallis for {log_col} vs {target}: stat={stat:.3f}, p={p:.4e}")
    # outliers_check(dataset, log_col)

    return dataset

**Platelets**

In [86]:
def Platelets_Analysis(dataset, target="Status"):
    """Flag and impute missing Platelets values, then call a Box-Cox helper.

    ``Platelets_missing`` (0/1) is added and ``Platelets`` is overwritten on
    the passed frame: missing values are filled with the median of their
    Stage group. ``target`` is only referenced by commented-out code.

    NOTE(review): `boxcox_transform` is not defined in any visible cell of
    this notebook, so this presumably raises NameError on a fresh kernel -
    confirm where the helper is supposed to come from.
    """
    # Missing flag
    dataset['Platelets_missing'] = dataset['Platelets'].isna().astype(int)

    # Impute by Stage median
    dataset['Platelets'] = dataset.groupby('Stage')['Platelets'].transform(
        lambda x: x.fillna(x.median())
    )

    # Outlier check
    # outliers_check(dataset, "Platelets")

    # Transform
    # dataset, log_col = log_transform(dataset, "Platelets")
    dataset, boxcox_col, _ = boxcox_transform(dataset, "Platelets", False)

    # Association test
    # for col in ["Platelets", log_col, boxcox_col]:
    #     groups = [dataset.loc[dataset[target]==t, col] for t in dataset[target].unique()]
    #     stat, p = stats.kruskal(*groups)
    #     print(f"Kruskal-Wallis for {col} vs {target}: stat={stat:.3f}, p={p:.4e}")

    # outliers_check(dataset, log_col)
    # outliers_check(dataset, boxcox_col)

    return dataset

**Prothrombin**

In [87]:
def Prothrombin_Analysis(dataset, target="Status"):
    """Flag and impute missing Prothrombin values, then call a log helper.

    ``Prothrombin_missing`` (0/1) is added and ``Prothrombin`` is overwritten
    on the passed frame: missing values are filled with the median of their
    Stage group. ``target`` is only referenced by commented-out code.

    NOTE(review): the call below passes three arguments and unpacks two
    results, but `log_transform` as defined in this notebook takes
    (train_df, val_df, test_df, cols) and returns three frames - this
    presumably raises TypeError; confirm which helper was intended.
    """
    dataset['Prothrombin_missing'] = dataset['Prothrombin'].isna().astype(int) # Missing Flag

    # Impute with Stage median
    dataset['Prothrombin'] = dataset.groupby('Stage')['Prothrombin'].transform(
        lambda x: x.fillna(x.median())
    )
    # outliers_check(dataset, 'Prothrombin')
    dataset, col = log_transform(dataset, "Prothrombin", False)
    # outliers_check(dataset, col)

    # Association with target
    # groups = [dataset.loc[dataset[target]==t, col] for t in dataset[target].unique()]
    # stat, p = stats.kruskal(*groups)
    # print(f"Kruskal-Wallis for {col} vs {target}: stat={stat:.3f}, p={p:.4e}")
    return dataset

**Stage**

In [88]:
def Stage_Analysis(dataset):
    """Return ``dataset`` unchanged.

    All exploratory checks for Stage are commented out, so this is currently
    a pass-through.
    """
    # outliers_check(dataset, 'Stage')
    # print(dataset['Stage'].value_counts())

    # Correlation with Target
    # corr, p = spearmanr(dataset['Stage'], dataset['Status'])
    # print("Spearman correlation:", corr, "p-value:", p)

    return dataset

**Status (Target)**

In [89]:
def Status_Analysis(dataset):
    """Label-encode the Status target column.

    NOTE(review): `label_encoding` as defined in this notebook takes
    (train_df, val_df, target_col) and returns two frames, so this
    two-argument call presumably raises TypeError - confirm the intended
    usage before re-enabling this function.
    """
    dataset = label_encoding(dataset, 'Status')
    # print(dataset['Status'].value_counts())
    return dataset

# Features and splitting

**Initialize Seed**

In [90]:
SEED = 42 # For reproducibility
# Seed NumPy's legacy global RNG and PyTorch's default generator
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb2d6480d10>

**Split train/validation/test**

In [91]:
# Train dataset
# Features exclude the target and the row identifier.
# NOTE(review): a later cell rebinds `train_dataset` to the imputed training
# split, so re-running this cell after that point no longer starts from the
# raw CSV - reload the data first.
X = train_dataset.drop(columns=['Status', 'id'])
y = train_dataset["Status"]

# Test dataset
x_test_data = test_df.drop(columns=['id'])

In [92]:
# 70/30 split, stratified on the target so both parts keep the class ratios
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

# Datasets Preprocessing

**Imputing**

In [93]:
# Fit imputers on train
# Only the training split is used for fitting; validation and test reuse the
# fitted imputers.
x_train_imputed, num_imputer, cat_imputer_dict = fit_predictive_imputer(x_train)

# Transform validation
x_val_imputed = transform_predictive_imputer(x_val, num_imputer, cat_imputer_dict)

# Transform test dataset
x_test_imputed = transform_predictive_imputer(x_test_data, num_imputer, cat_imputer_dict)



**Dataframe conversion**

In [94]:
# Columns of features
feature_cols = x_train.columns

# Re-wrap the imputed frames (the imputer helpers already return DataFrames)
# with the original column order and index
x_train_df = pd.DataFrame(x_train_imputed, columns=feature_cols, index=x_train.index)
x_val_df   = pd.DataFrame(x_val_imputed, columns=feature_cols, index=x_val.index)

# Combine with target
# Note: this rebinds `train_dataset`, replacing the raw frame loaded from CSV
train_dataset = pd.concat([x_train_df, y_train], axis=1)
val_dataset   = pd.concat([x_val_df, y_val], axis=1)

# For test dataset
test_dataset = pd.DataFrame(x_test_imputed, columns=feature_cols, index=x_test_data.index)

**Apply analysis on features**

In [95]:
# def analysis(dataset, train):
#     dataset = Drug_Analysis(dataset)
#     dataset = Age_Analysis(dataset)
#     dataset = Sex_Analysis(dataset)
#     dataset = Ascites_Analysis(dataset)
#     dataset = Hepatomegaly_Analysis(dataset)
#     dataset = Spiders_Analysis(dataset)
#     dataset = Edema_Analysis(dataset)
#     dataset = Bilirubin_Analysis(dataset)
#     dataset = Albumin_Analysis(dataset)
#     dataset = Cholesterol_Analysis(dataset)
#     dataset = Copper_Analysis(dataset)
#     dataset = AlkPhos_Analysis(dataset)  
#     dataset = AlkPhos_Analysis(dataset)
#     dataset = Copper_Analysis(dataset)
#     dataset = Cholesterol_Analysis(dataset)
#     dataset = SGOT_Analysis(dataset)
#     dataset = Tryglicerides_Analysis(dataset)
#     dataset = Platelets_Analysis(dataset)
#     dataset = Prothrombin_Analysis(dataset)  t 
#     if train:
#         dataset = Status_Analysis(dataset)

#     return dataset

**Call all analysis**

In [96]:
# # Train preprocessing
# train_dataset = analysis(train_df, True)
# validation_dataset = analysis(val_df, False)

# # Test preprocessing
# test_dataset = analysis(test_df, False)

**Boxcox transformation**

In [97]:
# cols = ["Platelets", "Alk_Phos", "Copper", "Albumin", "Cholesterol", "Bilirubin", "Bilirubin", "Age"]
# # Training set
# train_dataset, lambdas, shifts = boxcox_train(train_dataset, cols)

# # Validation set
# val_dataset = boxcox_apply(val_dataset, cols, lambdas, shifts)

# # Test set
# test_dataset = boxcox_apply(test_dataset, cols, lambdas, shifts)

**Log transformation**

In [98]:
# log_cols = ['SGOT', 'Prothrombin', 'Tryglicerides']
# train_dataset, val_dataset, test_dataset = log_transform(
#     train_dataset, val_dataset, test_dataset, log_cols)

**One hot encoding**

In [99]:
# Replace the stray 'S' value in Hepatomegaly with the training mode.
# NOTE(review): this is applied to the training frame only; an 'S' in
# validation/test would presumably become an all-zero Hepatomegaly row after
# the reindex below - confirm whether that is intended.
train_dataset['Hepatomegaly'] = train_dataset['Hepatomegaly'].replace(
    'S', train_dataset['Hepatomegaly'].mode()[0])

cat_cols = ['Spiders', 'Hepatomegaly', 'Ascites', 'Sex', 'Drug', 'Edema']

train_dataset = one_hot_encoding(train_dataset, cat_cols)
val_dataset   = one_hot_encoding(val_dataset, cat_cols)
test_dataset  = one_hot_encoding(test_dataset, cat_cols)

# Make sure val/test match train columns
# Columns absent from val/test are added as zeros and extras are dropped.
# `train_dataset.columns` still contains 'Status' here, so `test_dataset`
# also receives a zero-filled 'Status' column.
val_dataset  = val_dataset.reindex(columns=train_dataset.columns, fill_value=0)
test_dataset = test_dataset.reindex(columns=train_dataset.columns, fill_value=0)

  train_dataset['Hepatomegaly'] = train_dataset['Hepatomegaly'].replace(


**Label Encoding**

In [100]:
# Target encoding
# The encoder is fitted on the training labels and reused for validation
train_dataset, val_dataset = label_encoding(train_dataset, val_dataset, "Status")

**Sampling**

In [101]:
# # Split
# majority = train_dataset[train_dataset.Status == 0]
# medium   = train_dataset[train_dataset.Status == 2]
# minority = train_dataset[train_dataset.Status == 1]

# # Upsample minority and medium to match majority
# medium_upsampled   = resample(medium,   replace=True, n_samples=len(majority), random_state=SEED)
# minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=SEED)

# # Combine
# train_balanced = pd.concat([majority, medium_upsampled, minority_upsampled])

# x_train_balanced = train_balanced.drop(columns="Status")
# y_train_balanced = train_balanced["Status"]


In [102]:
# Separate the encoded training frame into feature matrix and target vector
x_train_dataset = train_dataset.drop(columns="Status")
y_train_dataset = train_dataset["Status"]

**Feature scaling**

In [103]:
# Split val_df
# Note: this rebinds `x_val` / `y_val`, replacing the raw split produced by
# train_test_split with the encoded validation features and labels.
x_val = val_dataset.drop(columns="Status")
y_val = val_dataset["Status"]

scaler = StandardScaler()
# Fit on train features
x_train_scaled = scaler.fit_transform(x_train_dataset)

# Ensure val and test have the same feature columns as train
# (selecting by the training columns also drops the zero-filled 'Status'
# column from the test frame)
x_val_scaled   = scaler.transform(x_val[x_train_dataset.columns])
x_test_scaled  = scaler.transform(test_dataset.reindex(columns=x_train_dataset.columns, fill_value=0))


# Train the model and predict

**Dataset wrapper**

In [104]:
class CirrhosisDataset(Dataset):
    """Torch dataset over a feature matrix and an integer label vector.

    Inputs exposing a ``.values`` attribute (pandas objects) are unwrapped
    first. Features are stored as float32 tensors and labels as int64.
    """

    def __init__(self, X, y):
        # Unwrap pandas containers; anything else is passed through as-is
        features = X.values if hasattr(X, "values") else X
        labels = y.values if hasattr(y, "values") else y
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the label vector."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

**Neural network architecture**

In [111]:
class NeuralNetwork(nn.Module):
    """Two-hidden-layer MLP classifier: input -> 64 -> 32 -> ``num_classes``.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); the
    output layer returns raw logits (no softmax).
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Layer order is fixed: it determines the state_dict keys
        # ("network.0.weight", ...) used by saved checkpoints.
        layers = [
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, num_classes),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run a batch of feature rows through the stack and return logits."""
        logits = self.network(x)
        return logits

**Model training**

In [112]:
def train_evaluate_nn(x_train_scaled, y_train, 
                      x_val_scaled, y_val, 
                      x_test_scaled,
                      num_classes=3, num_epochs=300, batch_size=64,
                      lr=0.001, weight_decay=0.00001, patience=50):
    """Train the MLP with class-weighted loss, OneCycleLR and early stopping.

    Parameters
    ----------
    x_train_scaled, x_val_scaled : array-like
        Scaled feature matrices for training and validation.
    y_train, y_val : array-like
        Integer class labels for the two splits.
    x_test_scaled : array-like
        Accepted for interface compatibility; not used during training.
    num_classes : int
        Size of the output layer.
    num_epochs : int
        Maximum number of epochs (also sizes the OneCycleLR schedule).
    batch_size : int
        Mini-batch size for both loaders.
    lr : float
        Base learning rate; the schedule peaks at ``10 * lr``.
    weight_decay : float
        Adam weight decay.
    patience : int
        Epochs without validation-loss improvement before stopping.

    Returns
    -------
    (NeuralNetwork, torch.device)
        The model with the best-validation-loss weights loaded, and the
        device it lives on. The best weights are also written to
        ``best_model.pth`` in the working directory.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Datasets & Loaders
    train_data = CirrhosisDataset(x_train_scaled, y_train)
    val_data = CirrhosisDataset(x_val_scaled, y_val)

    # BatchNorm1d cannot compute batch statistics from a single sample in
    # training mode, so drop the trailing batch when it would have size 1.
    drop_last = len(train_data) > batch_size and len(train_data) % batch_size == 1
    train_loader = DataLoader(train_data, batch_size=batch_size,
                              shuffle=True, drop_last=drop_last)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    # Model
    model = NeuralNetwork(x_train_scaled.shape[1], num_classes).to(device)

    # Class weights (handles imbalance)
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # OneCycleLR Scheduler (10x base lr)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=lr * 10,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
        anneal_strategy="cos"
    )

    checkpoint_path = "best_model.pth"
    checkpoint_saved = False  # True once THIS run has written a checkpoint
    best_loss = float('inf')
    counter = 0

    for epoch in range(num_epochs):
        # --- Training ---
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            scheduler.step()  # step per batch (important for OneCycleLR)
            train_loss += loss.item()
        avg_train = train_loss / len(train_loader)

        # --- Validation ---
        model.eval()
        val_loss, correct, total = 0, 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                val_loss += criterion(outputs, y_batch).item()
                preds = outputs.argmax(dim=1)
                correct += (preds == y_batch).sum().item()
                total += y_batch.size(0)

        avg_val = val_loss / len(val_loader)
        val_acc = correct / total if total > 0 else 0.0

        # Logging
        if (epoch + 1) % 10 == 0 or epoch == 0:
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Epoch {epoch+1:03d}, LR: {current_lr:.6f}, "
                  f"Train Loss: {avg_train:.4f}, Val Loss: {avg_val:.4f}, Val Acc: {val_acc:.4f}")

        # Early stopping
        if avg_val < best_loss:
            best_loss = avg_val
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered")
                break

    # Reload best model. If the validation loss never improved (e.g. it was
    # NaN from the first epoch) no checkpoint was written by this run, and a
    # file left over from an earlier run must not be loaded silently.
    if checkpoint_saved:
        model.load_state_dict(
            torch.load(checkpoint_path, map_location=device, weights_only=True)
        )
    else:
        print("Warning: validation loss never improved; returning last-epoch weights")
    return model, device


**Training and predictions**

In [113]:
# Train with default hyper-parameters; the best checkpoint is also written to best_model.pth
model, device = train_evaluate_nn(x_train_scaled, y_train_dataset, x_val_scaled, y_val, x_test_scaled)

Using device: cuda
Epoch 001, LR: 0.000403, Train Loss: 1.0137, Val Loss: 0.9100, Val Acc: 0.7658
Epoch 010, LR: 0.000690, Train Loss: 0.7716, Val Loss: 0.7475, Val Acc: 0.6971
Epoch 020, LR: 0.001523, Train Loss: 0.7526, Val Loss: 0.7523, Val Acc: 0.7398
Epoch 030, LR: 0.002800, Train Loss: 0.7159, Val Loss: 0.7636, Val Acc: 0.7402
Epoch 040, LR: 0.004367, Train Loss: 0.7108, Val Loss: 0.7529, Val Acc: 0.7253
Epoch 050, LR: 0.006034, Train Loss: 0.7152, Val Loss: 0.7542, Val Acc: 0.7380
Epoch 060, LR: 0.007601, Train Loss: 0.7038, Val Loss: 0.7505, Val Acc: 0.7116
Epoch 070, LR: 0.008878, Train Loss: 0.6992, Val Loss: 0.7891, Val Acc: 0.6976
Epoch 080, LR: 0.009711, Train Loss: 0.6897, Val Loss: 0.7959, Val Acc: 0.7700
Early stopping triggered


**Test set prediction**

In [114]:
def test_set_prediction(model, x_test, device):
    model.load_state_dict(torch.load("best_model.pth", weights_only=True))
    model.eval()
    with torch.no_grad():
        test_tensor = torch.tensor(x_test, dtype=torch.float32).to(device)
        y_prob = torch.softmax(model(test_tensor), dim=1).cpu().numpy()
        # Clip probabilities to avoid extreme log-loss
        y_prob = np.clip(y_prob, 1e-3, 1 - 1e-3)

    return y_prob

In [115]:
# Class probabilities for the scaled test features, one row per test sample
y_prob_submission = test_set_prediction(model, x_test_scaled, device)

# Submission

In [116]:
# Build the submission table: one probability column per class.
# NOTE(review): mapping columns 0/1/2 to C/CL/D assumes the label encoder
# assigned codes in that order - confirm against the encoder's classes.
submission = pd.DataFrame({
    'id': test_df['id'],
    'Status_C': y_prob_submission[:, 0],
    'Status_CL': y_prob_submission[:, 1],
    'Status_D': y_prob_submission[:, 2]
})

submission.to_csv("/kaggle/working/submission.csv", index=False)
print("Submission file saved successfully")

Submission file saved successfully
