In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on the machine that has this exact folder layout.
data_path = r'C:\Users\ASUS\Downloads\Projects-20240722T093004Z-001\Projects\liver_cirrhosis_stage\liver_cirrhosis_stage\liver_cirrhosis.csv'  # Update this with the actual file path
df = pd.read_csv(data_path)

# Convert Age from Days to Years (plain 365-day years, leap days ignored)
df['Age'] = df['Age'] / 365

# Map categorical variables to numeric.
# Series.map with a dict turns any value that is not a key into NaN, so the
# null count printed below doubles as a check that every code was recognised.
df['Sex'] = df['Sex'].map({'M': 1, 'F': 0})  # Male = 1, Female = 0
df['Status'] = df['Status'].map({'C': 0, 'CL': 1, 'D': 2})  # Censored=0, Transplant=1, Death=2
df['Edema'] = df['Edema'].map({'N': 0, 'S': 1, 'Y': 2})

print(df.isnull().sum())  # Count missing values per column


# Summary of Cleaned Data
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.describe())


  from pandas.core import (


N_Days           0
Status           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         25000 non-null  int64  
 1   Status         25000 non-null  int64  
 2   Drug           25000 non-null  object 
 3   Age            25000 non-null  float64
 4   Sex            25000 non-null  int64  
 5   Ascites        25000 non-null  object 
 6   Hepatomegaly   25000 non-null  object 
 7   Spiders        25000 non-null  object 
 8   Edema          25000 non-null  int64  
 9   Bilirubin      25000 non-nul

In [2]:
# Build a numeric-only view of the data and preview its first five rows.
# At this point Drug, Ascites, Hepatomegaly and Spiders are still object dtype
# (see the df.info() output of the first cell), so they are not part of numeric_df.
numeric_df = df.select_dtypes(include='number')
print(numeric_df.head(5))


   N_Days  Status        Age  Sex  Edema  Bilirubin  Cholesterol  Albumin  \
0    2221       0  50.682192    0      0        0.5        149.0     4.04   
1    1230       0  54.038356    1      0        0.5        219.0     3.93   
2    4184       0  32.435616    0      0        0.5        320.0     3.54   
3    2090       2  45.115068    0      0        0.7        255.0     3.74   
4    2105       2  59.449315    0      0        1.9        486.0     3.54   

   Copper  Alk_Phos    SGOT  Tryglicerides  Platelets  Prothrombin  Stage  
0   227.0     598.0   52.70           57.0      256.0          9.9      1  
1    22.0     663.0   45.00           75.0      220.0         10.8      2  
2    51.0    1243.0  122.45           80.0      225.0         10.0      2  
3    23.0    1024.0   77.50           58.0      151.0         10.2      2  
4    74.0    1052.0  108.50          109.0      151.0         11.5      1  


In [3]:
# Replace each remaining text column with its integer category code.
# pandas numbers the categories in sorted order of the distinct values and
# encodes missing entries as -1; which label gets which code is therefore
# data-dependent -- TODO confirm the mapping before interpreting coefficients.
for _label_col in ('Drug', 'Ascites', 'Hepatomegaly', 'Spiders'):
    df[_label_col] = df[_label_col].astype('category').cat.codes
