In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the cleaned cirrhosis table; the path is resolved relative to the
# notebook's working directory.
file_path = 'cirrhosis_drop.csv'
cirrhosis_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output.
cirrhosis_data.head(n=5)

Unnamed: 0,ID,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status,Drug,Sex,Ascites,Hepatomegaly,Spiders,Edema
0,1,400,21464,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,D,D-penicillamine,F,Y,Y,Y,Y
1,2,4500,20617,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,C,D-penicillamine,F,N,Y,Y,N
2,3,1012,25594,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,D,D-penicillamine,M,N,N,N,S
3,4,1925,19994,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,D,D-penicillamine,F,N,Y,Y,S
4,5,1504,13918,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,CL,Placebo,F,N,Y,Y,N


In [12]:
# Partition the columns by dtype: float64/int64 columns are treated as
# numerical, object/bool columns as categorical.
numerical_cols = list(
    cirrhosis_data.select_dtypes(include=['float64', 'int64']).columns
)
categorical_cols = list(
    cirrhosis_data.select_dtypes(include=['object', 'bool']).columns
)

# Take 'ID' out of the numerical list (list.remove raises ValueError if the
# column is absent).
numerical_cols.remove('ID')

# Show both lists as the cell output.
(numerical_cols, categorical_cols)

(['N_Days',
  'Age',
  'Bilirubin',
  'Cholesterol',
  'Albumin',
  'Copper',
  'Alk_Phos',
  'SGOT',
  'Tryglicerides',
  'Platelets',
  'Prothrombin',
  'Stage'],
 ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema'])

In [13]:
# Work on a copy so the loaded frame `cirrhosis_data` keeps its raw values.
cirrhosis_data_scaled = cirrhosis_data.copy()

# Fit the scaler on the numerical columns, then overwrite those columns with
# their standardized values.
scaler = StandardScaler().fit(cirrhosis_data_scaled[numerical_cols])
cirrhosis_data_scaled[numerical_cols] = scaler.transform(
    cirrhosis_data_scaled[numerical_cols]
)

In [14]:
# Label -> integer-code lookup for every categorical column, keyed by column
# name, so the encoded values can be translated back later.
conversion_tables = {}

# Encode each categorical column as integer codes.
#
# The categories are read from the untouched `cirrhosis_data` frame rather
# than from `cirrhosis_data_scaled`: the latter is overwritten below, so
# reading from it would make a second run of this cell see integers and
# replace every table with an identity mapping ({0: 0, 1: 1, ...}),
# silently losing the original labels.
for column in categorical_cols:
    source_values = cirrhosis_data[column]

    # Codes follow order of first appearance in the column.
    unique_values = source_values.unique()
    mapping_dict = {value: idx for idx, value in enumerate(unique_values)}

    # Write the encoded column into the processed frame (same index as the
    # source frame, so the assignment aligns row by row).
    cirrhosis_data_scaled[column] = source_values.map(mapping_dict)

    # Keep the table for this column.
    conversion_tables[column] = mapping_dict

In [15]:
# Show the first five rows of the processed frame together with the
# label -> code tables.
(cirrhosis_data_scaled.head(n=5), conversion_tables)

(   ID    N_Days       Age  Bilirubin  Cholesterol   Albumin    Copper  \
 0   1 -1.432360  0.827904   2.485914    -0.491215 -2.194559  0.684867   
 1   2  2.223526  0.608395  -0.476689    -0.305613  1.478942 -0.512297   
 2   3 -0.886652  1.898239  -0.410362    -0.876001 -0.095416  1.318660   
 3   4 -0.072549  0.446938  -0.321927    -0.568172 -2.337683 -0.394928   
 4   5 -0.447946 -1.127724   0.031817    -0.409731  0.023854  0.532288   
 
    Alk_Phos      SGOT  Tryglicerides  Platelets  Prothrombin     Stage  \
 0 -0.123847  0.271931       0.764998  -0.757816     1.470371  1.104369   
 1  2.532642 -0.159629      -0.593622  -0.430953    -0.125301 -0.036569   
 2 -0.686329 -0.467356      -1.127366  -1.169029     1.270912  1.104369   
 3  1.936935 -1.093939      -0.528926  -0.831623    -0.424490  1.104369   
 4 -0.613796 -0.166165      -0.852407  -1.327189     0.173887 -0.036569   
 
    Status  Drug  Sex  Ascites  Hepatomegaly  Spiders  Edema  
 0       0     0    0        0         

In [16]:
# Persist the processed frame; index=False keeps the row index out of the file.
cirrhosis_data_scaled.to_csv(path_or_buf='cirrhosis_clean.csv', index=False)