In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("cirrhosis.csv")
data.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [3]:
columns=data.columns
pd.DataFrame({'no of missing values':data.isnull().sum()})

Unnamed: 0,no of missing values
ID,0
N_Days,0
Status,0
Drug,106
Age,0
Sex,0
Ascites,106
Hepatomegaly,106
Spiders,106
Edema,0


In [4]:
data.select_dtypes(include=(['int64', 'float64'])).isna().sum()
data_num_col = data.select_dtypes(include=(['int64', 'float64'])).columns
for c in data_num_col:
    data[c].fillna(data[c].median(), inplace=True)

data.select_dtypes(include=(['int64', 'float64'])).isna().sum()

ID               0
N_Days           0
Age              0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64

In [5]:
data_cat_col = data.select_dtypes(include=('object')).columns
for c in data_cat_col:
    data[c].fillna(data[c].mode().values[0], inplace=True)
    
data.select_dtypes(include=('object')).isna().sum()

Status          0
Drug            0
Sex             0
Ascites         0
Hepatomegaly    0
Spiders         0
Edema           0
dtype: int64

In [6]:
# Converting Target categories into intigers 1 for Cirrhosis, 0 otherwise
data['Stage'] = np.where(data['Stage'] == 4,1,0)

In [7]:
# replacing catagorical data with intigers.
data['Sex'] = data['Sex'].replace({'M':0, 'F':1})                                # Male : 0 , Female :1
data['Ascites'] = data['Ascites'].replace({'N':0, 'Y':1})                        # N : 0, Y : 1   
data['Drug'] = data['Drug'].replace({'D-penicillamine':0, 'Placebo':1})          # D-penicillamine : 0, Placebo : 1   
data['Hepatomegaly'] = data['Hepatomegaly'].replace({'N':0, 'Y':1})              # N : 0, Y : 1
data['Spiders'] = data['Spiders'].replace({'N':0, 'Y':1})                        # N : 0, Y : 1
data['Edema'] = data['Edema'].replace({'N':0, 'Y':1, 'S':-1})                    # N : 0, Y : 1, S : -1
data['Status'] = data['Status'].replace({'C':0, 'CL':1, 'D':-1})                 # 'C':0, 'CL':1, 'D':-1

In [8]:
data.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,-1,0,21464,1,1,1,1,1,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,1
1,2,4500,0,0,20617,1,0,1,1,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,0
2,3,1012,-1,0,25594,0,0,0,0,-1,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,1
3,4,1925,-1,0,19994,1,0,1,1,-1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,1
4,5,1504,1,1,13918,1,0,1,1,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,0


In [9]:
# Setting up Features and Target
X = data.drop(['Status', 'N_Days', 'Stage'], axis=1)
y = data.pop('Stage')

In [39]:
#X.to_csv('data_cirrhosis.csv', index=False)
#y.to_csv('data_y_cirrhosis.csv', index=False)

In [10]:
data = pd.read_csv("data_cirrhosis.csv")
data.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,1,400,-1,0,21464,1,1,1,1,1,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2
1,2,4500,0,0,20617,1,0,1,1,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6
2,3,1012,-1,0,25594,0,0,0,0,-1,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0
3,4,1925,-1,0,19994,1,0,1,1,-1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3
4,5,1504,1,1,13918,1,0,1,1,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9


In [11]:
data_y = pd.read_csv("data_y_cirrhosis.csv")
data_y.head()

Unnamed: 0,Stage
0,1
1,0
2,1
3,1
4,0


In [12]:
X = data.drop(['ID','Status', 'N_Days'], axis=1)
X.head()

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,0,21464,1,1,1,1,1,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2
1,0,20617,1,0,1,1,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6
2,0,25594,0,0,0,0,-1,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0
3,0,19994,1,0,1,1,-1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3
4,1,13918,1,0,1,1,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9


In [13]:
y = data_y
y.head()

Unnamed: 0,Stage
0,1
1,0
2,1
3,1
4,0


In [14]:
from Crypto.Cipher import AES
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

def encrypt_message(message, key):
    message = message.encode()
    padding = 16 - (len(message) % 16)
    message += bytes([padding] * padding)
    key = key[:16].encode()
    cipher = AES.new(key, AES.MODE_ECB)
    ciphertext = cipher.encrypt(message)
    return ciphertext

def decrypt_message(ciphertext, key):
    key = key[:16].encode()
    cipher = AES.new(key, AES.MODE_ECB)
    message = cipher.decrypt(ciphertext)
    padding = message[-1]
    message = message[:-padding]
    message = message.decode()
    return message

key = "liver cirrohosis is dangerous"

In [15]:
X_cols = list(X.columns)
for col in X_cols:
    X[col] = [encrypt_message(str(x), key) for x in X[col]]
    X[col] = [int.from_bytes(x, byteorder='big') for x in X[col]]
    X[col] = pd.to_numeric(X[col], errors='coerce').astype(float)
X.head()

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,9.729243e+37,2.5740060000000002e+38,1.806296e+38,1.806296e+38,1.806296e+38,1.806296e+38,1.806296e+38,5.081155999999999e+37,1.7636090000000003e+38,2.160335e+38,2.266499e+38,1.562603e+38,1.213955e+38,3.455182e+37,1.821998e+38,2.255421e+38
1,9.729243e+37,7.599561e+37,1.806296e+38,9.729243e+37,1.806296e+38,1.806296e+38,9.729243e+37,1.135952e+38,1.7771810000000003e+38,2.7748510000000002e+38,1.02067e+38,8.134690999999999e+37,1.546224e+38,1.605874e+38,2.296206e+38,5.797907e+37
2,9.729243e+37,2.4598730000000003e+38,9.729243e+37,9.729243e+37,9.729243e+37,9.729243e+37,3.844882e+37,9.509321999999999e+37,2.853644e+38,3.580253e+37,1.311157e+38,2.3578920000000003e+38,9.972373999999999e+37,1.638876e+38,1.646029e+38,3.017185e+38
3,9.729243e+37,5.312236e+37,1.806296e+38,9.729243e+37,1.806296e+38,1.806296e+38,3.844882e+37,1.916774e+38,9.036889e+37,1.2234450000000001e+38,1.446252e+38,3.2041000000000003e+38,3.0683580000000003e+38,7.784329999999999e+37,2.619728e+38,2.011466e+38
4,1.806296e+38,5.152918999999999e+37,1.806296e+38,9.729243e+37,1.806296e+38,1.806296e+38,9.729243e+37,3.0586880000000002e+38,2.402369e+38,2.164681e+38,1.716561e+38,1.013666e+38,1.243222e+38,3.31228e+37,2.6677360000000003e+38,2.2884540000000002e+38


In [16]:
X.to_csv('encrypted_data_cirrhosis.csv', index=False)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=42)

In [47]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [48]:
# fit model no training data
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

  return f(**kwargs)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [49]:
y_pred = xgb_model.predict(X_test)

In [50]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred.round())
print('Accuracy:', accuracy)

Accuracy: 0.6666666666666666


In [51]:
X.columns

Index(['Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema',
       'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT',
       'Tryglicerides', 'Platelets', 'Prothrombin'],
      dtype='object')

In [52]:
inp = [[0,32,1,1,1,1,1,1.2,22,33.4,55,333,121.4,22,44,2]]
cols = list(X.columns)
input_data = pd.DataFrame(inp,columns = cols) 
input_data

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin
0,0,32,1,1,1,1,1,1.2,22,33.4,55,333,121.4,22,44,2


In [53]:
for col in cols:
    input_data[col] = [encrypt_message(str(x), key) for x in input_data[col]]
    input_data[col] = [int.from_bytes(x, byteorder='big') for x in input_data[col]]        
    input_data[col] = pd.to_numeric(X[col], errors='coerce').astype(float)
print(xgb_model.predict(input_data.values))        

[1]


In [20]:
import pickle
#pickle.dump(xgb_model, open('cirrhosis_xgb.pkl', 'wb'))