In [400]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
%matplotlib inline

Load Dataset

In [401]:
# Read the raw CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df=pd.read_csv('../datasets/dataset_raw.csv')
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,2221,C,Placebo,18499,F,N,Y,N,N,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256.0,9.9,1
1,1230,C,Placebo,19724,M,Y,N,Y,N,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2
2,4184,C,Placebo,11839,F,N,N,N,N,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2
3,2090,D,Placebo,16467,F,N,N,N,N,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2
4,2105,D,Placebo,21699,F,N,Y,N,N,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1


In [402]:
# (rows, columns) of the frame as loaded.
df.shape

(25000, 19)

In [403]:
# Normalise headers to snake_case: trim surrounding whitespace, lower-case,
# and turn inner spaces into underscores.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]

In [404]:
# Display the normalised column names (`df_columns` was an undefined name).
df.columns

Index(['n_days', 'status', 'drug', 'age', 'sex', 'ascites', 'hepatomegaly',
       'spiders', 'edema', 'bilirubin', 'cholesterol', 'albumin', 'copper',
       'alk_phos', 'sgot', 'tryglicerides', 'platelets', 'prothrombin',
       'stage'],
      dtype='object')

In [405]:
# Per-column dtypes before any casting.
df.dtypes

n_days             int64
status            object
drug              object
age                int64
sex               object
ascites           object
hepatomegaly      object
spiders           object
edema             object
bilirubin        float64
cholesterol      float64
albumin          float64
copper           float64
alk_phos         float64
sgot             float64
tryglicerides    float64
platelets        float64
prothrombin      float64
stage              int64
dtype: object

In [406]:
# Cast the two integer-valued features to float so every numeric feature
# shares the float64 dtype.
df = df.astype({'age': float, 'n_days': float})

In [407]:
# Re-check dtypes after the float cast of 'age' and 'n_days'.
df.dtypes

n_days           float64
status            object
drug              object
age              float64
sex               object
ascites           object
hepatomegaly      object
spiders           object
edema             object
bilirubin        float64
cholesterol      float64
albumin          float64
copper           float64
alk_phos         float64
sgot             float64
tryglicerides    float64
platelets        float64
prothrombin      float64
stage              int64
dtype: object

In [408]:
# Count of missing values in each column.
df.isnull().sum()

n_days           0
status           0
drug             0
age              0
sex              0
ascites          0
hepatomegaly     0
spiders          0
edema            0
bilirubin        0
cholesterol      0
albumin          0
copper           0
alk_phos         0
sgot             0
tryglicerides    0
platelets        0
prothrombin      0
stage            0
dtype: int64

Conclusion: No imputation required

Removing the duplicates

In [409]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(15361)

In [410]:
# Remove exact duplicate rows, keeping the first occurrence of each, then
# display the resulting shape. The original row labels are retained (no reindex).
df=df.drop_duplicates()
df.shape

(9639, 19)

In [411]:
# Numeric feature names: every non-object column except the target 'stage'.
num_col = [
    name
    for name, dtype in df.dtypes.items()
    if name != 'stage' and dtype != 'object'
]

In [412]:
def detect_outliers_iqr(data, col, k=1.5):
    """Find the rows of ``data`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``data[col]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    col : hashable
        Label of a numeric column in ``data``.
    k : float, default 1.5
        Fence multiplier. The default reproduces the conventional 1.5 * IQR
        rule; larger values flag only more extreme points.

    Returns
    -------
    lower : float
        Lower fence.
    upper : float
        Upper fence.
    outliers : pandas.DataFrame
        Rows of ``data`` strictly below ``lower`` or strictly above ``upper``.
        Missing values compare False on both sides and are never flagged.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1

    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # Strict inequalities: values sitting exactly on a fence are not outliers.
    outliers = data[(data[col] < lower) | (data[col] > upper)]
    return lower, upper, outliers


In [413]:
# Report the IQR fences and the number of flagged rows for each numeric feature.
for col in num_col:
    lower, upper, outliers = detect_outliers_iqr(df, col)
    print('Column =', col)
    print("Lower bound:", lower)
    print("Upper bound:", upper)
    print("Number of outliers:", len(outliers), '\n')


Column = n_days
Lower bound: -1139.5
Upper bound: 4840.5
Number of outliers: 0 

Column = age
Lower bound: 7841.5
Upper bound: 28605.5
Number of outliers: 19 

Column = bilirubin
Lower bound: -2.95
Upper bound: 7.05
Number of outliers: 1192 

Column = cholesterol
Lower bound: 123.23415489999996
Upper bound: 517.2764085000001
Number of outliers: 863 

Column = albumin
Lower bound: 2.5850000000000004
Upper bound: 4.465
Number of outliers: 281 

Column = copper
Lower bound: -25.5
Upper bound: 178.5
Number of outliers: 953 

Column = alk_phos
Lower bound: -396.48365349999995
Upper bound: 3410.1394225
Number of outliers: 822 

Column = sgot
Lower bound: 22.475000000000023
Upper bound: 202.27499999999998
Number of outliers: 627 

Column = tryglicerides
Lower bound: 45.0
Upper bound: 173.0
Number of outliers: 1104 

Column = platelets
Lower bound: 9.5
Upper bound: 485.5
Number of outliers: 173 

Column = prothrombin
Lower bound: 8.350000000000001
Upper bound: 12.75
Number of outliers: 284 



Log transformation of the columns:
- bilirubin
- cholesterol
- copper
- alk_phos
- sgot


In [414]:
# Apply log(1 + x) to the selected lab measurements in one vectorised step.
log_col = ['bilirubin', 'cholesterol', 'copper', 'alk_phos', 'sgot']
df[log_col] = np.log1p(df[log_col])
# 'tryglicerides' is intentionally left untransformed: log1p was observed to
# increase its outliers rather than reduce them.

In [415]:
# Re-run the IQR report on the log-transformed columns to see the effect.
for col in ('bilirubin', 'cholesterol', 'copper', 'alk_phos', 'sgot'):
    lower, upper, outliers = detect_outliers_iqr(df, col)
    print('Column =', col)
    print("Lower bound:", lower)
    print("Upper bound:", upper)
    print("Number of outliers:", len(outliers), '\n')

Column = bilirubin
Lower bound: -0.7184558717939775
Upper bound: 2.7648575593956135
Number of outliers: 411 

Column = cholesterol
Lower bound: 5.142182232443817
Upper bound: 6.378501789382964
Number of outliers: 743 

Column = copper
Lower bound: 2.926015814109115
Upper bound: 5.659956892701948
Number of outliers: 584 

Column = alk_phos
Lower bound: 5.959089711209099
Upper bound: 8.572861004095522
Number of outliers: 656 

Column = sgot
Lower bound: 3.907072999150047
Upper bound: 5.514238337905743
Number of outliers: 276 



Encoding the categorical features

In [416]:
# One-hot encode the categorical features; every category level gets its own
# indicator column (no level is dropped).
cat_col = ['status', 'drug', 'sex', 'ascites', 'hepatomegaly', 'spiders', 'edema']
df = pd.get_dummies(df, columns=cat_col)

In [417]:
# Separate the dependent variable (the 'stage' column) from the
# independent variables (every other column).
y = df['stage']
X = df.drop(columns='stage')

In [418]:
# Preview the feature matrix after encoding.
X.head()

Unnamed: 0,n_days,age,bilirubin,cholesterol,albumin,copper,alk_phos,sgot,tryglicerides,platelets,...,sex_M,ascites_N,ascites_Y,hepatomegaly_N,hepatomegaly_Y,spiders_N,spiders_Y,edema_N,edema_S,edema_Y
0,2221.0,18499.0,0.405465,5.010635,4.04,5.429346,6.395262,3.983413,57.0,256.0,...,False,True,False,False,True,True,False,True,False,False
1,1230.0,19724.0,0.405465,5.393628,3.93,3.135494,6.498282,3.828641,75.0,220.0,...,True,False,True,True,False,False,True,True,False,False
2,4184.0,11839.0,0.405465,5.771441,3.54,3.951244,7.126087,4.815836,80.0,225.0,...,False,True,False,True,False,True,False,True,False,False
3,2090.0,16467.0,0.530628,5.545177,3.74,3.178054,6.932448,4.363099,58.0,151.0,...,False,True,False,True,False,True,False,True,False,False
4,2105.0,21699.0,1.064711,6.188264,3.54,4.317488,6.959399,4.695925,109.0,151.0,...,False,True,False,False,True,True,False,True,False,False


Train test split

In [419]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [420]:
# Preview the training features; row labels are the original (shuffled) indices.
X_train.head()

Unnamed: 0,n_days,age,bilirubin,cholesterol,albumin,copper,alk_phos,sgot,tryglicerides,platelets,...,sex_M,ascites_N,ascites_Y,hepatomegaly_N,hepatomegaly_Y,spiders_N,spiders_Y,edema_N,edema_S,edema_Y
209,3445.0,20684.0,1.098612,5.590987,3.67,4.49981,6.626718,5.287509,90.0,269.0,...,False,True,False,False,True,True,False,True,False,False
630,1443.0,14975.0,0.788457,5.914882,2.8,4.591562,7.592697,4.816697,124.702128,120.0,...,True,False,True,True,False,False,True,True,False,False
563,1492.0,13995.0,1.435085,5.914882,3.56,4.356709,7.490529,4.945207,124.702128,309.0,...,False,True,False,True,False,True,False,True,False,False
3065,597.0,19724.0,1.458615,5.914882,2.73,4.591562,7.592697,4.816697,124.702128,325.0,...,True,False,True,True,False,False,True,False,True,False
1654,935.0,20736.0,1.648659,5.914882,3.19,4.591562,7.592697,4.816697,124.702128,382.0,...,False,True,False,True,False,True,False,True,False,False


Standardization (feature scaling)

In [421]:
# Columns handed to the scaler: every non-object column of X_train.
# The boolean one-hot indicator columns pass this filter too, so they are
# standardised along with the continuous features.
scale_col = [name for name, dtype in X_train.dtypes.items() if dtype != 'object']
scale_col

['n_days',
 'age',
 'bilirubin',
 'cholesterol',
 'albumin',
 'copper',
 'alk_phos',
 'sgot',
 'tryglicerides',
 'platelets',
 'prothrombin',
 'status_C',
 'status_CL',
 'status_D',
 'drug_D-penicillamine',
 'drug_Placebo',
 'sex_F',
 'sex_M',
 'ascites_N',
 'ascites_Y',
 'hepatomegaly_N',
 'hepatomegaly_Y',
 'spiders_N',
 'spiders_Y',
 'edema_N',
 'edema_S',
 'edema_Y']

In [422]:
from sklearn.preprocessing import StandardScaler
# Scaler instance with default settings; it is fitted in the next cell.
sc=StandardScaler()

In [423]:
# Fit the scaler on the training rows only, then apply those training
# statistics to both splits so no test-set information leaks into the fit.
sc.fit(X_train[scale_col])
X_train_scaled = sc.transform(X_train[scale_col])
X_test_scaled = sc.transform(X_test[scale_col])

In [424]:
X_train_scaled  # The scaler returns a plain NumPy array; it is converted back to a DataFrame below.

array([[ 1.39527716,  0.61569801, -0.02573606, ...,  0.44194859,
        -0.38089978, -0.19518987],
       [-0.42686392, -0.93591576, -0.46512753, ...,  0.44194859,
        -0.38089978, -0.19518987],
       [-0.38226606, -1.20226389,  0.45093872, ...,  0.44194859,
        -0.38089978, -0.19518987],
       ...,
       [-1.41347777,  1.54601398,  0.27137162, ...,  0.44194859,
        -0.38089978, -0.19518987],
       [ 0.01183338,  0.42816718, -0.12347714, ..., -2.26270662,
         2.62536248, -0.19518987],
       [-0.19204254, -0.39452241, -0.53103168, ...,  0.44194859,
        -0.38089978, -0.19518987]], shape=(7711, 27))

In [425]:
# Wrap the scaled arrays back into DataFrames, restoring the column names and
# the row labels of the corresponding split.
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=scale_col)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=scale_col)

In [426]:
# X_train still holds the unscaled values at this point; shown for comparison.
X_train.head()

Unnamed: 0,n_days,age,bilirubin,cholesterol,albumin,copper,alk_phos,sgot,tryglicerides,platelets,...,sex_M,ascites_N,ascites_Y,hepatomegaly_N,hepatomegaly_Y,spiders_N,spiders_Y,edema_N,edema_S,edema_Y
209,3445.0,20684.0,1.098612,5.590987,3.67,4.49981,6.626718,5.287509,90.0,269.0,...,False,True,False,False,True,True,False,True,False,False
630,1443.0,14975.0,0.788457,5.914882,2.8,4.591562,7.592697,4.816697,124.702128,120.0,...,True,False,True,True,False,False,True,True,False,False
563,1492.0,13995.0,1.435085,5.914882,3.56,4.356709,7.490529,4.945207,124.702128,309.0,...,False,True,False,True,False,True,False,True,False,False
3065,597.0,19724.0,1.458615,5.914882,2.73,4.591562,7.592697,4.816697,124.702128,325.0,...,True,False,True,True,False,False,True,False,True,False
1654,935.0,20736.0,1.648659,5.914882,3.19,4.591562,7.592697,4.816697,124.702128,382.0,...,False,True,False,True,False,True,False,True,False,False


In [427]:
# Swap the unscaled columns for their scaled counterparts: drop the originals,
# then attach the scaled frames column-wise (rows align on the shared index).
X_train = pd.concat([X_train.drop(columns=scale_col), X_train_scaled], axis=1)
X_test = pd.concat([X_test.drop(columns=scale_col), X_test_scaled], axis=1)

In [428]:
# Re-attach the target column to each split so features and label are stored together.
train_data, test_data = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [429]:
# Resetting the index: replace the shuffled original row labels with 0..n-1.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [431]:
# Write both splits to CSV. With the default index=True the row index is also
# written as an unnamed first column.
# NOTE(review): 'train_datset.csv' looks like a typo for 'train_dataset.csv' —
# confirm no downstream reader depends on this name before renaming.
train_data.to_csv('../datasets/train_datset.csv')
test_data.to_csv('../datasets/test_dataset.csv')

In [434]:
import pickle as pkl

# Serialise the fitted scaler to disk.
with open('../models/StandardScaler.pkl', 'wb') as file:
    file.write(pkl.dumps(sc))