## Data Preprocessing :
1. Check the data types of all columns - `df.dtypes` <br>
2. Check how many null values there are in each column - `df.isnull().sum()` <br>
3. Remove the rows that have a missing value for the target variable (`sgot`) <br>
4. Remove the columns that have too few values, i.e. whose percentage of missing values exceeds 15% <br>
5. Find the number of occurrences of each value in every column <br>
6. Find the mode of every column <br>
7. Fill the missing values with the mean (numeric columns) or the mode (categorical columns) <br>
8. Perform one-hot encoding for categorical data <br>

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw hepatitis dataset; the target variable for this notebook is `sgot`.
raw_csv_path = "C:/Users/saksh/Downloads/hepatitis.csv"
df = pd.read_csv(raw_csv_path)

# Print the frame, then leave its (rows, columns) shape as the cell's displayed value.
print(df)
df.shape

     age     sex steroid  antivirals fatigue malaise anorexia liver_big  \
0     30    male   False       False   False   False    False     False   
1     50  female   False       False    True   False    False     False   
2     78  female    True       False    True   False    False      True   
3     31  female     NaN        True   False   False    False      True   
4     34  female    True       False   False   False    False      True   
..   ...     ...     ...         ...     ...     ...      ...       ...   
150   46  female    True       False    True    True     True      True   
151   44  female    True       False    True   False    False      True   
152   61  female   False       False    True    True    False     False   
153   53    male   False       False    True   False    False      True   
154   43  female    True       False    True   False    False      True   

    liver_firm spleen_palpable spiders ascites varices  bilirubin  \
0        False           False

(155, 20)

In [5]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
df.dtypes

age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
protime            float64
histology             bool
class               object
dtype: object

In [6]:
# Percentage of null values per column: null count divided by the number of rows, times 100.
null_counts = df.isna().sum()
null_var = null_counts / len(df) * 100
print(null_var)

age                 0.000000
sex                 0.000000
steroid             0.645161
antivirals          0.000000
fatigue             0.645161
malaise             0.645161
anorexia            0.645161
liver_big           6.451613
liver_firm          7.096774
spleen_palpable     3.225806
spiders             3.225806
ascites             3.225806
varices             3.225806
bilirubin           3.870968
alk_phosphate      18.709677
sgot                2.580645
albumin            10.322581
protime            43.225806
histology           0.000000
class               0.000000
dtype: float64


In [7]:
# Drop the rows that have a missing value for the target variable (sgot) and
# renumber the remaining rows from 0.
df = df.dropna(subset=['sgot']).reset_index(drop=True)

# Drop the columns that have very few values: those whose percentage of null
# values (null_var, computed in an earlier cell) is greater than 15%.
# Only labels still present in df are kept, so re-running this cell after the
# columns have already been removed does not raise a KeyError.
sparse_columns = null_var[null_var > 15].keys()
drop_columns = [column for column in sparse_columns if column in df.columns]
df = df.drop(columns=drop_columns)

# Displayed value of the cell: (rows, columns) left after both drops.
df.shape

(151, 18)

In [8]:
# Recompute the percentage of null values per column to verify the result of the drops.
null_counts = df.isna().sum()
null_var = null_counts / len(df) * 100
print(null_var)

age                0.000000
sex                0.000000
steroid            0.662252
antivirals         0.000000
fatigue            0.000000
malaise            0.000000
anorexia           0.000000
liver_big          5.298013
liver_firm         5.960265
spleen_palpable    2.649007
spiders            2.649007
ascites            2.649007
varices            2.649007
bilirubin          1.986755
sgot               0.000000
albumin            7.947020
histology          0.000000
class              0.000000
dtype: float64


In [9]:
# Print the frequency table (count of each distinct value) of every column in turn.

for col_name in df.columns:
    frequencies = df[col_name].value_counts()
    print(frequencies)

age
30    8
38    8
36    7
34    7
39    6
50    6
45    5
37    5
54    5
28    5
42    5
51    5
44    5
32    4
47    4
31    4
27    4
56    3
33    3
52    3
20    3
23    3
41    3
40    3
49    3
25    3
35    2
22    2
48    2
24    2
57    2
58    2
61    2
62    2
65    1
26    1
64    1
67    1
59    1
60    1
66    1
78    1
7     1
69    1
72    1
70    1
46    1
53    1
43    1
Name: count, dtype: int64
sex
female    135
male       16
Name: count, dtype: int64
steroid
True     77
False    73
Name: count, dtype: int64
antivirals
False    127
True      24
Name: count, dtype: int64
fatigue
True     98
False    53
Name: count, dtype: int64
malaise
False    91
True     60
Name: count, dtype: int64
anorexia
False    120
True      31
Name: count, dtype: int64
liver_big
True     118
False     25
Name: count, dtype: int64
liver_firm
False    82
True     60
Name: count, dtype: int64
spleen_palpable
False    118
True      29
Name: count, dtype: int64
spiders
False    98
True     49

In [10]:
# Print the highest-frequency value (mode) of each column.

for col_name in df.columns:
    # idxmax() on the frequency table returns the value with the largest count.
    most_frequent = df[col_name].value_counts().idxmax()
    print(f'{col_name} : {most_frequent}')

age : 30
sex : female
steroid : True
antivirals : False
fatigue : True
malaise : False
anorexia : False
liver_big : True
liver_firm : False
spleen_palpable : False
spiders : False
ascites : False
varices : False
bilirubin : 1.0
sgot : 20.0
albumin : 4.0
histology : False
class : live


In [11]:
# Re-check the column dtypes before imputing; the next cell chooses mean vs. mode by dtype.
df.dtypes

age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
sgot               float64
albumin            float64
histology             bool
class               object
dtype: object

In [12]:
# Impute the remaining missing values: the column mean for numeric columns and
# the column mode for categorical (object) columns.
#
# Values are written back with explicit assignment (df.loc / df[column] = ...)
# rather than df[column].fillna(..., inplace=True): the in-place form acts on an
# intermediate Series and is not guaranteed to update df (it does not under
# pandas Copy-on-Write).

# Numeric columns are the ones stored as int64 or float64.
continuous_data_columns = [
    column for column in df.columns
    if df[column].dtype in ['int64', 'float64']
]
print(continuous_data_columns)

# Replace the NaN values with the mean in the numeric columns.
for column in continuous_data_columns:
    column_mean = df[column].mean()
    print(f'{column} : {column_mean}')
    missing = df[column].isna()
    # Only assign when something is missing, so a complete integer column is
    # not touched (and not upcast) by a float mean.
    if missing.any():
        df.loc[missing, column] = column_mean

# Replace the NaN values with the mode in the object columns.
for column in df.columns:
    if df[column].dtype in ['object']:
        # NOTE(review): the printed modes are True/False rather than the strings
        # 'TRUE'/'FALSE', so this mapping probably matches nothing - confirm
        # whether it is still needed.
        df[column] = df[column].replace({'TRUE': 1, 'FALSE': 0})
        column_mode = df[column].mode().iloc[0]
        print(f'{column} : {column_mode}')
        missing = df[column].isna()
        if missing.any():
            df.loc[missing, column] = column_mode
        


['age', 'bilirubin', 'sgot', 'albumin']
age : 41.17218543046358
bilirubin : 1.4304054054054056
sgot : 85.89403973509934
albumin : 3.8172661870503606
sex : female
steroid : True
fatigue : True
malaise : False
anorexia : False
liver_big : True
liver_firm : False
spleen_palpable : False
spiders : False
ascites : False
varices : False
class : live


In [13]:
# Print the frame after imputation to inspect the filled values.
print(df)

     age     sex  steroid  antivirals  fatigue  malaise  anorexia  liver_big  \
0     30    male    False       False    False    False     False      False   
1     50  female    False       False     True    False     False      False   
2     78  female     True       False     True    False     False       True   
3     31  female     True        True    False    False     False       True   
4     34  female     True       False    False    False     False       True   
..   ...     ...      ...         ...      ...      ...       ...        ...   
146   46  female     True       False     True     True      True       True   
147   44  female     True       False     True    False     False       True   
148   61  female    False       False     True     True     False      False   
149   53    male    False       False     True    False     False       True   
150   43  female     True       False     True    False     False       True   

     liver_firm  spleen_palpable  spide

In [16]:
# Write the cleaned frame to disk.
# NOTE(review): with to_csv's default index=True the row index is written as an
# unnamed first column - confirm downstream readers expect that, otherwise pass index=False.
cleaned_csv_path = 'C:/Users/saksh/Downloads/hepatitis_cleaned.csv'
df.to_csv(cleaned_csv_path)