Basic Data Cleaning and Preprocessing

In [1]:
import pandas as pd
import numpy as np 


In [2]:
## Load the raw dataset and preview the first rows.
## NOTE: missing entries in this file are encoded as the string '?', so pandas
## does not treat them as NaN and the columns containing them load as dtype object.
data = pd.read_csv('Heart_attack_data(csv)')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [3]:
## Inspect column dtypes and non-null counts of the raw data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         294 non-null    int64  
 1   sex         294 non-null    int64  
 2   cp          294 non-null    int64  
 3   trestbps    294 non-null    object 
 4   chol        294 non-null    object 
 5   fbs         294 non-null    object 
 6   restecg     294 non-null    object 
 7   thalach     294 non-null    object 
 8   exang       294 non-null    object 
 9   oldpeak     294 non-null    float64
 10  slope       294 non-null    object 
 11  ca          294 non-null    object 
 12  thal        294 non-null    object 
 13  num         294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB


In [4]:
## Work on an independent deep copy so the raw frame `data` stays untouched
df_copy = data.copy(deep=True)

In [5]:
## Count NaN entries per column ('?' placeholders are strings, so they are not counted here)
df_copy.isna().sum()

age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
num           0
dtype: int64

In [8]:
## List the column labels to check them for stray whitespace
df_copy.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num       '],
      dtype='object')

In [9]:
## Strip leading/trailing whitespace from every column label
df_copy = df_copy.rename(columns=str.strip)

In [10]:
## Confirm the column labels no longer carry padding
df_copy.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [11]:
## Converting object (string) columns into numerical columns:
## start by listing which columns still have dtype object
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    object 
 4   chol      294 non-null    object 
 5   fbs       294 non-null    object 
 6   restecg   294 non-null    object 
 7   thalach   294 non-null    object 
 8   exang     294 non-null    object 
 9   oldpeak   294 non-null    float64
 10  slope     294 non-null    object 
 11  ca        294 non-null    object 
 12  thal      294 non-null    object 
 13  num       294 non-null    int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB


In [None]:
## For 'trestbps' column: list the distinct raw values to spot non-numeric entries
df_copy['trestbps'].unique()

array(['130', '120', '140', '170', '100', '105', '110', '125', '150',
       '98', '112', '145', '190', '160', '115', '142', '180', '132',
       '135', '?', '108', '124', '113', '122', '92', '118', '106', '200',
       '138', '136', '128', '155'], dtype=object)

In [15]:
## Most frequent 'trestbps' value (candidate fill value for the '?' placeholder)
df_copy['trestbps'].mode()

0    120
Name: trestbps, dtype: object

In [16]:
## Fill the '?' placeholder with the column's most frequent real value.
## The mode is computed from the data (ignoring '?') instead of being hard-coded,
## so the cell stays correct if the input file changes.
trestbps_mode = df_copy.loc[df_copy['trestbps'] != '?', 'trestbps'].mode().iloc[0]
df_copy['trestbps'] = df_copy['trestbps'].replace('?', trestbps_mode)

In [17]:
## Verify that '?' no longer appears among the 'trestbps' values
df_copy['trestbps'].unique()

array(['130', '120', '140', '170', '100', '105', '110', '125', '150',
       '98', '112', '145', '190', '160', '115', '142', '180', '132',
       '135', '108', '124', '113', '122', '92', '118', '106', '200',
       '138', '136', '128', '155'], dtype=object)

In [18]:
## Cast the cleaned 'trestbps' strings to integers
df_copy = df_copy.astype({'trestbps': int})

In [19]:
## Check that 'trestbps' now has an integer dtype
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   chol      294 non-null    object 
 5   fbs       294 non-null    object 
 6   restecg   294 non-null    object 
 7   thalach   294 non-null    object 
 8   exang     294 non-null    object 
 9   oldpeak   294 non-null    float64
 10  slope     294 non-null    object 
 11  ca        294 non-null    object 
 12  thal      294 non-null    object 
 13  num       294 non-null    int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 32.3+ KB


In [20]:
## For 'chol' column: list the distinct raw values to spot non-numeric entries
df_copy['chol'].unique()

array(['132', '243', '?', '237', '219', '198', '225', '254', '298', '161',
       '214', '220', '160', '167', '308', '264', '166', '340', '209',
       '260', '211', '173', '283', '194', '223', '315', '275', '297',
       '292', '182', '200', '204', '241', '339', '147', '273', '307',
       '289', '215', '281', '250', '184', '245', '291', '295', '269',
       '196', '268', '228', '358', '201', '249', '266', '186', '207',
       '218', '412', '224', '238', '230', '163', '240', '280', '257',
       '263', '276', '284', '195', '227', '253', '187', '202', '328',
       '168', '216', '129', '190', '188', '179', '210', '272', '180',
       '100', '259', '468', '274', '320', '221', '309', '312', '171',
       '208', '246', '305', '217', '365', '344', '394', '256', '326',
       '277', '270', '229', '85', '347', '251', '222', '287', '318',
       '213', '294', '193', '271', '156', '267', '282', '117', '466',
       '247', '226', '265', '206', '288', '303', '338', '248', '306',
       '529', '3

In [26]:
## Show the rows whose 'chol' entry is not a purely numeric string
chol_not_numeric = ~df_copy['chol'].str.isnumeric()
df_copy.loc[chol_not_numeric]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
31,39,1,2,120,?,0,1,146,0,2.0,1,?,?,0
34,39,1,2,130,?,0,0,120,0,0.0,?,?,?,0
44,40,1,3,140,?,0,0,188,0,0.0,?,?,?,0
65,43,0,3,150,?,0,0,175,0,0.0,?,?,3,0
72,45,0,2,180,?,0,0,180,0,0.0,?,?,?,0
75,45,1,3,135,?,0,0,110,0,0.0,?,?,?,0
86,47,0,3,130,?,0,0,145,0,2.0,2,?,?,0
91,48,0,2,120,?,1,1,148,0,0.0,?,?,?,0
97,48,1,2,100,?,0,0,100,0,0.0,?,?,?,0


In [29]:
## Remove the 'chol' column from the working frame.
## NOTE(review): unlike 'trestbps', the '?' entries are not imputed here —
## confirm that dropping the whole column is intended.
df_copy = df_copy.drop(columns=['chol'])

In [30]:
## Confirm 'chol' has been removed
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    object 
 5   restecg   294 non-null    object 
 6   thalach   294 non-null    object 
 7   exang     294 non-null    object 
 8   oldpeak   294 non-null    float64
 9   slope     294 non-null    object 
 10  ca        294 non-null    object 
 11  thal      294 non-null    object 
 12  num       294 non-null    int64  
dtypes: float64(1), int64(5), object(7)
memory usage: 30.0+ KB


In [32]:
## For 'fbs' column: distinct values, then how often each one occurs
print(df_copy['fbs'].unique(), df_copy['fbs'].value_counts(), sep='\n')

['0' '?' '1']
fbs
0    266
1     20
?      8
Name: count, dtype: int64


In [34]:
## Replace the '?' placeholder with '0', the most frequent 'fbs' value.
## Series.replace matches the whole cell literally (same call as used for 'trestbps');
## .str.replace('?', ...) interprets '?' as a regex on pandas < 2.0 and raises there.
df_copy['fbs'] = df_copy['fbs'].replace('?', '0')

In [35]:
## Re-check the 'fbs' values and their counts after the replacement
print(df_copy['fbs'].unique(), df_copy['fbs'].value_counts(), sep='\n')

['0' '1']
fbs
0    274
1     20
Name: count, dtype: int64


In [36]:
## Cast the cleaned 'fbs' strings to integers
df_copy = df_copy.astype({'fbs': int})

In [37]:
## Check that 'fbs' now has an integer dtype
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    object 
 6   thalach   294 non-null    object 
 7   exang     294 non-null    object 
 8   oldpeak   294 non-null    float64
 9   slope     294 non-null    object 
 10  ca        294 non-null    object 
 11  thal      294 non-null    object 
 12  num       294 non-null    int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 30.0+ KB


In [38]:
## For 'restecg' column: distinct values, then how often each one occurs
print(df_copy['restecg'].unique(), df_copy['restecg'].value_counts(), sep='\n')

['2' '0' '1' '?']
restecg
0    235
1     52
2      6
?      1
Name: count, dtype: int64


In [39]:
## Replace the '?' placeholder with '0', the most frequent 'restecg' value.
## Series.replace matches the whole cell literally (same call as used for 'trestbps');
## .str.replace('?', ...) interprets '?' as a regex on pandas < 2.0 and raises there.
df_copy['restecg'] = df_copy['restecg'].replace('?', '0')

In [40]:
## Cast the cleaned 'restecg' strings to integers
df_copy = df_copy.astype({'restecg': int})

In [41]:
## Check that 'restecg' now has an integer dtype
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    int64  
 6   thalach   294 non-null    object 
 7   exang     294 non-null    object 
 8   oldpeak   294 non-null    float64
 9   slope     294 non-null    object 
 10  ca        294 non-null    object 
 11  thal      294 non-null    object 
 12  num       294 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 30.0+ KB


In [42]:
## For 'thalach' column: distinct values, then how often each one occurs
print(df_copy['thalach'].unique(), df_copy['thalach'].value_counts(), sep='\n')

['185' '160' '170' '150' '165' '184' '155' '190' '168' '180' '178' '172'
 '130' '142' '98' '158' '129' '146' '145' '120' '106' '132' '140' '138'
 '167' '188' '144' '137' '136' '152' '175' '176' '118' '154' '115' '135'
 '122' '110' '90' '116' '174' '125' '?' '148' '100' '164' '139' '127'
 '162' '112' '134' '114' '128' '126' '124' '153' '166' '103' '156' '87'
 '102' '92' '99' '121' '91' '108' '96' '82' '105' '143' '119' '94']
thalach
150    29
140    21
130    17
170    14
160    13
       ..
91      1
82      1
105     1
143     1
119     1
Name: count, Length: 72, dtype: int64


In [45]:
## Most frequent 'thalach' value (candidate fill value for the '?' placeholder)
df_copy['thalach'].mode()

0    150
Name: thalach, dtype: object

In [None]:
## Fill the '?' placeholder with the column's most frequent real value.
## The mode is computed from the data (ignoring '?') instead of being hard-coded,
## so the cell stays correct if the input file changes.
thalach_mode = df_copy.loc[df_copy['thalach'] != '?', 'thalach'].mode().iloc[0]
df_copy['thalach'] = df_copy['thalach'].replace('?', thalach_mode)

In [47]:
## Cast the cleaned 'thalach' strings to integers
df_copy = df_copy.astype({'thalach': int})

In [48]:
## Check that 'thalach' now has an integer dtype
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    int64  
 6   thalach   294 non-null    int64  
 7   exang     294 non-null    object 
 8   oldpeak   294 non-null    float64
 9   slope     294 non-null    object 
 10  ca        294 non-null    object 
 11  thal      294 non-null    object 
 12  num       294 non-null    int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 30.0+ KB


In [49]:
## For 'exang' column: distinct values, then how often each one occurs
print(df_copy['exang'].unique(), df_copy['exang'].value_counts(), sep='\n')

['0' '1' '?']
exang
0    204
1     89
?      1
Name: count, dtype: int64


In [50]:
## Replace the '?' placeholder with '0', the most frequent 'exang' value.
## Series.replace matches the whole cell literally (same call as used for 'trestbps');
## .str.replace('?', ...) interprets '?' as a regex on pandas < 2.0 and raises there.
df_copy['exang'] = df_copy['exang'].replace('?', '0')

In [51]:
## Cast the cleaned 'exang' strings to integers
df_copy = df_copy.astype({'exang': int})

In [52]:
## Check that 'exang' now has an integer dtype
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    int64  
 6   thalach   294 non-null    int64  
 7   exang     294 non-null    int64  
 8   oldpeak   294 non-null    float64
 9   slope     294 non-null    object 
 10  ca        294 non-null    object 
 11  thal      294 non-null    object 
 12  num       294 non-null    int64  
dtypes: float64(1), int64(9), object(3)
memory usage: 30.0+ KB


In [53]:
## For 'slope' column: distinct values, then how often each one occurs
print(df_copy['slope'].unique(), df_copy['slope'].value_counts(), sep='\n')

['?' '2' '1' '3']
slope
?    190
2     91
1     12
3      1
Name: count, dtype: int64


In [54]:
## Most 'slope' entries are the '?' placeholder, so remove the column entirely
df_copy = df_copy.drop(columns=['slope'])

In [55]:
## Confirm 'slope' has been removed
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    int64  
 6   thalach   294 non-null    int64  
 7   exang     294 non-null    int64  
 8   oldpeak   294 non-null    float64
 9   ca        294 non-null    object 
 10  thal      294 non-null    object 
 11  num       294 non-null    int64  
dtypes: float64(1), int64(9), object(2)
memory usage: 27.7+ KB


In [56]:
## For 'ca' column: distinct values, then how often each one occurs
print(df_copy['ca'].unique(), df_copy['ca'].value_counts(), sep='\n')

['?' '0']
ca
?    291
0      3
Name: count, dtype: int64


In [57]:
## Replace the '?' placeholder with '0', the only real value present in 'ca'.
## Series.replace matches the whole cell literally (same call as used for 'trestbps');
## .str.replace('?', ...) interprets '?' as a regex on pandas < 2.0 and raises there.
## NOTE(review): 291 of the 294 entries are '?', so after this fill every value is '0'
## and the column carries no information; 'slope' and 'thal' are dropped for being
## mostly missing — consider dropping 'ca' as well.
df_copy['ca'] = df_copy['ca'].replace('?', '0')

In [58]:
## Cast the cleaned 'ca' strings to integers
df_copy = df_copy.astype({'ca': int})

In [59]:
## Check that 'ca' now has an integer dtype
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    int64  
 6   thalach   294 non-null    int64  
 7   exang     294 non-null    int64  
 8   oldpeak   294 non-null    float64
 9   ca        294 non-null    int64  
 10  thal      294 non-null    object 
 11  num       294 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 27.7+ KB


In [60]:
## For 'thal' column: distinct values, then how often each one occurs
print(df_copy['thal'].unique(), df_copy['thal'].value_counts(), sep='\n')

['?' '6' '3' '7']
thal
?    266
7     11
6     10
3      7
Name: count, dtype: int64


In [61]:
## Most 'thal' entries are the '?' placeholder, so remove this column as well
df_copy = df_copy.drop(columns=['thal'])

In [62]:
## Final check: 'thal' is removed and no object-dtype columns remain
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       294 non-null    int64  
 1   sex       294 non-null    int64  
 2   cp        294 non-null    int64  
 3   trestbps  294 non-null    int64  
 4   fbs       294 non-null    int64  
 5   restecg   294 non-null    int64  
 6   thalach   294 non-null    int64  
 7   exang     294 non-null    int64  
 8   oldpeak   294 non-null    float64
 9   ca        294 non-null    int64  
 10  num       294 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 25.4 KB


In [63]:
## Persist the cleaned frame to disk; the row index is not written out
df_copy.to_csv(path_or_buf='Cleaned_Heart_attack_dataset.csv', index=False)
