# CATEGORICAL DATA EXERCISE

In [None]:
# Mount Google Drive into the Colab runtime; the dataset is read from this mount later on.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Imports

In [None]:
import pandas as pd

# Problem: The Heart Dataset

## Dataset Description

File name: 'D3_Heart_Dataset.csv'

This dataset has been obtained from Kaggle: https://www.kaggle.com/fedesoriano/heart-failure-prediction

The data contains 918 observations with 12 attributes as described below:
1. Age: patient's age, range: 28 to 77.
2. Gender: patient's gender, M (79%), F (21%).
3. ChestPainType: ASY (54%), NAP (22%), Other(24%).
4. RestingBP: resting blood pressure, range: 0 to 200.
5. Cholesterol: serum cholesterol, range: 0 to 603.
6. FastingBS: fasting blood sugar, 0 or 1.
7. RestingECG: resting electrocardiogram results, Normal (60%), LVH (20%), Other (19%).
8. MaxHR: maximum heart rate achieved, range: 60 to 202.
9. ExerciseAngina: exercise-induced angina, true/'Y' (371 - 40%), false/'N' (547 - 60%).
10. Oldpeak: old peak = ST, range: -2.6 to 6.2.
11. ST_Slope: ST slope, Up, Flat or Down.
12. HeartDisease: target, 1 or 0.

Last column indicates presence of heart disease given the remaining 11 attributes.

This is a binary classification problem.

Contains categorical data, otherwise the dataset is clean.

## Loading Data

In [None]:
# Location of the heart dataset CSV on the mounted Google Drive
dataset_path = '/content/drive/MyDrive/ML_PP/PP1/D3_Heart_Dataset.csv'
# Parse the CSV into a pandas DataFrame
data = pd.read_csv(dataset_path)
# Leaving the frame as the last expression makes the notebook render it
data

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


## Exploring Data

In [None]:
# Check what kind of object read_csv returned (the output shows a pandas DataFrame)
type(data)

pandas.core.frame.DataFrame

In [None]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()
# The summary shows that every column has 918 entries.
# None of the columns contains a null value.
# There are 5 attributes with dtype 'object' (strings).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Gender          918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


## Encoding Categorical Data

### Consider the 'ExerciseAngina' column first and apply dummy variable encoding.
- This is a binary variable: its presence counts towards an increased risk of heart disease, and its absence means otherwise.
- It can be encoded using dummy variable encoding.

In [None]:
# value_counts() counts how many times each distinct value occurs in a column.
data['ExerciseAngina'].value_counts()
# The result shows two possible values for this attribute: 'Y' and 'N'.
# There are 547 entries for 'N' and 371 entries for 'Y'.

N    547
Y    371
Name: ExerciseAngina, dtype: int64

In [None]:
# A simple way to dummy-encode 'ExerciseAngina' is Series.replace with a
# label -> number mapping: 'Y' becomes 1 and 'N' becomes 0.
angina_codes = {'Y': 1, 'N': 0}
data['ExerciseAngina'] = data['ExerciseAngina'].replace(angina_codes)
data
# Observe that the values in the 'ExerciseAngina' column have been changed to 0 and 1.

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,0,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,0,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,0,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,1,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,0,0.0,Flat,1


### Now consider the 'ChestPainType' column and apply ordinal encoding.

In [None]:
# Count how many rows carry each distinct 'ChestPainType' value.
data['ChestPainType'].value_counts()
# This column contains 4 different values: ASY, NAP, ATA and TA.
# NOTE(review): the saved output below lists 4/3/2/1 instead of these labels, which
# suggests the cell was last run after the ordinal-encoding cell - re-run from a fresh kernel to confirm.

4    496
3    203
2    173
1     46
Name: ChestPainType, dtype: int64

Let us use Ordinal encoding as follows:
- TA (typical angina): 1
- ATA (atypical angina): 2
- NAP (non-anginal pain): 3
- ASY (asymptomatic): 4

In [None]:
# Ordinal-encode 'ChestPainType' with the mapping listed above:
# TA -> 1, ATA -> 2, NAP -> 3, ASY -> 4.
chest_pain_codes = {'TA': 1, 'ATA': 2, 'NAP': 3, 'ASY': 4}
data['ChestPainType'] = data['ChestPainType'].replace(chest_pain_codes)
data

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,2,140,289,0,Normal,172,0,0.0,Up,0
1,49,F,3,160,180,0,Normal,156,0,1.0,Flat,1
2,37,M,2,130,283,0,ST,98,0,0.0,Up,0
3,48,F,4,138,214,0,Normal,108,1,1.5,Flat,1
4,54,M,3,150,195,0,Normal,122,0,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,1,110,264,0,Normal,132,0,1.2,Flat,1
914,68,M,4,144,193,1,Normal,141,0,3.4,Flat,1
915,57,M,4,130,131,0,Normal,115,1,1.2,Flat,1
916,57,F,2,130,236,0,LVH,174,0,0.0,Flat,1


### Now consider the 'Gender' column and apply one-hot encoding.

In [None]:
# pandas.get_dummies performs one-hot encoding: the 'Gender' column is replaced by
# one indicator column per category.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise emit True/False.
data = pd.get_dummies(data, columns=['Gender'], dtype=int)

# The resulting table has two indicator columns, Gender_F and Gender_M, in place of the single 'Gender' column.

In [None]:
# All the changes made so far live only in the DataFrame, not in the original CSV file.
# DataFrame.to_csv writes the frame to a new CSV file.
# index=False leaves out the row index, which would otherwise be written as an extra
# unnamed first column and come back as 'Unnamed: 0' when the file is read again.
data.to_csv('D3_Heart_Dataset_Clean.csv', index=False)

In [None]:
#TASK FOR YOU
#Try different types of encoding on the remaining categorical features.

In [None]:
# Count how many rows carry each distinct 'RestingECG' value.
# NOTE(review): the saved output lists 1/3/2 rather than text labels, so this cell was
# presumably last run after the encoding cell below - confirm with a fresh Run All.
data['RestingECG'].value_counts()


1    552
3    188
2    178
Name: RestingECG, dtype: int64

In [None]:
# Ordinal-encode 'RestingECG': Normal -> 1, ST -> 2, LVH -> 3.
resting_ecg_codes = {'Normal': 1, 'ST': 2, 'LVH': 3}
data['RestingECG'] = data['RestingECG'].replace(resting_ecg_codes)
data

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Gender_F,Gender_M
0,40,2,140,289,0,1,172,0,0.0,Up,0,0,1
1,49,3,160,180,0,1,156,0,1.0,Flat,1,1,0
2,37,2,130,283,0,2,98,0,0.0,Up,0,0,1
3,48,4,138,214,0,1,108,1,1.5,Flat,1,1,0
4,54,3,150,195,0,1,122,0,0.0,Up,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,1,132,0,1.2,Flat,1,0,1
914,68,4,144,193,1,1,141,0,3.4,Flat,1,0,1
915,57,4,130,131,0,1,115,1,1.2,Flat,1,0,1
916,57,2,130,236,0,3,174,0,0.0,Flat,1,1,0


In [None]:
# Count how many rows carry each distinct 'ST_Slope' value (the output shows Flat, Up and Down).
data['ST_Slope'].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [None]:
# Ordinal-encode 'ST_Slope': Flat -> 1, Up -> 2, Down -> 3.
# The labels must be read from 'ST_Slope' itself; reading them from 'ChestPainType'
# would just copy that column's values into 'ST_Slope'.
st_slope_codes = {'Flat': 1, 'Up': 2, 'Down': 3}
data['ST_Slope'] = data['ST_Slope'].replace(st_slope_codes)
data

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Gender_F,Gender_M
0,40,2,140,289,0,1,172,0,0.0,2,0,0,1
1,49,3,160,180,0,1,156,0,1.0,3,1,1,0
2,37,2,130,283,0,2,98,0,0.0,2,0,0,1
3,48,4,138,214,0,1,108,1,1.5,4,1,1,0
4,54,3,150,195,0,1,122,0,0.0,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,1,132,0,1.2,1,1,0,1
914,68,4,144,193,1,1,141,0,3.4,4,1,0,1
915,57,4,130,131,0,1,115,1,1.2,4,1,0,1
916,57,2,130,236,0,3,174,0,0.0,2,1,1,0


In [None]:
# One-hot encode 'ST_Slope' into one indicator column per distinct value.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise emit True/False.
# NOTE(review): the previous cell already overwrites 'ST_Slope' with integer codes, so on a
# top-to-bottom run the new columns are named after those codes; the saved output
# (ST_Slope_Down/Flat/Up) appears to come from a freshly loaded frame - confirm which encoding is intended.
data = pd.get_dummies(data, columns=['ST_Slope'], dtype=int)


In [None]:
# Render the frame again to inspect the columns produced by the get_dummies call above.
data

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,M,ATA,140,289,0,Normal,172,N,0.0,0,0,0,1
1,49,F,NAP,160,180,0,Normal,156,N,1.0,1,0,1,0
2,37,M,ATA,130,283,0,ST,98,N,0.0,0,0,0,1
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,1,0,1,0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,1,0,1,0
914,68,M,ASY,144,193,1,Normal,141,N,3.4,1,0,1,0
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,1,0,1,0
916,57,F,ATA,130,236,0,LVH,174,N,0.0,1,0,1,0
