# Exploratory Data Analysis 

<h1>#IT18257328</h1>

<b>Import Libraries</b>

In [6]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [7]:
import warnings
warnings.filterwarnings('ignore')

<b>Import CSV file</b>

In [8]:
# Read heart.csv (path is relative to the notebook's working directory) into a DataFrame.
heartData = pd.read_csv(filepath_or_buffer='heart.csv')

In [9]:
heartData # Display the loaded dataset; the rendered table below is abbreviated, with "..." standing in for the middle rows

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


<b>Show column names (data types are shown further below)</b>

In [10]:
# List the column labels as a plain Python list.
list(heartData.columns)

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

## Information on the attributes of the heart disease data set

> 1. age 
> 2. sex (1= male; 0=female)
> 3. chest pain type (4 values) 
> 4. resting blood pressure 
> 5. serum cholesterol in mg/dl 
> 6. fasting blood sugar > 120 mg/dl
> 7. resting electrocardiographic results (values 0,1,2)
> 8. maximum heart rate achieved 
> 9. exercise induced angina 
> 10. oldpeak = ST depression induced by exercise relative to rest 
> 11. the slope of the peak exercise ST segment 
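> 12. number of major vessels (0-3) colored by fluoroscopy (note: in this CSV the column ranges from 0 to 4)
> 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect (thallium heart scan or stress test) (note: in this CSV the column is coded 0-3 rather than 3/6/7)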
> 14. target (0 = no heart disease; 1 = heart disease)

## Rename columns in CSV file

In [11]:
# Swap the terse source abbreviations for readable column names.
# Columns not listed in the mapping keep their original names.
heartData = heartData.rename(
    columns={
        "cp": "chestPain",            # cp = chest pain
        "trestbps": "bloodPressure",  # trestbps = blood pressure level
        "fbs": "bloodSugar",          # fbs = blood sugar level
        "ca": "Vessels",              # ca = vessels
        "chol": "Cholesterol",        # chol = cholesterol level
    }
)

In [12]:
# Add a "health_status" column that starts out as a copy of the numeric "target" column.
heartData = heartData.assign(health_status=heartData['target'])

In [13]:
# Turn the numeric outcome into a text label: 0 -> "healthy", any other value -> "sick".
# The labels are derived from the untouched 'target' column rather than from
# 'health_status' itself, which keeps this cell idempotent: previously a second
# run compared the already-written strings with 0 and relabelled every row "sick".
heartData['health_status'] = heartData['target'].eq(0).map({True: "healthy", False: "sick"})

In [14]:
# Add a "gender" column that starts out as a copy of the numeric "sex" column.
heartData = heartData.assign(gender=heartData['sex'])

In [15]:
# Turn the numeric sex code into a letter: 0 -> 'F', any other value -> 'M'.
# The letters are derived from the untouched 'sex' column rather than from
# 'gender' itself, which keeps this cell idempotent: previously a second run
# compared the already-written letters with 0 and relabelled every row 'M'.
heartData['gender'] = heartData['sex'].eq(0).map({True: 'F', False: 'M'})

In [16]:
heartData.head(n=5)  # preview the first five rows, including the two new label columns

Unnamed: 0,age,sex,chestPain,bloodPressure,Cholesterol,bloodSugar,restecg,thalach,exang,oldpeak,slope,Vessels,thal,target,health_status,gender
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,sick,M
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,sick,M
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,sick,F
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,sick,M
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,sick,F


In [17]:
heartData.tail(n=5)  # preview the last five rows

Unnamed: 0,age,sex,chestPain,bloodPressure,Cholesterol,bloodSugar,restecg,thalach,exang,oldpeak,slope,Vessels,thal,target,health_status,gender
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,healthy,F
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,healthy,M
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,healthy,M
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,healthy,M
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0,healthy,F


In [18]:
# (number of rows, number of columns) of the frame, including the added health_status and gender columns
heartData.shape

(303, 16)

In [19]:
heartData.describe() # Summary statistics (count, mean, std, min, quartiles, max); the output covers the numeric columns only, not health_status/gender

Unnamed: 0,age,sex,chestPain,bloodPressure,Cholesterol,bloodSugar,restecg,thalach,exang,oldpeak,slope,Vessels,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [20]:
heartData.dtypes # Show the data type of each column (the two added label columns are object, the rest numeric)

age                int64
sex                int64
chestPain          int64
bloodPressure      int64
Cholesterol        int64
bloodSugar         int64
restecg            int64
thalach            int64
exang              int64
oldpeak          float64
slope              int64
Vessels            int64
thal               int64
target             int64
health_status     object
gender            object
dtype: object

**Find & show duplicate values**

In [21]:
# Show every row that has an exact copy elsewhere in the frame.
# keep=False flags all members of a duplicate group, not only the later repeats.
heartData.loc[heartData.duplicated(keep=False)]

Unnamed: 0,age,sex,chestPain,bloodPressure,Cholesterol,bloodSugar,restecg,thalach,exang,oldpeak,slope,Vessels,thal,target,health_status,gender
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1,sick,M
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1,sick,M


In [22]:
# Remove repeated rows; the default keep='first' retains the first occurrence of each duplicate group.
heartData = heartData.drop_duplicates()

In [23]:
# Count how many people fall into each health status after de-duplication.
heartData.health_status.value_counts()

sick       164
healthy    138
Name: health_status, dtype: int64