# Data Download

In [3]:
# Download the dataset archive from Kaggle with the kaggle command-line tool.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d rashikrahmanpritom/heart-attack-analysis-prediction-dataset

In [4]:
!unzip heart-attack-analysis-prediction-dataset.zip

# Data Description
- Age : Age of the patient
- Sex : Sex of the patient
- exng : exercise induced angina (1 = yes; 0 = no)
- caa : number of major vessels (documented as 0-3; this file also contains the value 4)
- cp : chest pain type (the list below numbers the types 1-4; in this file the column is coded 0-3)
  - Value 1: typical angina
  - Value 2: atypical angina
  - Value 3: non-anginal pain
  - Value 4: asymptomatic
- trtbps : resting blood pressure (in mm Hg)
- chol : cholesterol in mg/dl fetched via BMI sensor
- fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg : resting electrocardiographic results
  - Value 0: normal
  - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
  - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- thalachh : maximum heart rate achieved
- output : 0 = less chance of heart attack, 1 = more chance of heart attack

# Data Analysis

In [7]:
import pandas as pd

In [8]:
# Load the heart dataset into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('Data/heart.csv')

In [9]:
# Preview the first rows to sanity-check the columns and value encodings.
data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [10]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [11]:
# Replace the dataset's abbreviated column names with spelled-out ones.
# Columns not listed here (age, sex, fbs, oldpeak, slp, output) keep their names.
new_column_names = dict(
    cp='chest_pain',
    trtbps='rest_bps',
    chol='cholestoral',
    restecg='rest_electrocardiographic',
    thalachh='max_heartrate',
    exng='angina',
    caa='major_vessels',
    thall='thallassemia',
)
data = data.rename(columns=new_column_names)

In [12]:
# Show the column names and dtypes after the rename.
data.dtypes

age                            int64
sex                            int64
chest_pain                     int64
rest_bps                       int64
cholestoral                    int64
fbs                            int64
rest_electrocardiographic      int64
max_heartrate                  int64
angina                         int64
oldpeak                      float64
slp                            int64
major_vessels                  int64
thallassemia                   int64
output                         int64
dtype: object

## Data Grouping

In [14]:
# Split the columns by cardinality: a column with at most 5 distinct values is
# treated as categorical, anything else as numerical. The distinct-value count
# of every column is printed along the way.
categorical_cols, numerical_cols = [], []
for col in data.columns:
    n_unique = data[col].nunique()
    bucket = categorical_cols if n_unique <= 5 else numerical_cols
    bucket.append(col)
    print(f'{col}: {n_unique}')

print('Categorical:', categorical_cols)
print('Numerical:', numerical_cols)

age: 41
sex: 2
chest_pain: 4
rest_bps: 49
cholestoral: 152
fbs: 2
rest_electrocardiographic: 3
max_heartrate: 91
angina: 2
oldpeak: 40
slp: 3
major_vessels: 5
thallassemia: 4
output: 2
Categorical: ['sex', 'chest_pain', 'fbs', 'rest_electrocardiographic', 'angina', 'slp', 'major_vessels', 'thallassemia', 'output']
Numerical: ['age', 'rest_bps', 'cholestoral', 'max_heartrate', 'oldpeak']


## Missing Value Check

In [16]:
# Report the number of null entries in each column under a heading line.
print('Missing Value')
print(data.isnull().sum())

Missing Value
age                          0
sex                          0
chest_pain                   0
rest_bps                     0
cholestoral                  0
fbs                          0
rest_electrocardiographic    0
max_heartrate                0
angina                       0
oldpeak                      0
slp                          0
major_vessels                0
thallassemia                 0
output                       0
dtype: int64


## Group Analysis

In [18]:
# Frequency table for every categorical column, followed by summary statistics
# for the numerical columns.
for col in categorical_cols:
    # Print the column name explicitly instead of relying on the value_counts()
    # repr to start with it: that only happens when the result's index carries
    # the column name (pandas >= 2.0). Dropping the index name keeps the name
    # from being printed twice on those versions.
    counts = data[col].value_counts().rename_axis(None)
    print(f'Unique Values in {col}\n{counts}\n')

print('#####' * 5, '\n')

print('Numerical Value')
print(data[numerical_cols].describe())

Unique Values in sex
1    207
0     96
Name: count, dtype: int64

Unique Values in chest_pain
0    143
2     87
1     50
3     23
Name: count, dtype: int64

Unique Values in fbs
0    258
1     45
Name: count, dtype: int64

Unique Values in rest_electrocardiographic
1    152
0    147
2      4
Name: count, dtype: int64

Unique Values in angina
0    204
1     99
Name: count, dtype: int64

Unique Values in slp
2    142
1    140
0     21
Name: count, dtype: int64

Unique Values in major_vessels
0    175
1     65
2     38
3     20
4      5
Name: count, dtype: int64

Unique Values in thallassemia
2    166
3    117
1     18
0      2
Name: count, dtype: int64

Unique Values in output
1    165
0    138
Name: count, dtype: int64

######################### 

Numerical Value
              age    rest_bps  cholestoral  max_heartrate     oldpeak
count  303.000000  303.000000   303.000000     303.000000  303.000000
mean    54.366337  131.623762   246.264026     149.646865    1.039604
std      9.082101

## 왜도와 첨도
- 왜도(Skewness)
  - 0: 좌우 대칭 분포 (예: 정규분포)
  - 양: 오른쪽으로 꼬리가 긴(왼쪽으로 치우친)
  - 음: 왼쪽으로 꼬리가 긴(오른쪽으로 치우친)

- 첨도(Kurtosis)
  - 0: 정규분포
  - 큰: 분포 뾰족
  - 작: 분포 평평

In [20]:
# Skewness and kurtosis of each numerical column, printed as two labelled
# sections separated by a banner line.
print('Skewness')
print(data[numerical_cols].skew())
print()

print('#####' * 5)
print()

print('Kurtosis')
print(data[numerical_cols].kurt())

Skewness
age             -0.202463
rest_bps         0.713768
cholestoral      1.143401
max_heartrate   -0.537410
oldpeak          1.269720
dtype: float64

#########################

Kurtosis
age             -0.542167
rest_bps         0.929054
cholestoral      4.505423
max_heartrate   -0.061970
oldpeak          1.575813
dtype: float64


## Correlation Analysis

In [22]:
# Pairwise correlation matrices of the numerical columns, computed with the
# 'pearson' and then the 'spearman' method, separated by a banner line.
print('Pearson correlation')
print(data[numerical_cols].corr(method='pearson'))
print()

print('#####' * 5)
print()

print('Spearman correlation')
print(data[numerical_cols].corr(method='spearman'))

Pearson correlation
                    age  rest_bps  cholestoral  max_heartrate   oldpeak
age            1.000000  0.279351     0.213678      -0.398522  0.210013
rest_bps       0.279351  1.000000     0.123174      -0.046698  0.193216
cholestoral    0.213678  0.123174     1.000000      -0.009940  0.053952
max_heartrate -0.398522 -0.046698    -0.009940       1.000000 -0.344187
oldpeak        0.210013  0.193216     0.053952      -0.344187  1.000000

#########################

Spearman correlation
                    age  rest_bps  cholestoral  max_heartrate   oldpeak
age            1.000000  0.285617     0.195786      -0.398052  0.268291
rest_bps       0.285617  1.000000     0.126562      -0.040407  0.154267
cholestoral    0.195786  0.126562     1.000000      -0.046766  0.045260
max_heartrate -0.398052 -0.040407    -0.046766       1.000000 -0.433241
oldpeak        0.268291  0.154267     0.045260      -0.433241  1.000000
