## DATA OVERVIEW

In [None]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('../data/raw/heart_disease_dataset.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1000 non-null   int64 
 1   Gender                   1000 non-null   object
 2   Cholesterol              1000 non-null   int64 
 3   Blood Pressure           1000 non-null   int64 
 4   Heart Rate               1000 non-null   int64 
 5   Smoking                  1000 non-null   object
 6   Alcohol Intake           660 non-null    object
 7   Exercise Hours           1000 non-null   int64 
 8   Family History           1000 non-null   object
 9   Diabetes                 1000 non-null   object
 10  Obesity                  1000 non-null   object
 11  Stress Level             1000 non-null   int64 
 12  Blood Sugar              1000 non-null   int64 
 13  Exercise Induced Angina  1000 non-null   object
 14  Chest Pain Type          1000 non-null   

- `Alcohol Intake` is a categorical variable with two levels: `Heavy` and `Moderate` and missing values as well.

----

| Column Name                 | Data Type | Description                                                          |
| --------------------------- | --------- | -------------------------------------------------------------------- |
| **Age**                     | int64     | Age of the individual in years                                       |
| **Gender**                  | object    | Gender of the individual (e.g., Male, Female)                        |
| **Cholesterol**             | int64     | Total cholesterol level (mg/dL)                                      |
| **Blood Pressure**          | int64     | Resting blood pressure (mm Hg)                                       |
| **Heart Rate**              | int64     | Resting heart rate (beats per minute)                                |
| **Smoking**                 | object    | Indicates whether the individual is a smoker (Yes/No)                |
| **Alcohol Intake**          | object    | Frequency or level of alcohol consumption (missing values present)   |
| **Exercise Hours**          | int64     | Average exercise duration per week (hours)                           |
| **Family History**          | object    | Family history of heart disease (Yes/No)                             |
| **Diabetes**                | object    | Indicates if the individual has diabetes (Yes/No)                    |
| **Obesity**                 | object    | Indicates obesity status (Yes/No)                                    |
| **Stress Level**            | int64     | Stress level on a numerical scale (higher = more stress)             |
| **Blood Sugar**             | int64     | Blood sugar level (mg/dL)                                            |
| **Exercise Induced Angina** | object    | Chest pain caused by physical exertion (Yes/No)                      |
| **Chest Pain Type**         | object    | Type of chest pain experienced (categorical medical classification)  |
| **Heart Disease**           | int64     | **Target variable**: 1 = Heart disease present, 0 = No heart disease |


----

In [5]:
data.head()

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,,6,No,Yes,No,2,183,Yes,Asymptomatic,0


In [6]:
data.describe()

Unnamed: 0,Age,Cholesterol,Blood Pressure,Heart Rate,Exercise Hours,Stress Level,Blood Sugar,Heart Disease
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,52.293,249.939,135.281,79.204,4.529,5.646,134.941,0.392
std,15.727126,57.914673,26.3883,11.486092,2.934241,2.831024,36.699624,0.488441
min,25.0,150.0,90.0,60.0,0.0,1.0,70.0,0.0
25%,39.0,200.0,112.75,70.0,2.0,3.0,104.0,0.0
50%,52.0,248.0,136.0,79.0,4.5,6.0,135.0,0.0
75%,66.0,299.0,159.0,89.0,7.0,8.0,167.0,1.0
max,79.0,349.0,179.0,99.0,9.0,10.0,199.0,1.0


In [14]:
data.describe(include = 'object')

Unnamed: 0,Gender,Smoking,Alcohol Intake,Family History,Diabetes,Obesity,Exercise Induced Angina,Chest Pain Type
count,1000,1000,660,1000,1000,1000,1000,1000
unique,2,3,2,2,2,2,2,4
top,Female,Never,Heavy,No,Yes,No,No,Non-anginal Pain
freq,503,338,346,501,505,501,528,256


In [8]:
data['Gender'].value_counts()

Gender
Female    503
Male      497
Name: count, dtype: int64

In [9]:
data['Smoking'].value_counts()

Smoking
Never      338
Current    336
Former     326
Name: count, dtype: int64

In [10]:
data['Alcohol Intake'].value_counts()

Alcohol Intake
Heavy       346
Moderate    314
Name: count, dtype: int64

In [11]:
data['Family History'].value_counts()

Family History
No     501
Yes    499
Name: count, dtype: int64

In [12]:
data['Diabetes'].value_counts()

Diabetes
Yes    505
No     495
Name: count, dtype: int64

In [13]:
data['Obesity'].value_counts()

Obesity
No     501
Yes    499
Name: count, dtype: int64