# Exploratory Data Analysis - (EDA)

In [None]:
# Preview both ends of the raw dataframe: each caption is printed and then
# followed by the matching slice, with a blank gap between the two previews.
previews = (
    ("First 5 Rows of Data:\n", heart_attack_raw.head()),
    ("Last 5 Rows of Data:", heart_attack_raw.tail()),
)
for position, (caption, frame_slice) in enumerate(previews):
    if position:
        print("\n\n")
    print(caption)
    display(frame_slice)

In [None]:
# Dimensions of the raw dataframe as a (rows, columns) tuple
heart_attack_raw.shape

In [None]:
# Row index (row labels) of the raw dataframe
heart_attack_raw.index

In [None]:
# Print a summary of the raw dataframe: columns, non-null counts and dtypes
heart_attack_raw.info()

In [None]:
# Descriptive statistics of the raw dataframe (pandas' describe() summarises
# the numeric columns by default when any are present)
heart_attack_raw.describe()

In [None]:
# List every column label of the raw dataframe as a plain Python list.
column_names = heart_attack_raw.columns.tolist()
print(column_names)

In [None]:
# Requesting zero rows displays only the column headers
heart_attack_raw.head(0)

In [None]:
# Print the frequency table of each column, one column at a time.
for column_name in heart_attack_raw.columns:
    frequency_table = heart_attack_raw[column_name].value_counts()
    print(frequency_table)

**Checking for Duplicates and Null Values**

In [None]:
# Run the notebook's dq_checks helper on the raw dataframe and print what it returns.
dq_report = dq_checks(heart_attack_raw)
print(dq_report)

Duplicates

In [None]:
# Flag each row as duplicated or not, then tally the True / False flags.
is_duplicate_row = heart_attack_raw.duplicated()
is_duplicate_row.value_counts()

In [None]:
# Number of rows flagged as duplicates of an earlier row.
duplicate_row_total = heart_attack_raw.duplicated().sum()
print(f"Amount of Duplicate Rows in the Dataset: {duplicate_row_total}")

In [None]:
# Duplicated rows as a percentage of all rows, rounded to two decimals.
duplicate_row_count = heart_attack_raw.duplicated().sum()
duplicate_row_share = (duplicate_row_count / len(heart_attack_raw) * 100).round(2)
print(f"Duplicate Percentage of the Dataset: {duplicate_row_share}%")

Null Values

In [None]:
# Sum the per-column null counts to get the number of null cells overall.
null_cell_total = heart_attack_raw.isna().sum().sum()
print(f"Total Amount of Null Cells in the Dataset: {null_cell_total}")

In [None]:
# Null count of every column, largest first, printed under a bold heading.
nulls_per_column = heart_attack_raw.isna().sum().sort_values(ascending=False)
print(f"\033[1mThe Amount of Null Cells in each Column:\033[0m\n\n{nulls_per_column}")

In [None]:
print("\033[1mHeart Attack Raw - Null Percentage & Row Significance\033[0m\n")

# For every column, report its share of null cells (in percent, two decimals)
# and label it as significant when that share is above 5.
total_rows = len(heart_attack_raw)
for column_name in heart_attack_raw.columns:
    null_share = ((heart_attack_raw[column_name].isna().sum() / total_rows) * 100).round(2)
    verdict = "Significant \n" if null_share > 5 else "Not Significant\n"
    print(f"{column_name}: {null_share}% - {verdict}")

**Analysis into each Column**

Copying the Original Dataframe

In [None]:
print("\033[1mCopying the Original Dataframe:\033[0m\n")

# Work on a copy so later changes made to heart_attack_raw_copy_plot do not
# modify heart_attack_raw.
try:
    heart_attack_raw_copy_plot = heart_attack_raw.copy()
except Exception as error:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and report what actually went wrong.
    print(f"ERROR: The data has NOT been copied. ({type(error).__name__}: {error})")
else:
    print("Data copied successfully.")

In [None]:
# Checking the copied data is loaded correctly by displaying its first rows
# (head() returns the first 5 rows by default)
heart_attack_raw_copy_plot.head()

In [None]:
# Print a summary of the copied dataframe: columns, non-null counts and dtypes
heart_attack_raw_copy_plot.info()

In [None]:
# Labels of the columns whose dtype is float.
float_column_labels = heart_attack_raw_copy_plot.select_dtypes(include=[float]).columns
print(f"Columns in the dataset that contain floats:\n\n {float_column_labels}")

Changing the data type to plot the frequencies of each column to check for any bias

In [None]:
# Cast the listed columns to object dtype; all other columns are left as they are.
columns_to_object = [
    "PhysicalHealthDays",
    "MentalHealthDays",
    "SleepHours",
    "HeightInMeters",
    "WeightInKilograms",
    "BMI",
]
heart_attack_raw_copy_plot = heart_attack_raw_copy_plot.astype(
    dict.fromkeys(columns_to_object, object)
)

Checking the dataframe types have been changed

In [None]:
# Print the dataframe summary again so the per-column dtypes can be inspected
heart_attack_raw_copy_plot.info()

In [None]:
# Display only the first row of the copied dataframe
heart_attack_raw_copy_plot.head(1)

In [None]:
# Number of distinct values in each column, shown from most to fewest.
unique_value_counts = heart_attack_raw_copy_plot.nunique()
unique_value_counts.sort_values(ascending=False)

Plotting

In [None]:
print("Histogram Column Analysis")

# Setting the subplot layouts: one figure holding a 10 x 4 grid of axes
plt.subplots(10, 4, figsize=(20, 20))

# Plotting one histogram per column; subplot positions are 1-based
for i, column in enumerate(heart_attack_raw_copy_plot.columns, 1):
    plt.subplot(10, 4, i)
    # Nulls are dropped before plotting, as the boxplot cell does.
    # NOTE(review): object-dtype columns that mix strings with NaN can make
    # matplotlib's categorical axis handling raise a TypeError - confirm
    # against the actual data.
    plt.hist(heart_attack_raw_copy_plot[column].dropna(), bins=30, color='blue', edgecolor="k")
    plt.title(column)
    plt.ylabel("Frequency")
plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=5.0)
# Render the figure explicitly, consistent with the other plotting cells
plt.show()

Changing the converted columns' data types back to their originals

In [None]:
# Cast the listed columns to float dtype; all other columns are left as they are.
columns_to_float = [
    "PhysicalHealthDays",
    "MentalHealthDays",
    "SleepHours",
    "HeightInMeters",
    "WeightInKilograms",
    "BMI",
]
heart_attack_raw_copy_plot = heart_attack_raw_copy_plot.astype(
    dict.fromkeys(columns_to_float, float)
)

In [None]:
print("Boxplot Numerical Column Analysis")

# One figure holding a 3 x 2 grid of axes
plt.subplots(3, 2, figsize=(20, 10))

# Labels of the numeric-dtype columns only
numeric_columns = heart_attack_raw_copy_plot.select_dtypes(include=['number']).columns

# Draw one boxplot per numeric column, with nulls removed first;
# subplot positions are 1-based
for position, numeric_column in enumerate(numeric_columns, start=1):
    non_null_values = heart_attack_raw_copy_plot[numeric_column].dropna()
    plt.subplot(3, 2, position)
    plt.boxplot(non_null_values)
    plt.title(numeric_column)
    plt.ylabel("Value")

plt.tight_layout(pad=0.5, w_pad=0.5, h_pad=5.0)
plt.show()

In [None]:
# Heatmap of the boolean null mask: one cell per (row, column) of the dataframe
sns.heatmap(heart_attack_raw_copy_plot.isnull(), cbar=False)
plt.title("Amount of Null Values (Black Square = 0 Null Values)")
plt.ylabel("Row Number")
plt.xlabel("Columns")
# plt.show must be called; a bare `plt.show` only references the function
plt.show()

**Dealing with Duplicates and Null Values**

Duplicates:

The dataframe analysis above found 157 duplicate rows.

This is 0.04% of the total dataset, which is an insignificant amount. 

I will proceed to drop the duplicate rows, as doing so will NOT affect the overall quality of the data.

In [None]:
# Remove fully duplicated rows (drop_duplicates keeps the first occurrence by
# default) and store the result as the cleaned dataframe
heart_attack_clean = heart_attack_raw_copy_plot.drop_duplicates()

In [None]:
# Print a summary of the cleaned dataframe: columns, non-null counts and dtypes
heart_attack_clean.info()

In [None]:
# Count the rows of the cleaned dataframe that are still flagged as duplicates
heart_attack_clean.duplicated().sum()

Null Values:

There were 902665 null cells.

The share of null values per column ranges from under 5% up to 18.54%.

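This is approximately 5.07% of all cells in the dataset (902665 of 445132 × 40 = 17805280 cells), which is just above the 5% threshold used above to flag a column as significant.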

I will proceed to drop the rows that contain null values. Because the dataset is so large (445132 rows and 40 columns), dropping these rows will still leave a substantial dataset that can be used further.

In [None]:
# Build the cleaned dataframe: remove duplicate rows first, then every row
# that still contains at least one null. DataFrame.dropna() takes no `keep`
# argument (passing one raises TypeError); its defaults (axis=0, how="any")
# already drop any row holding a null. Chaining from the copied raw frame
# keeps the duplicate removal in effect and makes the cell safe to re-run.
heart_attack_clean = heart_attack_raw_copy_plot.drop_duplicates().dropna()

In [None]:
# Print a summary of the cleaned dataframe: columns, non-null counts and dtypes
heart_attack_clean.info()

In [None]:
# Null count per column of the cleaned dataframe
heart_attack_clean.isna().sum()

In [None]:
# Heatmap of the boolean null mask of the cleaned dataframe
sns.heatmap(heart_attack_clean.isnull(), cbar=False)
# Title corrected from "Null Volumes": the plot shows null values
plt.title("Amount of Null Values (Black Square = 0 Null Values)")
plt.ylabel("Row Number")
plt.xlabel("Columns")
# plt.show must be called; a bare `plt.show` only references the function
plt.show()

---

**Saving the Clean Data**

In [None]:
print("\033[1mSaving the Cleaned Dataframe:\033[0m\n")

# Write the cleaned dataframe to heart_attack_clean.csv in the working directory.
try:
    heart_attack_clean.to_csv('heart_attack_clean.csv')
except Exception as error:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and report why the save failed.
    print(f"ERROR: The data has NOT been saved. ({type(error).__name__}: {error})")
else:
    print("Data saved successfully.")

---

**Correlation Matrix**

Raw Data Correlation Matrix

In [None]:
# Keep only the numeric-dtype columns of the copied raw dataframe and
# compute their pairwise correlation matrix
numeric_columns = heart_attack_raw_copy_plot.select_dtypes(include=['number'])
corr_matrix = numeric_columns.corr()

# Draw the matrix as an annotated heatmap (values shown to two decimals)
heatmap_options = {"annot": True, "cmap": 'coolwarm', "fmt": '.2f'}
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Raw Numeric Columns Only, Correlation Matrix Heatmap')
plt.show()

Clean Data Correlation Matrix

In [None]:
# Keep only the numeric-dtype columns of the cleaned dataframe and
# compute their pairwise correlation matrix
numeric_columns = heart_attack_clean.select_dtypes(include=['number'])
corr_matrix = numeric_columns.corr()

# Draw the matrix as an annotated heatmap (values shown to two decimals)
heatmap_options = {"annot": True, "cmap": 'coolwarm', "fmt": '.2f'}
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Clean Numeric Columns Only, Correlation Matrix Heatmap')
plt.show()

**Numeric Only Correlation Matrix - Comments**

Within the numerical columns only, the heatmap shows there is a correlation between:

- BMI and WeightInKilograms
- WeightInKilograms and HeightInMeters
- MentalHealthDays and PhysicalHealthDays

---