<table class="MsoTableGrid" border="1" cellspacing="0" cellpadding="0" width="100%" style="border-collapse:collapse;border:none;">
 <tbody>
  <tr>
   <td style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Project</b></td>
   <td style="border:solid windowtext 1.0pt;border-left:none;padding:5px;">Predictive Analysis for Heart Disease Related Fatalities</td>
   <td style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Developer</b></td>
   <td style="border:solid windowtext 1.0pt;border-left:none;padding:5px;">Patrick Ryan</td>
  </tr>
  <tr>
   <td style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Tools</b></td>
   <td style="border:solid windowtext 1.0pt;border-left:none;padding:5px;">Python 3.12</td>
   <td style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Version</b></td>
   <td style="border:solid windowtext 1.0pt;border-left:none;padding:5px;">[pre-release] v1</td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Description</b></td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;padding:5px;">The aim of this project is to develop a machine learning model that predicts the probability of mortality caused by heart failure using clinical parameters. This model will help identify patients at higher risk and aid healthcare professionals in making informed decisions.</td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Libraries and Modules</b></td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;padding:5px;">
    <ul>
     <li>Data Handling and Computation Libraries: pandas, numpy</li>
     <li>Machine Learning and Data Preprocessing Libraries: scikit-learn</li>
     <li>Visualization Libraries: matplotlib.pyplot, seaborn</li>
     <!-- Will add more libraries and modules here as needed -->
    </ul>
   </td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Functions</b></td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;padding:5px;">
    <ol>
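     <li>load_data: To load data into a DataFrame.</li>
     <li>identify_data_types: To split the DataFrame's columns into numerical and categorical lists.</li>
     <li>exploratory_data_analysis: To report the numerical features most correlated with the target, with their descriptive statistics and missing-value counts.</li>
     <li>main: To run data loading, data type identification, and EDA in sequence.</li>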
     <!-- Will add more functions here as needed -->
    </ol>
   </td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;background:#D9D9D9;padding:5px;"><b>Main Workflow</b></td>
  </tr>
  <tr>
   <td colspan="4" style="border:solid windowtext 1.0pt;padding:5px;">
    <ul>
     <li>Setup:</li>
     <li>Data Loading:</li>
     <li>EDA:</li>
     <li>Data Preprocessing:</li>
     <li>Model Building:</li>
     <li>Model Evaluation:</li>
     <!-- Will add more steps here as needed -->
    </ul>
   </td>
  </tr>
  <!-- Can add more sections here as needed -->
 </tbody>
</table>


In [1]:
# Basic data manipulation and numerical operations
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations

# Logging and utilities
import logging  # For logging information during execution

# Data visualization
import matplotlib.pyplot as plt  # For plotting graphs and charts
import seaborn as sns  # For making statistical graphics

In [2]:
def load_data(data):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    data : str, path-like or file-like
        Source handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    SystemExit
        With exit code 1 when the file is missing, empty, or cannot be
        parsed. The underlying pandas/OS exception is logged and attached
        as ``__cause__``.
    """
    try:
        return pd.read_csv(data)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        logging.error("Error: %s.", e)
        # Raise SystemExit explicitly instead of calling the interactive
        # `exit()` helper: that helper is meant for the interactive shell and,
        # under IPython/Jupyter, may not raise at all, which would let
        # execution continue with no DataFrame to return.
        raise SystemExit(1) from e

In [3]:
def identify_data_types(df, unique_value_threshold=10):
    """Split the columns of ``df`` into numerical and categorical groups.

    A column is reported as numerical only when its dtype is numeric AND it
    has more than ``unique_value_threshold`` distinct (non-null) values.
    Every other column is reported as categorical: object/string columns,
    other non-numeric dtypes such as ``category`` or datetime, and numeric
    columns with few distinct values (e.g. 0/1 flags).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.
    unique_value_threshold : int, optional
        Largest number of distinct values for which a numeric column is
        still treated as categorical. Defaults to 10.

    Returns
    -------
    tuple[list, list]
        ``(numerical_columns, categorical_columns)``, each in the original
        column order.
    """
    numerical_columns = []
    categorical_columns = []

    for column in df.columns:
        series = df[column]
        # Checking for a numeric dtype (rather than "not object") keeps
        # string, category and datetime columns out of the numerical list
        # even when they have many distinct values.
        is_numeric = pd.api.types.is_numeric_dtype(series.dtype)
        if is_numeric and series.nunique() > unique_value_threshold:
            numerical_columns.append(column)
        else:
            categorical_columns.append(column)

    return numerical_columns, categorical_columns

In [4]:
def exploratory_data_analysis(dataframe, numerical_columns, categorical_columns, target_column='HeartDisease', num_features=5):
    """Print a short EDA report for the features most correlated with the target.

    The report contains, in order:
      1. the names of the ``num_features`` numerical columns with the largest
         absolute correlation with ``target_column``;
      2. ``describe()`` statistics for those columns;
      3. the count and percentage of missing values in those columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to analyse; must contain ``target_column``.
    numerical_columns : list
        Column names considered as candidate features.
    categorical_columns : list
        Currently unused; accepted so existing callers keep working.
    target_column : str, optional
        Column the candidates are correlated against. Defaults to
        ``'HeartDisease'``.
    num_features : int, optional
        Maximum number of top features to report. Defaults to 5.

    Returns
    -------
    None
        All results are printed.
    """
    # The target correlates perfectly with itself, so leave it out of the
    # candidates; otherwise it would occupy a "top feature" slot.
    candidate_columns = [col for col in numerical_columns if col != target_column]

    # Feature Correlation Analysis: rank candidates by absolute correlation with the target
    if candidate_columns:
        correlation = dataframe[candidate_columns].corrwith(dataframe[target_column]).abs()
        top_features = correlation.sort_values(ascending=False).head(num_features).index.tolist()
    else:
        top_features = []
    print(f"Top {num_features} features correlated with {target_column}: {top_features}")

    # With nothing selected there is nothing to summarise; skip the remaining
    # sections instead of running them on a frame with no columns.
    if not top_features:
        print("\nNo numerical features available for further analysis.")
        return

    # Descriptive Statistics: summary statistics for the top correlated numerical features
    print("\nDescriptive Statistics for Top Numerical Columns:\n", dataframe[top_features].describe())

    # Missing Values Analysis: number and percentage of missing values in the top features
    missing_values = dataframe[top_features].isna().sum()
    missing_percentage = (missing_values / len(dataframe)) * 100
    missing_df = pd.DataFrame({'Number of Missing Values': missing_values, 'Percentage': missing_percentage})
    print("\nMissing Values Analysis for Top Features:\n", missing_df)

In [5]:
def main(data_path="heart.csv"):
    """Run the workflow end to end: load data, classify columns, run EDA.

    Parameters
    ----------
    data_path : str, optional
        CSV file passed to ``load_data``. Defaults to ``"heart.csv"``, so
        calling ``main()`` with no arguments behaves as before.
    """
    print("Loading the dataset...")
    df = load_data(data_path)

    print("Identifying data types...")
    numerical_cols, categorical_cols = identify_data_types(df)

    print("Performing EDA...")
    exploratory_data_analysis(df, numerical_cols, categorical_cols)

In [6]:
# Entry point: run the full workflow when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Loading the dataset...
Identifying data types...
Performing EDA...
Top 5 features correlated with HeartDisease: ['Oldpeak', 'MaxHR', 'Age', 'Cholesterol', 'RestingBP']

Descriptive Statistics for Top Numerical Columns:
           Oldpeak       MaxHR         Age  Cholesterol   RestingBP
count  918.000000  918.000000  918.000000   918.000000  918.000000
mean     0.887364  136.809368   53.510893   198.799564  132.396514
std      1.066570   25.460334    9.432617   109.384145   18.514154
min     -2.600000   60.000000   28.000000     0.000000    0.000000
25%      0.000000  120.000000   47.000000   173.250000  120.000000
50%      0.600000  138.000000   54.000000   223.000000  130.000000
75%      1.500000  156.000000   60.000000   267.000000  140.000000
max      6.200000  202.000000   77.000000   603.000000  200.000000

Missing Values Analysis for Top Features:
              Number of Missing Values  Percentage
Oldpeak                             0         0.0
MaxHR                            