In [17]:
import os
import warnings
from getpass import getpass
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine
warnings.filterwarnings("ignore")  

#  Data Cleaning & Preprocessing in Python

In [2]:
# Load the raw heart-disease dataset from its local CSV file
raw_csv_path = r"C:\python project\Heart patient\heart_disease.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first three rows of the loaded DataFrame
df.head(3)

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No


In [4]:
# Count the missing values in each column.
# NOTE(review): 'Alcohol Consumption' reports far more missing values than any
# other column. If the raw CSV stores the literal text "None" for that column,
# read_csv would treat it as missing by default -- confirm against the raw file.
df.isnull().sum()

Age                       29
Gender                    19
Blood Pressure            19
Cholesterol Level         30
Exercise Habits           25
Smoking                   25
Family Heart Disease      21
Diabetes                  30
BMI                       22
High Blood Pressure       26
Low HDL Cholesterol       25
High LDL Cholesterol      26
Alcohol Consumption     2586
Stress Level              22
Sleep Hours               25
Sugar Consumption         30
Triglyceride Level        26
Fasting Blood Sugar       22
CRP Level                 26
Homocysteine Level        20
Heart Disease Status       0
dtype: int64

In [5]:
# Impute every numeric column with that column's own mean
num_cols = df.select_dtypes(include=['number']).columns
numeric_means = df[num_cols].mean()
df[num_cols] = df[num_cols].fillna(numeric_means)

# Text (object-dtype) columns get the explicit placeholder "Not mentioned"
cat_cols = df.select_dtypes(include=['object']).columns
filled_text_columns = df[cat_cols].fillna("Not mentioned")
df[cat_cols] = filled_text_columns



In [6]:
# Re-count missing values per column after the fill step above
df.isnull().sum()

Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Exercise Habits         0
Smoking                 0
Family Heart Disease    0
Diabetes                0
BMI                     0
High Blood Pressure     0
Low HDL Cholesterol     0
High LDL Cholesterol    0
Alcohol Consumption     0
Stress Level            0
Sleep Hours             0
Sugar Consumption       0
Triglyceride Level      0
Fasting Blood Sugar     0
CRP Level               0
Homocysteine Level      0
Heart Disease Status    0
dtype: int64

In [7]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum() 

0

In [8]:
# Generate basic summary statistics for all numeric columns.
# This runs after the missing-value fill, so the mean-filled entries are
# included in every statistic shown here.
df.describe()


Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,49.296259,149.75774,225.425577,29.077269,6.991329,250.734409,120.142213,7.472201,12.456271
std,18.167567,17.556265,43.51039,6.300156,1.751002,86.953954,23.558052,4.334601,4.3191
min,18.0,120.0,150.0,18.002837,4.000605,100.0,80.0,0.003647,5.000236
25%,34.0,134.0,187.0,23.668887,5.455288,176.0,99.0,3.6818,8.729771
50%,49.0,150.0,225.425577,29.077269,6.996016,250.734409,120.0,7.472201,12.421274
75%,65.0,165.0,263.0,34.509009,8.527938,326.0,141.0,11.244879,16.130968
max,80.0,180.0,300.0,39.996954,9.999952,400.0,160.0,14.997087,19.999037


In [9]:
# Create age groups.
# right=False makes every bin closed on the left and open on the right:
# [0, 30), [30, 40), [40, 50), [50, 60), [60, 100).
# With pd.cut's default (right-closed bins) age 30 was labelled '<30' and
# age 60 was labelled '50-60', which contradicts the labels below.
# An age of exactly 100 would fall outside the last bin and become NaN.
bins = [0, 30, 40, 50, 60, 100]
labels = ['<30', '30-40', '40-50', '50-60', '60+']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)


In [10]:
# Categorizing BMI values
def categorize_bmi(bmi):
    """Map a BMI value to a weight-status label.

    The ranges are contiguous and half-open, so every number falls into
    exactly one category:

    * ``bmi < 18.5``        -> ``'Underweight'``
    * ``18.5 <= bmi < 25``  -> ``'Normal'``
    * ``25 <= bmi < 30``    -> ``'Overweight'``
    * ``bmi >= 30``         -> ``'Obese'``

    The previous cut-offs (24.9 and 29.9) left gaps: a value such as 24.95
    matched no range and fell through to ``'Obese'``.

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str
        One of ``'Underweight'``, ``'Normal'``, ``'Overweight'``, ``'Obese'``.

    Notes
    -----
    NaN compares as False against every threshold and therefore still ends
    up as ``'Obese'``; fill or drop missing BMI values before calling this.
    """
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

# Label each row's BMI value with its weight-status category
df['BMI_Category'] = df['BMI'].map(categorize_bmi)


# Basic Data Exploration

In [11]:
# List all column names, including the derived Age_Group and BMI_Category
df.columns

Index(['Age', 'Gender', 'Blood Pressure', 'Cholesterol Level',
       'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'BMI',
       'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol',
       'Alcohol Consumption', 'Stress Level', 'Sleep Hours',
       'Sugar Consumption', 'Triglyceride Level', 'Fasting Blood Sugar',
       'CRP Level', 'Homocysteine Level', 'Heart Disease Status', 'Age_Group',
       'BMI_Category'],
      dtype='object')

In [12]:
# Mean Age and BMI for each heart-disease outcome
avg_stats_heart = (
    df.groupby('Heart Disease Status')[['Age', 'BMI']]
    .mean()
)
print(avg_stats_heart)


                            Age        BMI
Heart Disease Status                      
No                    49.380176  29.015271
Yes                   48.960593  29.325261


In [13]:
# Mean Age and BMI for each 'High Blood Pressure' value
# (the "Not mentioned" placeholder forms its own group)
by_blood_pressure = df.groupby('High Blood Pressure')
avg_stats_blood = by_blood_pressure[['Age', 'BMI']].mean()
print(avg_stats_blood)


                           Age        BMI
High Blood Pressure                      
No                   49.303768  29.014988
Not mentioned        50.961538  30.343758
Yes                  49.280234  29.132125


In [14]:
# Row counts for every (Smoking, Heart Disease Status) combination
smoking_group_keys = ['Smoking', 'Heart Disease Status']
smoking_heart_disease = df.groupby(smoking_group_keys).size()
print(smoking_heart_disease)


Smoking        Heart Disease Status
No             No                      3887
               Yes                      965
Not mentioned  No                        19
               Yes                        6
Yes            No                      4094
               Yes                     1029
dtype: int64


**Smokers have a slightly higher rate of heart disease (1029 of 5123, about 20.1%) than non-smokers (965 of 4852, about 19.9%).**

# Export the dataset to MySQL

In [15]:
# Export the cleaned data to a MySQL database.
# Connection settings are read from environment variables so that no password
# is stored in the notebook. The non-secret settings fall back to the values
# this notebook used before; a missing password is requested interactively.
Database = os.environ.get("MYSQL_DATABASE", "Heart_disease")
username = os.environ.get("MYSQL_USER", "root")
host = os.environ.get("MYSQL_HOST", "127.0.0.1")
password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")

# Create a connection to the database
con = None
try:
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot break the connection URL.
    con = create_engine(
        f"mysql+mysqlconnector://{quote(username, safe='')}:{quote(password, safe='')}"
        f"@{host}/{Database}"
    )
    # create_engine() does not contact the server by itself; open (and close)
    # a connection so the success message reflects a real connection.
    with con.connect():
        pass
    print("Connection successful")
except Exception as e:
    con = None
    print("Facing issue while connecting to database Check Manually", e)

# Load the data into the database, but only when the connection step worked.
# if_exists='replace' overwrites any existing 'heart_disease' table.
if con is not None:
    try:
        df.to_sql('heart_disease', con, index=False, if_exists='replace')
        print("Data written to database successfully")
    except Exception as e:
        print("Facing issue while writing to database Check Manually", e)

In [16]:
# Persist the cleaned DataFrame as a new CSV file, without the row index
cleaned_csv_path = r"C:\python project\Heart patient\heart_disease_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)