In [1]:
#Assignment 2: Cleaning and Preparing Healthcare Data for Analysis
#Objective:
#To clean a real-world healthcare dataset by handling inconsistencies, duplicates, and missing values.
#Instructions:

#Load the Dataset:
   #Read the healthcare dataset into a Pandas DataFrame.

#Handle Missing Data:
   #Identify missing values in patient demographics (age, gender, blood pressure, etc.).
   #Apply appropriate imputation methods.

#Detect and Handle Duplicates:
   #Identify duplicate records using duplicated().
   #Remove or merge duplicates as necessary.

#Detect and Handle Outliers:
   #Use boxplots to identify extreme values.
   #Apply transformations or capping techniques to handle outliers.

#Standardize and Normalize Data:
   #Convert categorical variables into numerical representations.
   #Scale numerical variables using Min-Max Scaling or Standard Scaling.

#Data Validation:
   #Ensure no missing values or duplicates remain.
   #Check data types and correct inconsistencies.

#Final Data Export:
  #Save the cleaned dataset as a CSV file for further analysis.


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler


file_path = r'C:\ML\Day_15_Healthcare_data.csv'
df = pd.read_csv(file_path)

# 2. Handle Missing Data
if 'age' in df.columns:
    df['age'].fillna(df['age'].mean(), inplace=True)

if 'gender' in df.columns:
    df['gender'].fillna(df['gender'].mode()[0], inplace=True)

if 'blood_pressure' in df.columns:
    df['blood_pressure'].fillna(df['blood_pressure'].mean(), inplace=True)

# 3. Detect and Handle Duplicates
duplicates = df.duplicated()
print(f"Number of duplicates: {duplicates.sum()}")
df.drop_duplicates(inplace=True)

# 4. Detect and Handle Outliers
if 'age' in df.columns:
    plt.boxplot(df['age'])
    plt.show()
    upper_limit = df['age'].quantile(0.95)
    lower_limit = df['age'].quantile(0.05)
    df['age'] = df['age'].apply(lambda x: upper_limit if x > upper_limit else lower_limit if x < lower_limit else x)

# 5. Standardize and Normalize Data
if 'gender' in df.columns:
    df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})
if 'age' in df.columns and 'blood_pressure' in df.columns:
    # Min-Max Scaling
    scaler = MinMaxScaler()
    df[['age', 'blood_pressure']] = scaler.fit_transform(df[['age', 'blood_pressure']])

# 6. Data Validation
print(df.isnull().sum())
print(df.duplicated().sum())
print(df.dtypes)

if 'age' in df.columns:
    df['age'] = df['age'].astype(float)
if 'gender' in df.columns:
    df['gender'] = df['gender'].astype(int)

# 7. Final Data Export
df.to_csv('cleaned_healthcare_data.csv', index=False)


Number of duplicates: 5
Patient_ID         0
Age                0
Gender            21
Blood_Pressure    30
Cholesterol       20
Diabetes           0
Heart_Disease      0
dtype: int64
0
Patient_ID          int64
Age                 int64
Gender             object
Blood_Pressure    float64
Cholesterol       float64
Diabetes           object
Heart_Disease      object
dtype: object
