In [4]:
import pandas as pd

# Load the dataset
file_path = '/content/healthcare_data.csv'
df = pd.read_csv(file_path)

# Display basic information about the dataset
print("Initial Data Overview:")
print(df.info())
print("\nFirst 5 rows:")
print(df.head())

# Data Cleaning Steps
# 1. Handling Missing Values
df.fillna(method='ffill', inplace=True)  # Forward fill for continuous data

# 2. Removing Duplicates
df.drop_duplicates(inplace=True)

# 3. Standardizing Column Names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# 4. Converting Data Types (example for date columns)
if 'date_of_birth' in df.columns:
    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce')

# 5. Removing Outliers (example for age column)
if 'age' in df.columns:
    df = df[(df['age'] >= 0) & (df['age'] <= 120)]

# 6. Encoding Categorical Variables (example for gender)
if 'gender' in df.columns:
    df['gender'] = df['gender'].map({'Male': 1, 'Female': 0}).fillna(-1)

# Display cleaned data overview
print("\nCleaned Data Overview:")
print(df.info())
print("\nFirst 5 cleaned rows:")
print(df.head())

# Save the cleaned dataset
cleaned_file_path = '/content/healthcare_data.csv'
df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned data saved to {cleaned_file_path}")


Initial Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PatientID      20 non-null     int64  
 1   Age            20 non-null     int64  
 2   BloodPressure  20 non-null     int64  
 3   SugarLevel     20 non-null     float64
 4   Weight         20 non-null     float64
dtypes: float64(2), int64(3)
memory usage: 932.0 bytes
None

First 5 rows:
   PatientID  Age  BloodPressure  SugarLevel      Weight
0          1   44            118   87.892495  105.568034
1          2   39            109  177.321803  105.703426
2          3   49            149  144.148273   77.787070
3          4   58            121   90.355404  115.244784
4          5   35            109  126.421800   70.383790

Cleaned Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column         Non-

  df.fillna(method='ffill', inplace=True)  # Forward fill for continuous data
