In [15]:
import pandas as pd

# Upload file
# NOTE(review): `files` is not imported in this cell; presumably it is
# google.colab's `files` module brought in by an earlier cell — confirm.
uploaded = files.upload()

# Fail with an explicit message when nothing was uploaded (e.g. the dialog was
# cancelled) instead of the bare IndexError that indexing an empty list raises.
if not uploaded:
    raise ValueError("No file was uploaded; please select a CSV file to clean.")

# Load the dataset (only the first uploaded file is used)
file_name = list(uploaded.keys())[0]  # Keys of `uploaded` are the file names
df = pd.read_csv(file_name)

# Quick look at the freshly loaded data: column summary, then a sample of rows
print("Initial Dataset Info:")
df.info()
print("\nFirst 5 Rows:")
first_rows = df.head()
display(first_rows)

# Handling missing values
# Report the number of null entries found in each column
print("\nChecking missing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Fill missing values (example: fill numerical columns with median, categorical with mode)
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Handle categorical columns safely
categorical_cols = df.select_dtypes(include=['object']).columns
if not categorical_cols.empty:
    # mode() ignores NaN, so it returns zero rows when every categorical column
    # is entirely missing; taking .iloc[0] of that empty frame would raise
    # IndexError. In that case there is nothing to impute with, so skip the fill.
    categorical_modes = df[categorical_cols].mode()
    if not categorical_modes.empty:
        # First row of mode() holds the most frequent value of each column
        df[categorical_cols] = df[categorical_cols].fillna(categorical_modes.iloc[0])


# Convert date columns to proper datetime format
# NOTE(review): the match is a plain substring test on the column name, so names
# that merely contain "date" are converted as well; unparseable values become NaT.
for col in df.columns:
    # str() keeps the name check working when a column label is not a string
    if 'date' in str(col).lower():
        df[col] = pd.to_datetime(df[col], errors='coerce')


def _normalize_text(value):
    """Lower-case and strip `value` if it is a string; return anything else unchanged."""
    if isinstance(value, str):
        return value.lower().strip()
    return value


# Standardizing text data (example: making categorical data lowercase and stripping spaces)
# The object columns are re-selected here rather than reusing `categorical_cols`:
# a column converted to datetime above is no longer object dtype, and calling
# `.str` on it raises AttributeError. Mapping element-wise with an isinstance
# check also keeps non-string entries of mixed columns instead of turning them
# into NaN, which is what the `.str` methods do.
text_cols = df.select_dtypes(include=['object']).columns
for col in text_cols:
    df[col] = df[col].map(_normalize_text)

# Remove duplicate rows
# Done after the text normalisation so that rows differing only in letter case
# or surrounding whitespace are recognised as duplicates.
df.drop_duplicates(inplace=True)
print("\nDuplicate rows removed.")

# Count outliers per numeric column with the 1.5 * IQR (Interquartile Range) rule
numeric_data = df[numeric_cols]
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1

# Values beyond either fence are treated as outliers
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_lower = numeric_data < lower_fence
above_upper = numeric_data > upper_fence

# Summing the boolean mask gives the number of flagged values in each column
outliers = (below_lower | above_upper).sum()
print("\nOutliers detected:")
print(outliers)

# Write the cleaned frame to a CSV file in the current working directory,
# leaving the row index out of the file
cleaned_file_path = "healthcare_data_cleaned.csv"
df.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"\nData cleaning complete. Cleaned file saved as '{cleaned_file_path}'.")

Saving healthcare_data.csv to healthcare_data (2).csv
Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PatientID      20 non-null     int64  
 1   Age            20 non-null     int64  
 2   BloodPressure  20 non-null     int64  
 3   SugarLevel     20 non-null     float64
 4   Weight         20 non-null     float64
dtypes: float64(2), int64(3)
memory usage: 932.0 bytes

First 5 Rows:


Unnamed: 0,PatientID,Age,BloodPressure,SugarLevel,Weight
0,1,44,118,87.892495,105.568034
1,2,39,109,177.321803,105.703426
2,3,49,149,144.148273,77.78707
3,4,58,121,90.355404,115.244784
4,5,35,109,126.4218,70.38379



Checking missing values:
PatientID        0
Age              0
BloodPressure    0
SugarLevel       0
Weight           0
dtype: int64

Duplicate rows removed.

Outliers detected:
PatientID        0
Age              0
BloodPressure    0
SugarLevel       0
Weight           0
dtype: int64

Data cleaning complete. Cleaned file saved as 'healthcare_data_cleaned.csv'.
