<a href="https://colab.research.google.com/github/Nikhilesh-075/6thSem-ML-Lab/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress warnings for clarity
import warnings
warnings.filterwarnings('ignore')

# Upload files (opens the Colab file picker; the returned mapping is kept
# in `uploaded` but the CSVs are read back by name below)
from google.colab import files
uploaded = files.upload()

# Read uploaded datasets
# Change filenames if they differ after upload
diabetes_df = pd.read_csv("diabetes_data_upload.csv")
adult_df = pd.read_csv("adult.csv")

# Preview the first rows of each dataset under its banner
for banner, frame in (
    ("=== Diabetes Dataset ===", diabetes_df),
    ("\n=== Adult Dataset ===", adult_df),
):
    print(banner)
    print(frame.head())

# ---------- 1. DATA CLEANING ----------

def handle_missing_values(df):
    """Drop every row that contains a missing value, reporting counts.

    The per-column null counts are printed before and after the drop.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame without the rows that had at least one null.
    """
    nulls_before = df.isnull().sum()
    print("\nMissing Values Before:\n", nulls_before)

    # Row-wise drop; fillna() would be the alternative if imputation is wanted
    cleaned = df.dropna()

    nulls_after = cleaned.isnull().sum()
    print("\nMissing Values After:\n", nulls_after)
    return cleaned

def handle_categorical_data(df):
    """One-hot encode the categorical columns of *df*.

    Encoding is delegated to ``pd.get_dummies`` with ``drop_first=True``,
    so the first level of each encoded column gets no indicator column.

    Args:
        df: DataFrame to encode.

    Returns:
        The encoded DataFrame produced by ``pd.get_dummies``.
    """
    return pd.get_dummies(df, drop_first=True)

def handle_outliers(df, method="IQR"):
    """Drop rows whose numeric values lie outside the 1.5 * IQR fences.

    Numeric columns are processed one at a time in their existing order.
    Each column's quartiles are computed on the rows that survived the
    filters of the preceding columns, so the result can depend on column
    order. Rows holding NaN in a numeric column are dropped as well,
    because NaN never satisfies the range check.

    Args:
        df: DataFrame to filter. It is not modified.
        method: Outlier rule to apply. Only ``"IQR"`` (case-insensitive)
            is implemented.

    Returns:
        The filtered DataFrame; surviving rows keep their index labels.

    Raises:
        ValueError: If *method* names an unsupported rule. Previously the
            argument was silently ignored.
    """
    if str(method).upper() != "IQR":
        raise ValueError(
            f"Unsupported outlier method: {method!r} (only 'IQR' is implemented)"
        )

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # between() is inclusive on both ends, i.e. lower <= value <= upper
        df = df[df[col].between(lower, upper)]
    return df

# ---------- 2. DATA TRANSFORMATION ----------

def normalize_minmax(df):
    """Rescale the numeric columns of *df* with ``MinMaxScaler``.

    The scaler is used with its default settings (feature range 0 to 1)
    and is fitted on the same data it transforms. Non-numeric columns are
    left untouched.

    Args:
        df: DataFrame to scale. It is not modified; the work is done on a
            copy so callers do not need to copy defensively.

    Returns:
        A new DataFrame with the numeric columns rescaled. If *df* has no
        rows or no numeric columns, an unscaled copy is returned, because
        the scaler raises when given nothing to fit.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    result = df.copy()
    if result.empty or len(numeric_cols) == 0:
        return result

    scaler = MinMaxScaler()
    result[numeric_cols] = scaler.fit_transform(result[numeric_cols])
    return result

def standardize(df):
    """Standardize the numeric columns of *df* with ``StandardScaler``.

    The scaler is used with its default settings (centre on the mean,
    scale to unit variance) and is fitted on the same data it transforms.
    Non-numeric columns are left untouched.

    Args:
        df: DataFrame to scale. It is not modified; the work is done on a
            copy so callers do not need to copy defensively.

    Returns:
        A new DataFrame with the numeric columns standardized. If *df* has
        no rows or no numeric columns, an unscaled copy is returned,
        because the scaler raises when given nothing to fit.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    result = df.copy()
    if result.empty or len(numeric_cols) == 0:
        return result

    scaler = StandardScaler()
    result[numeric_cols] = scaler.fit_transform(result[numeric_cols])
    return result

# ------------ PROCESSING DIABETES DATASET -------------
print("\n--- Processing Diabetes Dataset ---")

# 1. Cleaning: drop rows with nulls, one-hot encode, then filter outliers
#    (innermost call runs first)
diabetes_df_clean = handle_outliers(
    handle_categorical_data(
        handle_missing_values(diabetes_df)
    )
)

# 2. Transformations: each scaler gets its own copy so both start from
#    the same cleaned data
diabetes_minmax = normalize_minmax(diabetes_df_clean.copy())
diabetes_standard = standardize(diabetes_df_clean.copy())

print("\nDiabetes (Min-Max Normalized):\n", diabetes_minmax.head())
print("\nDiabetes (Standardized):\n", diabetes_standard.head())

# ------------ PROCESSING ADULT INCOME DATASET -------------
print("\n--- Processing Adult Income Dataset ---")

# 1. Cleaning: drop rows with nulls, one-hot encode, then filter outliers
#    (innermost call runs first)
adult_df_clean = handle_outliers(
    handle_categorical_data(
        handle_missing_values(adult_df)
    )
)

# 2. Transformations: each scaler gets its own copy so both start from
#    the same cleaned data
adult_minmax = normalize_minmax(adult_df_clean.copy())
adult_standard = standardize(adult_df_clean.copy())

print("\nAdult Income (Min-Max Normalized):\n", adult_minmax.head())
print("\nAdult Income (Standardized):\n", adult_standard.head())
