<a href="https://colab.research.google.com/github/Satish1895/6thSem-ML-Lab/blob/main/1BM23CS306_Lab_1_DataProcessing_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def cap_outliers_iqr(df, col_name, factor=1.5):
    """Cap the outliers of one numeric column using IQR-based fences.

    Values below ``Q1 - factor * IQR`` are raised to that fence and values
    above ``Q3 + factor * IQR`` are lowered to it.

    Args:
        df: DataFrame holding the column. It is modified in place.
        col_name: Label of the numeric column to cap.
        factor: Fence width as a multiple of the IQR (default 1.5).

    Returns:
        The same ``df`` object, so the call can be chained or reassigned.
    """
    column = df[col_name]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1

    # A zero IQR (Q1 == Q3, e.g. a zero-inflated column) would make both
    # fences equal and flatten the whole column to a single constant; a NaN
    # IQR (no data) gives no usable fences at all. Leave the column untouched
    # in both cases.
    if not iqr > 0:
        return df

    df[col_name] = column.clip(lower=q1 - factor * iqr, upper=q3 + factor * iqr)
    return df

# Datasets to preprocess; each file is cleaned independently of the others.
file_paths = ['/content/adult.csv', '/content/Dataset of Diabetes .csv']

# Processed frames keyed by source path, so every result stays available
# after the loop instead of being overwritten by the next file.
processed_frames = {}

for file_path in file_paths:
    print(f"\n--- Processing file: {file_path} ---")
    try:
        df = pd.read_csv(file_path)

        # Column groups are fixed before encoding, so the later steps only
        # touch the original numeric columns and never the generated dummies.
        # 'number' selects every numeric dtype, not just int64/float64.
        num_cols = df.select_dtypes(include='number').columns
        cat_cols = df.select_dtypes(include=['object']).columns
        has_num = len(num_cols) > 0
        has_cat = len(cat_cols) > 0

        # Only NaN-style missing values are detected here.
        # NOTE(review): placeholder strings such as '?' would not be
        # counted -- confirm how these files encode missing data.
        missing_values = df.isnull().sum()
        missing_cols = missing_values[missing_values > 0].index.tolist()
        print(f"Columns with missing values for {file_path}:", missing_cols)

        # Impute numeric gaps with the column mean.
        if has_num:
            df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

        # Impute categorical gaps with the most frequent value. mode() can
        # return several rows on a tie, so the first row is used; it is
        # empty when no categorical column has any value, in which case
        # there is nothing to fill with.
        if has_cat:
            modes = df[cat_cols].mode()
            if not modes.empty:
                df[cat_cols] = df[cat_cols].fillna(modes.iloc[0])

        print(f"Categorical columns for {file_path}:", list(cat_cols))

        # One-hot encode; drop_first removes one level per column.
        if has_cat:
            df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

        # Cap outliers before scaling, so the min/max used for scaling are
        # taken from the capped values.
        for col_name in num_cols:
            df = cap_outliers_iqr(df, col_name)

        # Rescale the numeric columns with min-max scaling.
        if has_num:
            scaler = MinMaxScaler()
            df[num_cols] = scaler.fit_transform(df[num_cols])

        processed_frames[file_path] = df
        print(f"Finished processing {file_path}. Shape of processed DataFrame: {df.shape}")

    except Exception as e:
        # Top-level boundary: report the failure and continue with the
        # remaining files rather than aborting the whole run.
        print(f"Error processing {file_path}: {e}")


--- Processing file: /content/adult.csv ---
Columns with missing values for /content/adult.csv: []
Categorical columns for /content/adult.csv: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
Finished processing /content/adult.csv. Shape of processed DataFrame: (48842, 101)

--- Processing file: /content/Dataset of Diabetes .csv ---
Columns with missing values for /content/Dataset of Diabetes .csv: []
Categorical columns for /content/Dataset of Diabetes .csv: ['Gender', 'CLASS']
Finished processing /content/Dataset of Diabetes .csv. Shape of processed DataFrame: (1000, 18)
