In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Define a function to cap outliers using the IQR method
def cap_outliers(df, col, k=1.5):
    """Cap (winsorize) outliers in one numeric column using the IQR rule.

    Values below ``Q1 - k * IQR`` are raised to that lower bound and values
    above ``Q3 + k * IQR`` are lowered to that upper bound, where Q1/Q3 are
    the 25th/75th percentiles of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a capped copy is returned.
    col : str
        Name of the numeric column to cap.
    k : float, default 1.5
        Multiplier applied to the IQR to build the bounds (1.5 is the
        conventional "Tukey fence").

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``col`` has been clipped to the IQR bounds.
        If the IQR is zero (or undefined because the column is all-NaN) the
        column is returned unchanged.
    """
    # Work on a copy so re-running a cell never silently alters the caller's frame.
    capped = df.copy()

    q1 = capped[col].quantile(0.25)
    q3 = capped[col].quantile(0.75)
    iqr = q3 - q1

    # With IQR == 0 both bounds collapse onto a single value and every other
    # value would be overwritten, turning the column into a constant (this is
    # what happens to columns that are mostly one value, e.g. mostly zeros).
    # `not iqr > 0` also covers a NaN IQR from an all-NaN column.
    if not iqr > 0:
        return capped

    capped[col] = capped[col].clip(lower=q1 - k * iqr, upper=q3 + k * iqr)
    return capped

# **DIABETES DATASET**

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): the datasets below are read via relative paths ("diabetes.csv",
# "adult.csv"), not from the mounted drive -- confirm whether this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 1. Load the Diabetes dataset
# Reads "diabetes.csv" from the current working directory into diabetes_df,
# which every later Diabetes cell reads and rebinds.
diabetes_df = pd.read_csv("diabetes.csv")
print("=== Diabetes Dataset: Original Data ===")
# Last expression of the cell: rendered as a rich table preview of the first rows.
diabetes_df.head()

=== Diabetes Dataset: Original Data ===


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [None]:
# 2. Data Cleaning
# 2.a Handling Missing Values
# Check missing values (assumed as NaN)
print("\nMissing values in Diabetes dataset:")
print(diabetes_df.isnull().sum())

# For numerical columns: fill missing values with mean.
# NOTE(review): num_cols also picks up the identifier-like numeric columns
# (ID, No_Pation); later cells cap and scale everything in num_cols -- consider
# excluding identifiers.
num_cols = diabetes_df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object, emits a
    # FutureWarning, and stops modifying the frame in pandas 3.0.
    diabetes_df[col] = diabetes_df[col].fillna(diabetes_df[col].mean())

# For categorical columns: fill missing values with mode.
cat_cols = diabetes_df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    modes = diabetes_df[col].mode()
    # mode() is empty when the column is entirely NaN; nothing to impute with then.
    if not modes.empty:
        diabetes_df[col] = diabetes_df[col].fillna(modes[0])


Missing values in Diabetes dataset:
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_df[col].fillna(diabetes_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  diabetes_df[col].fillna(diabetes_df[col].mode()[0], inplace=True)


In [None]:
# 2.c Handling Outliers for numerical columns
# Apply IQR capping to each numeric column in turn; cap_outliers returns the
# frame, which is rebound to diabetes_df on every iteration.
# NOTE(review): diabetes_df is overwritten here, so re-running this cell caps
# already-capped data -- re-run from the load cell to start clean.
for col in num_cols:
    diabetes_df = cap_outliers(diabetes_df, col)


In [None]:
# 3. Data Transformations

# Create copies for each scaling technique
# (separate copies so both scalers start from the same capped data and
# diabetes_df itself stays unscaled; diabetes_standard is scaled in a later cell)
diabetes_minmax = diabetes_df.copy()
diabetes_standard = diabetes_df.copy()

# Apply Min-Max Scaling
# fit_transform learns each column's min/max and rescales the numeric columns
# (0..1 with the scaler's default settings); non-numeric columns are untouched.
minmax_scaler = MinMaxScaler()
diabetes_minmax[num_cols] = minmax_scaler.fit_transform(diabetes_minmax[num_cols])



In [None]:
# 1. Load the Adult Income dataset
# Reads "adult.csv" from the current working directory into adult_income_df.
adult_income_df = pd.read_csv("adult.csv")
print("\n=== Adult Income Dataset: Original Data ===")
# Plain-text preview; note that missing entries appear as the literal string "?".
print(adult_income_df.head())


=== Adult Income Dataset: Original Data ===
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0            

In [None]:
# Standardize the numeric Diabetes columns (zero mean, unit variance per column).
# The scaler is fitted first and then applied to the same columns.
standard_scaler = StandardScaler().fit(diabetes_standard[num_cols])
diabetes_standard[num_cols] = standard_scaler.transform(diabetes_standard[num_cols])

# Show a header followed by a preview for each scaled variant.
print(
    "\n=== Diabetes Dataset after Preprocessing (Min-Max Scaled) ===",
    diabetes_minmax.head(),
    sep="\n",
)

print(
    "\n=== Diabetes Dataset after Preprocessing (Standard Scaled) ===",
    diabetes_standard.head(),
    sep="\n",
)



=== Diabetes Dataset after Preprocessing (Min-Max Scaled) ===
         ID  No_Pation Gender      AGE   Urea     Cr     HbA1c      Chol  \
0  0.627034   0.231118      F  0.34375  0.500  0.355  0.266892  0.406250   
1  0.918648   0.441444      M  0.00000  0.475  0.515  0.266892  0.328125   
2  0.524406   0.619508      F  0.34375  0.500  0.355  0.266892  0.406250   
3  0.849812   1.000000      F  0.34375  0.500  0.355  0.266892  0.406250   
4  0.629537   0.441470      M  0.00000  0.800  0.355  0.266892  0.515625   

         TG     HDL       LDL      VLDL       BMI CLASS  
0  0.127660  1.0000  0.209524  0.153846  0.204082     N  
1  0.234043  0.5000  0.342857  0.192308  0.163265     N  
2  0.127660  1.0000  0.209524  0.153846  0.204082     N  
3  0.127660  1.0000  0.209524  0.153846  0.204082     N  
4  0.148936  0.3125  0.323810  0.115385  0.081633     N  

=== Diabetes Dataset after Preprocessing (Standard Scaled) ===
         ID  No_Pation Gender       AGE      Urea        Cr     HbA1

In [None]:
# 2. Data Cleaning
# 2.a Handling Missing Values
# In this dataset, missing values might be represented by "?".
# Rebind the result rather than using inplace=True so the step reads as an
# explicit transformation of the frame.
adult_income_df = adult_income_df.replace("?", np.nan)
print("\nMissing values after replacing '?' with NaN:")
print(adult_income_df.isnull().sum())

# For numerical columns: fill missing values with mean.
num_cols_adult = adult_income_df.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols_adult:
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object, emits a
    # FutureWarning, and stops modifying the frame in pandas 3.0.
    adult_income_df[col] = adult_income_df[col].fillna(adult_income_df[col].mean())

# For categorical columns: fill missing values with mode.
cat_cols_adult = adult_income_df.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols_adult:
    modes = adult_income_df[col].mode()
    # mode() is empty when the column is entirely NaN; nothing to impute with then.
    if not modes.empty:
        adult_income_df[col] = adult_income_df[col].fillna(modes[0])


Missing values after replacing '?' with NaN:
age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adult_income_df[col].fillna(adult_income_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adult_income_df[col].fillna(adult_income_df[col].mode()[0], inplace=True)


In [None]:
# 2.b Handling Categorical Data
# Encode categorical variables using One-Hot Encoding
# Each column listed in cat_cols_adult is replaced by one indicator column per
# category; the numeric columns are carried over unchanged.
# NOTE(review): cat_cols_adult also contains the target column 'income', so it is
# split into income_<=50K / income_>50K here -- confirm this is intended rather
# than keeping a single encoded label column.
adult_income_encoded = pd.get_dummies(adult_income_df, columns=cat_cols_adult)
print("\n=== Adult Income Dataset after One-Hot Encoding ===")
print(adult_income_encoded.head())


=== Adult Income Dataset after One-Hot Encoding ===
   age  fnlwgt  educational-num  capital-gain  capital-loss  hours-per-week  \
0   25  226802                7             0             0              40   
1   38   89814                9             0             0              50   
2   28  336951               12             0             0              40   
3   44  160323               10          7688             0              40   
4   18  103497               10             0             0              30   

   workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \
0                  False                False                   False   
1                  False                False                   False   
2                  False                 True                   False   
3                  False                False                   False   
4                  False                False                   False   

   workclass_Private  ...  native

In [None]:
# 2.c Handling Outliers for numerical columns in the encoded dataset
# Only the original numeric columns (num_cols_adult) are capped; the one-hot
# indicator columns are left alone.
# NOTE(review): columns dominated by a single value (capital-gain / capital-loss
# are mostly 0 in the preview) have an IQR of 0 -- verify how cap_outliers treats
# that case so such a column is not flattened to a constant.
for col in num_cols_adult:
    adult_income_encoded = cap_outliers(adult_income_encoded, col)

In [None]:
# 3. Data Transformations

# Create copies for each scaling technique
# (separate copies so both scalers start from the same encoded, capped data)
adult_income_minmax = adult_income_encoded.copy()
adult_income_standard = adult_income_encoded.copy()

# Apply Min-Max Scaling
# NOTE: this reuses the minmax_scaler object created for the Diabetes data;
# fit_transform refits it on the Adult columns, so afterwards it no longer holds
# the Diabetes min/max values.
adult_income_minmax[num_cols_adult] = minmax_scaler.fit_transform(adult_income_minmax[num_cols_adult])


In [None]:
# Apply Standard Scaling
# NOTE: this reuses the standard_scaler object created for the Diabetes data;
# fit_transform refits it on the Adult Income numeric columns.
adult_income_standard[num_cols_adult] = standard_scaler.fit_transform(adult_income_standard[num_cols_adult])

print("\n=== Adult Income Dataset after Preprocessing (Min-Max Scaled) ===")
# Only the LAST expression of a cell is rendered automatically, so a bare
# `.head()` in the middle of the cell shows nothing. display() (provided by
# the IPython/Jupyter kernel) renders it explicitly.
display(adult_income_minmax.head())

print("\n=== Adult Income Dataset after Preprocessing (Standard Scaled) ===")
adult_income_standard.head()


=== Adult Income Dataset after Preprocessing (Min-Max Scaled) ===

=== Adult Income Dataset after Preprocessing (Standard Scaled) ===


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income_<=50K,income_>50K
0,-0.999145,0.419934,-1.270758,0.0,0.0,-0.192863,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
1,-0.045382,-1.017089,-0.456714,0.0,0.0,1.424021,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
2,-0.779046,1.575412,0.764352,0.0,0.0,-0.192863,False,True,False,False,...,False,False,False,False,False,True,False,False,False,True
3,0.394816,-0.27744,-0.049692,0.0,0.0,-0.192863,False,False,False,True,...,False,False,False,False,False,True,False,False,False,True
4,-1.51271,-0.873553,-0.049692,0.0,0.0,-1.405526,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False


In [None]:
# prompt: For both the datasets Diabetes and Adult income
# 1. Which columns in the dataset had missing values? How did you
# handle them ?
# 2. Which categorical columns did you identify in the dataset? How did
# you encode them ?
# 3. What is the difference between Min-Max Scaling and
# Standardization? When would you use one over the other?

# 1. Missing Values
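# The code already handles missing values.
# For the Diabetes dataset, isnull().sum() reported 0 missing values in every column,
# so the mean (numerical) / mode (categorical) imputation had nothing to fill.
# For the Adult Income dataset, "?" is first replaced with NaN; this exposes missing
# values in workclass (2799), occupation (2809) and native-country (857). All three
# are categorical, so they are filled with the mode. The numerical columns had no
# missing values (they would otherwise be filled with the mean).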

# 2. Categorical Columns and Encoding
# Diabetes Dataset:
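# The 'Gender' and 'CLASS' columns are categorical (text values such as F/M and N). The cells shown here do not encode them; Label Encoding (LabelEncoder is already imported) would be a suitable choice.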

# Adult Income Dataset:
# All identified categorical columns are encoded using One-Hot Encoding.  The code automatically detects and handles the categorical columns.

# 3. Min-Max Scaling vs. Standardization

# Difference:
# Min-Max Scaling scales the data to a specific range (usually 0 to 1).  It's sensitive to outliers.
# Standardization (Z-score normalization) transforms data to have a mean of 0 and a standard deviation of 1. It's less sensitive to outliers.

# When to use which:
# Min-Max Scaling is preferred when the algorithm is sensitive to the magnitude of the features (e.g., KNN, K-Means).  Use it when you know the data's distribution doesn't have significant outliers.
# Standardization is generally preferred if the data has a Gaussian-like distribution or when the algorithm benefits from centered, unit-variance features (e.g., regularized linear regression, logistic regression, SVM, PCA). Use it when you're concerned about outliers influencing the scaling: outliers still affect the mean and standard deviation, but they do not squeeze all other values into a narrow band the way they do with Min-Max Scaling.


In [None]:
# prompt: For both the datasets Diabetes and Adult income
# 1. Which columns in the dataset had missing values? How did you
# handle them ?
# 2. Which categorical columns did you identify in the dataset? How did
# you encode them ?
# 3. What is the difference between Min-Max Scaling and
# Standardization? When would you use one over the other?
# also with code for each questions 1, 2, 3, not the entire

# 1. Missing Values (Diabetes Dataset)
print("\nMissing values in Diabetes dataset:")
print(diabetes_df.isnull().sum())
# Explanation: The code checks for missing values (NaN) in each column of the diabetes dataset and prints the count of missing values for each column.  Numerical columns are filled with the mean of the column, and categorical columns with the mode.

# 2. Categorical Columns and Encoding (Adult Income Dataset)
# Identify categorical columns
cat_cols_adult = adult_income_df.select_dtypes(include=['object']).columns.tolist()
print("\nCategorical columns in Adult Income dataset:", cat_cols_adult)
# Explanation:  This part identifies categorical columns in the adult income dataset.
# One-Hot encoding is then applied to these columns using pd.get_dummies.


# 3. Min-Max Scaling vs. Standardization
# (Explanation provided in the original code comments)

# Example usage of MinMaxScaler
# (MinMaxScaler and StandardScaler are already imported in the notebook's first cell.)
# Column labels are case-sensitive: the Diabetes frame uses 'AGE' and 'BMI';
# the lowercase 'age' / 'bmi' raised a KeyError.
numerical_features = ['AGE', 'BMI']
minmax_scaler = MinMaxScaler()
scaled_data = minmax_scaler.fit_transform(diabetes_df[numerical_features])
# Preview the first rows only instead of dumping the whole array.
print(scaled_data[:5])


# Example usage of StandardScaler
standard_scaler = StandardScaler()
scaled_data_standard = standard_scaler.fit_transform(diabetes_df[numerical_features])
scaled_data_standard[:5]



Missing values in Diabetes dataset:
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64

Categorical columns in Adult Income dataset: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']


KeyError: "None of [Index(['age', 'bmi'], dtype='object')] are in the [columns]"