In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Extra tokens to read as missing data when parsing the CSV.
missing_values = ["na", "n/a", "NA", "unknown", ""]

df = pd.read_csv("unclean_penguins.csv", na_values=missing_values)

# Report the raw size, then the per-column count of missing entries.
print("📦 Original dataset shape:", df.shape)
print("🧼 Nulls and invalid values:", df.isnull().sum(), sep="\n")

📦 Original dataset shape: (210, 7)
🧼 Nulls and invalid values:
species                0
island                 0
bill_length_mm       151
bill_depth_mm        110
flipper_length_mm    133
body_mass_g          105
sex                   66
dtype: int64


In [None]:
# Baseline strategy: discard every row that has at least one missing field.
df_dropna = df.dropna(axis=0, how="any")
print("\n📉 Dataset shape after dropna:", df_dropna.shape)


📉 Dataset shape after dropna: (2, 7)


In [None]:
# Impute on a copy so the raw frame `df` stays available for comparison.
df_imputed = df.copy()

# Fill numeric columns with mean.
# The filled Series is assigned back to its column on purpose: calling
# `df_imputed[col].fillna(..., inplace=True)` mutates an intermediate object,
# raises a FutureWarning today, and no longer updates the frame in pandas 3.0.
# NOTE(review): the sample output shows body_mass_g values such as 99999,
# which look like sentinel codes and inflate the column mean -- confirm
# whether they should be treated as missing before imputing.
numeric_columns = df_imputed.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mean())

# Fill categorical columns with mode (the first one when several values tie).
# NOTE(review): the sample output shows mixed-case labels in 'sex'
# ('MALE', 'female'), so case variants are counted as separate categories
# here -- confirm whether text should be normalised before taking the mode.
categorical_columns = df_imputed.select_dtypes(exclude=[np.number]).columns
for col in categorical_columns:
    modes = df_imputed[col].mode()
    # A column with no observed values has no mode; leave it unfilled rather
    # than fail on an empty lookup. The null counts printed below expose it.
    if not modes.empty:
        df_imputed[col] = df_imputed[col].fillna(modes.iloc[0])



print("\n✅ Dataset shape after imputation:", df_imputed.shape)
print("\n📊 Sample of imputed data:")
print(df_imputed.head())

print("\n✅ Missing values after imputation:")
print(df_imputed.isnull().sum())




✅ Dataset shape after imputation: (210, 7)

📊 Sample of imputed data:
     species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0     Adelie  Torgersen       45.308475         16.700         212.000000   
1     Adelie  Torgersen       45.308475         17.600         188.000000   
2  Chinstrap  Torgersen       45.308475         16.879         191.000000   
3     Gentoo  Torgersen       45.308475         16.879         201.506494   
4  Chinstrap      Dream       45.100000         16.100         201.506494   

    body_mass_g     sex  
0  99999.000000    MALE  
1   4238.000000  female  
2  46332.838095    MALE  
3  46332.838095  female  
4   5047.000000    MALE  

✅ Missing values after imputation:
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(df_imputed[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(df_imputed[col].mode()[0], inplace=True)


In [None]:
# Normalize text values in 'sex' column: trim surrounding whitespace and fold
# to lower case, then give the two known labels one capitalised spelling.
# Any other value keeps its trimmed, lower-cased form.
df_imputed['sex'] = (
    df_imputed['sex']
    .str.strip()
    .str.lower()
    .replace({'male': 'Male', 'female': 'Female'})
)
print(df_imputed.head())


     species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0     Adelie  Torgersen       45.308475         16.700         212.000000   
1     Adelie  Torgersen       45.308475         17.600         188.000000   
2  Chinstrap  Torgersen       45.308475         16.879         191.000000   
3     Gentoo  Torgersen       45.308475         16.879         201.506494   
4  Chinstrap      Dream       45.100000         16.100         201.506494   

    body_mass_g     sex  
0  99999.000000    Male  
1   4238.000000  Female  
2  46332.838095    Male  
3  46332.838095  Female  
4   5047.000000    Male  


In [None]:
# Work on a copy so the unscaled, imputed frame remains available.
scaled_df = df_imputed.copy()

# Apply scaling to numeric columns: fit the min-max scaler on them and
# overwrite them with the rescaled values; other columns are left as they are.
scaler = MinMaxScaler()
scaled_df[numeric_columns] = scaler.fit_transform(
    df_imputed[numeric_columns]
)

print(
    "\n📈 Scaled values (first 5 rows):",
    scaled_df[numeric_columns].head(),
    sep="\n",
)


📈 Scaled values (first 5 rows):
   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
0        0.442715       0.413793           0.711864     1.000000
1        0.442715       0.517241           0.305085     0.012570
2        0.442715       0.434368           0.355932     0.446627
3        0.442715       0.434368           0.534008     0.446627
4        0.432692       0.344828           0.534008     0.020912
