# Group Exercise 1 - Data Preprocessing on a Real Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

df = pd.read_csv('BigMart_Sales.csv')
print("Dataset shape:", df.shape)
print("\nColumn types:")
print(df.dtypes)

Dataset shape: (5681, 11)

Column types:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object


## 1. Handling Missing Values 

(Done by Shruti Bhandari)

In [2]:
print("Missing values:")
print(df.isnull().sum())

Missing values:
Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64


In [3]:
df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)
df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)

print("Missing values after imputation:")
print(df.isnull().sum())

Missing values after imputation:
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)


## 2. Scaling Numerical Features 

(Done by Shruti Bhandari)

### 2.1 Z-Score Standardization

In [4]:
numerical_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']

scaler_z = StandardScaler()
df_zscore = pd.DataFrame(
    scaler_z.fit_transform(df[numerical_cols]),
    columns=[f'{col}_zscore' for col in numerical_cols]
)

print("Z-score standardization (first 5 rows):")
print(df_zscore.head())

Z-score standardization (first 5 rows):
   Item_Weight_zscore  Item_Visibility_zscore  Item_MRP_zscore  \
0        1.897460e+00               -1.134077        -0.536555   
1       -1.035530e+00               -0.531850        -0.868937   
2        4.486336e-01                0.661316         1.629848   
3       -1.267578e+00               -0.981416         0.226697   
4        4.184768e-16                1.032540         1.508110   

   Outlet_Establishment_Year_zscore  
0                          0.139891  
1                          1.095512  
2                          0.020438  
3                          1.095512  
4                         -1.532446  


### 2.2 Minâ€“Max Normalization

In [5]:
scaler_mm = MinMaxScaler()
df_minmax = pd.DataFrame(
    scaler_mm.fit_transform(df[numerical_cols]),
    columns=[f'{col}_minmax' for col in numerical_cols]
)

print("Min-Max normalization (first 5 rows):")
print(df_minmax.head())

Min-Max normalization (first 5 rows):
   Item_Weight_minmax  Item_Visibility_minmax  Item_MRP_minmax  \
0            0.964275                0.023374         0.323413   
1            0.222983                0.118737         0.235849   
2            0.598095                0.307674         0.894140   
3            0.164335                0.047548         0.524488   
4            0.484706                0.366458         0.862069   

   Outlet_Establishment_Year_minmax  
0                          0.583333  
1                          0.916667  
2                          0.541667  
3                          0.916667  
4                          0.000000  


### 2.3 Explaining the Difference

In [6]:
print("Difference:")
print("\nZ-score: (x - mean) / std, range unbounded, mean=0, std=1")
print("Min-Max: (x - min) / (max - min), range [0,1]")

Difference:

Z-score: (x - mean) / std, range unbounded, mean=0, std=1
Min-Max: (x - min) / (max - min), range [0,1]
