# Best Practices for Data Preprocessing

## Always Explore & Visualize Data First

In [None]:

from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load the California housing data as a DataFrame (features plus the MedHouseVal target).
data = fetch_california_housing(as_frame=True)
df = data.frame

# Blank out the first 100 rows of column 0 so the exploration below has gaps to find.
df.iloc[:100, 0] = np.nan

# Separate the target from the feature columns.
y = df['MedHouseVal']
X = df.drop(columns=['MedHouseVal'])

# Look before transforming: count the gaps per column and scan the summary
# statistics for suspicious extremes.
missing_values = X.isnull().sum()
print("Missing values per column:\n", missing_values)
print("\nSummary statistics (to detect outliers):\n", X.describe())

# Preprocessing recipe: mean-impute, then standardise.
# It is only constructed in this cell; nothing here fits or applies it.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

Missing values per column:
 MedInc        100
HouseAge        0
AveRooms        0
AveBedrms       0
Population      0
AveOccup        0
Latitude        0
Longitude       0
dtype: int64

Summary statistics (to detect outliers):
              MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20540.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.878913     28.639486      5.429000      1.096675   1425.476744   
std        1.898162     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846154      0.333333      3.000000   
25%        2.570250     18.000000      4.440716      1.006079    787.000000   
50%        3.542150     29.000000      5.229129      1.048780   1166.000000   
75%        4.750000     37.000000      6.052381      1.099526   1725.000000   
max       15.000100     52.000000    141.909091     34.066667  35682.000000   

           AveOccup      Latitude     Longitude  
count  206

## Handle Missing & Inconsistent Data Before Applying ML Models

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Reload the data and blank out the first 100 rows of column 0 to simulate gaps.
data = fetch_california_housing(as_frame=True)
df = data.frame
df.iloc[:100, 0] = np.nan

# Separate the target from the feature columns.
y = df['MedHouseVal']
X = df.drop(columns=['MedHouseVal'])

# Option 1: discard every row that has a missing feature, and keep the target
# aligned by selecting the surviving row labels.
X_dropped = X.dropna()
y_dropped = y.loc[X_dropped.index]
print("Shape after dropping missing values:", X_dropped.shape)

# Option 2: keep every row and fill the gaps with the column mean.
imputer = SimpleImputer(strategy='mean')
X_filled = imputer.fit_transform(X)
print("Shape after imputation:", X_filled.shape)
def cap_outliers(df, lower_percentile=0.01, upper_percentile=0.99):
    """Return a copy of ``df`` with each numeric column clipped to its own quantile range.

    For every numeric column, values below the ``lower_percentile`` quantile are
    raised to that quantile and values above the ``upper_percentile`` quantile are
    lowered to it. Non-numeric columns are copied through unchanged, so a frame
    that mixes numeric and text columns can be passed directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to cap. It is not modified.
    lower_percentile : float, default 0.01
        Quantile (between 0 and 1) used as the lower bound of each column.
    upper_percentile : float, default 0.99
        Quantile (between 0 and 1) used as the upper bound of each column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    capped_df = df.copy()
    # Quantiles are only defined for numeric data; leave every other column as-is.
    for col in df.select_dtypes(include='number').columns:
        lower = df[col].quantile(lower_percentile)
        upper = df[col].quantile(upper_percentile)
        capped_df[col] = df[col].clip(lower=lower, upper=upper)
    return capped_df
# Re-attach the original column labels to the imputed values, then cap each column.
X_filled_frame = pd.DataFrame(data=X_filled, columns=X.columns)
X_capped = cap_outliers(X_filled_frame)
print("Outliers handled via capping.")






Shape after dropping missing values: (20540, 8)
Shape after imputation: (20640, 8)
Outliers handled via capping.


## Choose the Right Scaling Method

In [3]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
import pandas as pd

# One (scaler, toy data) pair per demonstration. The data differ only in
# feature1: evenly spaced values for MinMaxScaler, a single large value (500)
# for RobustScaler, and values symmetric around zero for MaxAbsScaler.
scaling_demos = [
    (MinMaxScaler(), {'feature1': [10, 20, 30, 40, 50],
                      'feature2': [1, 2, 3, 4, 5]}),
    (RobustScaler(), {'feature1': [10, 20, 30, 40, 500],
                      'feature2': [1, 2, 3, 4, 5]}),
    (MaxAbsScaler(), {'feature1': [-30, -20, 0, 20, 30],
                      'feature2': [1, 2, 3, 4, 5]}),
]

# Same recipe for every scaler: build the frame, fit and transform it, then
# wrap the result back into a labelled DataFrame for printing.
# The loop variables are ordinary cell-level names, so after the loop
# `data`, `df`, `scaler`, `scaled_data` and `scaled_df` hold the last demo's values.
for scaler, data in scaling_demos:
    df = pd.DataFrame(data)
    scaled_data = scaler.fit_transform(df)
    scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
    print(scaled_df)





   feature1  feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00
   feature1  feature2
0      -1.0      -1.0
1      -0.5      -0.5
2       0.0       0.0
3       0.5       0.5
4      23.5       1.0
   feature1  feature2
0 -1.000000       0.2
1 -0.666667       0.4
2  0.000000       0.6
3  0.666667       0.8
4  1.000000       1.0


## Keep Track of Data Transformations for Reproducibility

In [4]:


import logging
from sklearn.preprocessing import MinMaxScaler, RobustScaler, MaxAbsScaler
import pandas as pd
# Send INFO-and-above log records to a file, each line prefixed with a timestamp.
logging.basicConfig(
    filename='data_preprocessing.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

# Small two-feature toy frame used by every scaler below.
data = dict(
    feature1=[10, 20, 30, 40, 50],
    feature2=[1, 2, 3, 4, 5],
)
df = pd.DataFrame(data)
def log_preprocessing_step(step_name, details):
    """Record one preprocessing step at INFO level via the root logger.

    Parameters
    ----------
    step_name : str
        Human-readable label for the step.
    details : object
        Anything describing the step (for example a dict of fitted
        parameters); it is rendered into the message with ``%s``.
    """
    # Pass the values as logging arguments instead of pre-formatting the string:
    # the message is only built if the record is actually emitted.
    logging.info("Step: %s, Details: %s", step_name, details)
# Min-max scaling: fit, transform, and log the per-feature min/max that were learned.
scaler_minmax = MinMaxScaler()
scaled_data_minmax = scaler_minmax.fit_transform(df)
log_preprocessing_step("Min-Max Scaling", {"min": scaler_minmax.data_min_, "max": scaler_minmax.data_max_})

# Robust scaling: log the fitted centre and scale of each feature.
scaler_robust = RobustScaler()
scaled_data_robust = scaler_robust.fit_transform(df)
log_preprocessing_step("Robust Scaling", {"median": scaler_robust.center_, "IQR": scaler_robust.scale_})

# Max-abs scaling: log the largest absolute value seen per feature.
scaler_maxabs = MaxAbsScaler()
scaled_data_maxabs = scaler_maxabs.fit_transform(df)
log_preprocessing_step("MaxAbs Scaling", {"max_abs": scaler_maxabs.max_abs_})

# Show the three scaled arrays in the same order they were computed.
for scaled_array in (scaled_data_minmax, scaled_data_robust, scaled_data_maxabs):
    print(scaled_array)

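# Persist the fitted scalers to disk so the exact same transformation can be reloaded and reapplied later.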
import pickle
def store_transformation_parameters(scaler, filename):
    """Pickle ``scaler`` to ``filename``.

    Parameters
    ----------
    scaler : object
        The object to persist (here, a fitted scaler).
    filename : str
        Destination path; it is opened in binary write mode, so an existing
        file at that path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(scaler, sink)
# Write each fitted scaler to its own pickle file.
for fitted_scaler, pickle_path in (
    (scaler_minmax, 'minmax_scaler.pkl'),
    (scaler_robust, 'robust_scaler.pkl'),
    (scaler_maxabs, 'maxabs_scaler.pkl'),
):
    store_transformation_parameters(fitted_scaler, pickle_path)
def load_transformation_parameters(filename):
    """Read back an object that was pickled to ``filename``.

    Parameters
    ----------
    filename : str
        Path of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object the file contains.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files you created
    yourself or otherwise trust.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)
# Round-trip check: reload the min-max scaler and show the per-feature
# minimum and maximum stored on it.
loaded_scaler_minmax = load_transformation_parameters(filename='minmax_scaler.pkl')
learned_min, learned_max = loaded_scaler_minmax.data_min_, loaded_scaler_minmax.data_max_
print(learned_min, learned_max)





[[0.   0.  ]
 [0.25 0.25]
 [0.5  0.5 ]
 [0.75 0.75]
 [1.   1.  ]]
[[-1.  -1. ]
 [-0.5 -0.5]
 [ 0.   0. ]
 [ 0.5  0.5]
 [ 1.   1. ]]
[[0.2 0.2]
 [0.4 0.4]
 [0.6 0.6]
 [0.8 0.8]
 [1.  1. ]]
[10.  1.] [50.  5.]
