# Data Preprocessing Pipeline

In [51]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [52]:
def preprocessing_pipeline(df):
    """Impute, de-outlier and standardise a DataFrame, returning a new frame.

    Steps, in order:
      1. Split columns into categorical (``object`` dtype) and numerical
         (``float`` / ``int`` dtypes) groups.
      2. Fill missing numerical values with the column mean and missing
         categorical values with the first mode of the column.
      3. Replace numerical values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
         with the column mean.
      4. Standardise the numerical columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; the pipeline works on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy: mutating the caller's frame makes the function
    # non-idempotent, so re-running the calling cell would silently apply
    # the outlier replacement and scaling a second time.
    df = df.copy()

    # Identifying numerical and categorical variables
    categorical_variables = df.select_dtypes(include=['object']).columns
    numerical_variables = df.select_dtypes(include=['float', 'int']).columns

    has_numerical = len(numerical_variables) > 0
    has_categorical = len(categorical_variables) > 0

    # Handling missing values in numerical and categorical features
    if has_numerical:
        df[numerical_variables] = df[numerical_variables].fillna(
            df[numerical_variables].mean()
        )
    if has_categorical:
        modes = df[categorical_variables].mode()
        # mode() has no rows when every categorical column is entirely NaN;
        # indexing row 0 would raise, and there is nothing to fill with anyway.
        if not modes.empty:
            df[categorical_variables] = df[categorical_variables].fillna(modes.iloc[0])

    # Detect and handle outliers in numeric variables using IQR
    for variable in numerical_variables:
        Q1 = df[variable].quantile(0.25)
        Q3 = df[variable].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        is_outlier = (df[variable] < lower_bound) | (df[variable] > upper_bound)
        # NOTE(review): the replacement mean is computed over the whole column,
        # outliers included, so the substituted value can itself lie outside
        # the bounds. Kept as-is to preserve existing results.
        df[variable] = np.where(is_outlier, df[variable].mean(), df[variable])

    # Normalize numeric features. StandardScaler rejects input with zero
    # columns or zero rows, so skip scaling when there is nothing to scale.
    if has_numerical and len(df) > 0:
        scaler = StandardScaler()
        # fit_transform fits and transforms in a single pass; a separate
        # transform() call afterwards would only recompute the same values.
        df[numerical_variables] = scaler.fit_transform(df[numerical_variables])

    return df
                    
 

In [53]:
import pandas as pd

In [56]:
# Load the raw sample dataset and display it before any preprocessing.
df = pd.read_csv("sample_data.csv")
print("ORIGINAL DATA")
df


ORIGINAL DATA


Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,1.0,7,A
1,2.0,8,B
2,,9,
3,4.0,10,A
4,5.0,11,B
5,6.0,50,C


In [58]:
# Run the full preprocessing pipeline on the loaded frame and display the result.
preprocessed_data = preprocessing_pipeline(df)
print("PREPROCESSED DATA")
preprocessed_data

PREPROCESSED DATA


Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,-1.535624,-1.611372,A
1,-0.9449991,-0.875523,B
2,3.700743e-17,-0.139675,A
3,0.2362498,0.596173,A
4,0.8268742,1.332022,B
5,1.417499,0.698375,C
