In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [21]:
def data_preprocessing_pipeline(data):
    numeric_features= data.select_dtypes(include =['float','int']).columns
    
    categorical_features = data.select_dtypes(include=['object']).columns
    
    data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())
    
    #using interquartile range to find the outliers in the the numeric feature
    for feature in numeric_features:
        Q1 = data[feature].quantile(0.25)
        Q3 = data[feature].quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound= Q1 - (1.5 * IQR)
        upper_bound = Q3 - (1.5 * IQR)
        data[feature] = np.where((data[feature] < lower_bound) | (data[feature] > upper_bound),
                                 data[feature].mean(), data[feature])
   
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data[numeric_features])
    data[numeric_features] = scaler.transform(data[numeric_features])
    #Handle missing values in categorical features
    data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])

    return data

In [24]:
data = pd.read_csv("data.csv")

print("Original Data:")
print(data)

Original Data:
   NumericFeature1  NumericFeature2 CategoricalFeature
0              1.0                7                  A
1              2.0                8                  B
2              NaN                9                NaN
3              4.0               10                  A
4              5.0               11                  B
5              6.0               50                  C


In [25]:
data.isnull()

Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,False,False,False
1,False,False,False
2,True,False,True
3,False,False,False
4,False,False,False
5,False,False,False


In [26]:
#Perform data preprocessingdata_preprocessing_pipeline(data)
cleaned_data = data_preprocessing_pipeline(data)

print("Preprocessed Data:")
print(cleaned_data)

Preprocessed Data:
   NumericFeature1  NumericFeature2 CategoricalFeature
0        -2.236068        -2.236068                  A
1         0.447214         0.447214                  B
2         0.447214         0.447214                  A
3         0.447214         0.447214                  A
4         0.447214         0.447214                  B
5         0.447214         0.447214                  C
