# Data Preprocessing
##### It involves transforming and manipulating raw data to improve its quality, consistency, and relevance for analysis. 
##### It encompasses several tasks
   <li>including handling missing values.
   <li>standardizing variables.
   <li>removing outliers.

In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler

In [23]:
def data_pre_processing_pipeline(input_data):
    
    # feature means input data
    numeric_features = input_data.select_dtypes(include=['float','int']).columns
#     Selecting numeric features:
#     The select_dtypes method is used to select columns that have data types of either 'float' or 'int'. The result is stored in the numeric_features variable.
    catagorical_features = input_data.select_dtypes(include=['object']).columns
#     categorical features:
#     The select_dtypes method is used again, this time to select columns that have the data type 'object'. The result is stored in the categorical_features variable.
    
    # Handling missing values in numeric features
    input_data[numeric_features] = input_data[numeric_features].fillna(input_data[numeric_features].mean())
    # filling the missing value with mean
    
    # Detecting and handling missing value in the numeric_features using IQR 
    # as we are working with every column, so we have to eliminate outliers of every 
    # column in the data set so we are using loop to iterate through every columns
    
    for feature in numeric_features:
        Q1 = input_data[feature].quantile(0.25) # taking 25% of every feature
        Q3 = input_data[feature].quantile(0.75) # taking 75% of every feature
        IQR = Q3 - Q1
        lower_bound = Q1 - (1.5*IQR)
        upper_bound = Q3 + (1.5*IQR)
        input_data[feature] = np.where((input_data[feature]<lower_bound) | (input_data[feature]>upper_bound),
                                       input_data[feature].mean,# if there is outlier then,those will be filled by the mean 
                                       input_data[feature] #if there is no outlier,then the data will not be changed
                                      )
       
    # np.where - it is a numpy funtion
    # Condition: The Boolean mask identifying outliers.
    # Value if condition is True: input_data[feature].mean(), meaning if the condition is True (a value is an outlier), replace it with the mean of the feature column.
    # Value if condition is False: input_data[feature], meaning if the condition is False (a value is within the bounds), keep the original value.
        
        #Normalizing numeric features
        scaler = StandardScaler() # Standardize features by removing the mean and scaling to unit variance. uses Z score
        scaled_data = scaler.fit_transform(input_data[numeric_features])
        input_data[numeric_features] = scaler.transform(input_data[numeric_features])
#         In essence, above three line:
#         Standardizes the numeric features in a DataFrame using the StandardScaler.
#         Ensures each feature has a mean of 0 and a standard deviation of 1.
#         Helps improve the performance and convergence of many machine learning algorithms, as features are on a comparable scale.
        
        #Handling missing values in categorical features
        input_data[catagorical_features] = input_data[catagorical_features].fillna(input_data[catagorical_features].mode().iloc[0])
        
        
        return input_data

In [26]:
data = pd.read_csv('/Users/admin/PycharmProjects/About_DataScience/excel_data_sets/dip_Stat.csv')
print("Orginal Data")
data

Orginal Data


Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,1.0,7,A
1,2.0,8,B
2,,9,
3,4.0,10,A
4,5.0,11,B
5,6.0,50,C


In [32]:
data.describe()

Unnamed: 0,NumericFeature1,NumericFeature2
count,6.0,6.0
mean,-3.700743e-17,0.0
std,1.095445,1.095445
min,-1.535624,-0.576053
25%,-0.7087493,-0.494536
50%,0.1181249,-0.413019
75%,0.6792181,-0.331502
max,1.417499,2.228129


In [27]:
# performing pre-processing
cleaned_data = data_pre_processing_pipeline(data)
print("Preprocessed Data:")
print(cleaned_data)

Preprocessed Data:
   NumericFeature1  NumericFeature2 CategoricalFeature
0        -1.535624        -0.576053                  A
1        -0.944999        -0.510839                  B
2         0.000000        -0.445626                  A
3         0.236250        -0.380412                  A
4         0.826874        -0.315199                  B
5         1.417499         2.228129                  C


In [31]:
cleaned_data.describe()

Unnamed: 0,NumericFeature1,NumericFeature2
count,6.0,6.0
mean,-3.700743e-17,0.0
std,1.095445,1.095445
min,-1.535624,-0.576053
25%,-0.7087493,-0.494536
50%,0.1181249,-0.413019
75%,0.6792181,-0.331502
max,1.417499,2.228129
