## **Are you ready to prepare the data?**

* To make sure the data is clean and in good shape for an ML algorithm, you need to follow these steps:
 
 * **1. Data cleaning:**
 
   * Fix or remove outliers (optional).
   * Fill in missing values (e.g., with zero, mean, median…) or drop their rows (or columns).
   #

* **2. Feature selection (optional):**
  * Drop the attributes that provide no useful information for the task.
  #
*  **3. Feature engineering, where appropriate:**
  * Discretize continuous features
  * Decompose features (e.g., categorical, date/time, etc.).
  * Add promising transformations of features (e.g., log(x), sqrt(x), x², etc.).
  * Aggregate features into promising new features.
    
#
*  **4. Feature scaling:**
  * Standardize or normalize features
  #

## **1. Data Cleaning**

In [1]:
import numpy as np

# Clean a DataFrame: optionally mask IQR outliers, then impute or drop missing values.
def clean_data(dataframe, outliers=True, fill_missing='mean'):
    """
    Perform data cleaning on the DataFrame.

    Steps, in order:
      1. If ``outliers`` is true, every numeric value outside
         ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of its own column is replaced
         by NaN (rows are not removed at this stage).
      2. Missing values in object-dtype columns are filled with the column's
         most frequent value.
      3. Remaining missing values are handled according to ``fill_missing``.

    Parameters:
        dataframe (pd.DataFrame): The DataFrame to clean. It is not modified.
        outliers (bool): Whether to perform outlier detection and masking using IQR.
        fill_missing (str or numeric): Method to fill missing values ('zero', 'mean', 'median', 'drop'),
                                        or a numeric value for custom imputation.
                                        'drop' removes every row that still contains a missing value;
                                        the other options only fill numeric columns.

    Returns:
        pd.DataFrame: Cleaned DataFrame, or None if an error occurred
        (the error is printed, including an unknown ``fill_missing`` value).
    """
    # Each named strategy maps the numeric sub-frame to the value(s) used to fill it:
    # a scalar for 'zero', a per-column Series for 'mean' and 'median'.
    impute_strategies = {
        'zero': lambda frame: 0,
        'mean': lambda frame: frame.mean(),
        'median': lambda frame: frame.median(),
    }

    try:
        cleaned_df = dataframe.copy()  # Create a copy to keep the original data
        numeric_columns = cleaned_df.select_dtypes(include=[np.number]).columns
        has_numeric = len(numeric_columns) > 0

        # Mask outliers in numeric columns. The comparison aligns the bound
        # Series with the frame's columns, so each column uses its own bounds.
        if outliers and has_numeric:
            numeric = cleaned_df[numeric_columns]
            q1 = numeric.quantile(0.25)
            q3 = numeric.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            within_bounds = (numeric >= lower_bound) & (numeric <= upper_bound)
            # Values failing the test (including existing NaN) become NaN.
            cleaned_df[numeric_columns] = numeric.where(within_bounds)

        # Impute missing values in categorical columns with the most frequent value
        categorical_columns = cleaned_df.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            modes = cleaned_df[col].mode()
            # An all-missing column has no mode; leave it untouched.
            if modes.empty:
                continue
            # Assign back instead of using inplace=True on a column selection,
            # which does not reliably update the parent frame.
            cleaned_df[col] = cleaned_df[col].fillna(modes.iloc[0])

        if fill_missing == 'drop':
            # Drop every row that still has a missing value in any column
            cleaned_df = cleaned_df.dropna()
        elif isinstance(fill_missing, (int, float, np.number)):
            # Impute numeric columns with the given constant
            if has_numeric:
                cleaned_df[numeric_columns] = cleaned_df[numeric_columns].fillna(fill_missing)
        elif fill_missing in impute_strategies:
            # Impute numeric columns with zero, mean or median
            if has_numeric:
                numeric = cleaned_df[numeric_columns]
                fill_value = impute_strategies[fill_missing](numeric)
                cleaned_df[numeric_columns] = numeric.fillna(fill_value)
        else:
            raise ValueError("Wrong, missing imputation strategy! Please enter one of these options: ('zero', 'mean', 'median', 'drop', number)")

    except Exception as e:
        # Callers rely on a None return to signal failure, so report and swallow.
        print("An error occurred:", e)
        return None

    return cleaned_df


d:\conda\envs\tensorflow\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
d:\conda\envs\tensorflow\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


## **2, 3. Feature Selection and Engineering (optional)**

In [None]:
# Drop or add columns as needed

## **4. Feature Scaling and encoding - preprocessing**


In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import joblib

def preprocess_data(df, save_pipeline=False, pipeline_filename='preprocessing_pipeline.pkl'):
    """
    Preprocesses a DataFrame by performing one-hot encoding on categorical columns
    and scaling numerical columns.

    Object-dtype columns are one-hot encoded, numeric columns are standardized,
    and any other column is passed through unchanged. The pipeline is fitted on
    the given data before transforming it.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the data. It is not modified.
        save_pipeline (bool): Whether to save the preprocessing pipeline to a file.
        pipeline_filename (str): Filename to save the preprocessing pipeline (if save_pipeline is True).

    Returns:
        pd.DataFrame: A new DataFrame with the categorical columns one-hot encoded
                    and the numerical columns scaled, or None if an error occurs
                    (the error is printed). Column names come from the column
                    transformer; the result has a fresh default index rather
                    than the index of ``df``.
    """
    try:
        data = df.copy()  # Work on a copy to keep the original data

        # Separate categorical and numerical columns
        categorical_columns = data.select_dtypes(include=['object']).columns
        numeric_columns = data.select_dtypes(include=[np.number]).columns

        # OneHotEncoder rather than an ordinal encoder: the categorical columns
        # are not ordered. StandardScaler rather than min-max scaling, which is
        # only appropriate when the data is known to be free of outliers.
        transformers = [
            ('categorical', OneHotEncoder(), categorical_columns),
            ('numerical', StandardScaler(), numeric_columns),
        ]

        # sparse_threshold=0 forces a dense result. With the default threshold a
        # mostly-zero one-hot encoding comes back as a sparse matrix, which
        # cannot be turned into a DataFrame with named columns below.
        ct = ColumnTransformer(transformers, remainder='passthrough', sparse_threshold=0)

        # Wrap the column transformer in a pipeline so it can be saved and reused
        pipeline = Pipeline(steps=[('preprocessor', ct)])

        # Fit and transform the data using the pipeline
        processed_data = pipeline.fit_transform(data)

        # Convert the processed data array back to a DataFrame
        feature_names = ct.get_feature_names_out(input_features=data.columns)
        processed_df = pd.DataFrame(processed_data, columns=feature_names)

        # Save the fitted pipeline if requested
        if save_pipeline:
            joblib.dump(pipeline, pipeline_filename)
            print("Pipeline saved as", pipeline_filename)

        return processed_df

    except Exception as e:
        # Callers rely on a None return to signal failure, so report and swallow.
        print("An error occurred during preprocessing:", e)
        return None

