In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from joblib import Parallel, delayed

# Custom Imputer for handling missing values with specified strategies
class CustomImputer(BaseEstimator, TransformerMixin):
    """Impute missing values column by column with per-column strategies.

    Parameters
    ----------
    strategy_dict : dict or None, default=None
        Maps a column name to a ``SimpleImputer`` strategy. Only ``'mean'``,
        ``'median'`` and ``'most_frequent'`` are acted on; entries with any
        other strategy are skipped. ``None`` means "impute nothing".

    Attributes
    ----------
    impute_models : dict
        Fitted ``SimpleImputer`` per column, populated by :meth:`fit`.
    """

    _SUPPORTED_STRATEGIES = ('mean', 'median', 'most_frequent')

    def __init__(self, strategy_dict=None):
        # Store the argument untouched: sklearn's clone() rebuilds the
        # estimator from get_params() and rejects constructors that replace
        # a parameter with a different object (as ``strategy_dict or {}``
        # does for None / empty input).
        self.strategy_dict = strategy_dict

    def fit(self, X, y=None):
        """Fit one ``SimpleImputer`` per configured column.

        ``X`` must support ``X[[column]]`` selection (e.g. a pandas
        DataFrame); a configured column missing from ``X`` raises
        ``KeyError``. ``y`` is ignored. Returns ``self``.
        """
        strategies = self.strategy_dict or {}
        self.impute_models = {}
        for column, strategy in strategies.items():
            if strategy not in self._SUPPORTED_STRATEGIES:
                continue
            imputer = SimpleImputer(strategy=strategy)
            imputer.fit(X[[column]])
            self.impute_models[column] = imputer
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns imputed.

        The input is left unmodified; columns without a fitted imputer are
        returned as they are.
        """
        # Work on a copy so the caller's frame is not mutated as a side effect.
        X = X.copy()
        for column, imputer in self.impute_models.items():
            # SimpleImputer returns a 2-D (n, 1) result; flatten it so the
            # assignment targets a single column unambiguously.
            X[column] = np.asarray(imputer.transform(X[[column]])).ravel()
        return X

# OutlierHandler for detecting and handling outliers
class OutlierHandler(BaseEstimator, TransformerMixin):
    """Drop rows that an ``IsolationForest`` flags as outliers.

    Parameters
    ----------
    contamination : float, default=0.01
        Passed to ``IsolationForest`` as ``contamination``.
    random_state : int, RandomState instance or None, default=None
        Passed to ``IsolationForest`` as ``random_state``. Set it to an int
        to make the set of removed rows reproducible between runs.

    Attributes
    ----------
    model : IsolationForest
        The fitted detector, set by :meth:`fit`.
    """

    def __init__(self, contamination=0.01, random_state=None):
        self.contamination = contamination
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the isolation forest on ``X``; ``y`` is ignored. Returns ``self``."""
        self.model = IsolationForest(
            contamination=self.contamination,
            random_state=self.random_state,
        )
        self.model.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the rows not predicted as outliers.

        Note that this changes the number of rows: the result can be shorter
        than the input.
        """
        # IsolationForest.predict labels outliers with -1.
        keep = self.model.predict(X) != -1
        return X[keep]

# Data visualization function for missing values
def plot_missing_values(df):
    """Display a heatmap showing which cells of ``df`` are null.

    Draws ``df.isnull()`` on a 10x6 inch figure and shows it; nothing is
    returned.
    """
    null_mask = df.isnull()
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(null_mask, cmap='viridis', cbar=False, ax=ax)
    ax.set_title('Missing Values Heatmap')
    plt.show()

# Main cleaning function
def clean_dataset(file_path, cleaned_file_path, strategy_dict=None, remove_outliers=True):
    """Load a CSV, impute and one-hot encode it, optionally drop outliers, and save it.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    cleaned_file_path : str or path-like
        Destination CSV for the cleaned data (written without the index).
    strategy_dict : dict or None, default=None
        Column name -> imputation strategy, forwarded to ``CustomImputer``
        for the numeric columns. When empty or ``None`` the numeric columns
        are passed through unchanged.
    remove_outliers : bool, default=True
        When true, append an ``OutlierHandler`` step that removes rows.

    Notes
    -----
    Numeric columns keep their names. Each object/category column is first
    filled with the constant ``'missing'`` and then replaced by one column
    per category, named ``<column>_<category>``. Columns of any other dtype
    are not selected by either transformer and, under ``ColumnTransformer``'s
    default ``remainder='drop'``, do not appear in the output.
    """
    df = pd.read_csv(file_path)
    plot_missing_values(df)

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    if strategy_dict:
        numeric_transformer = Pipeline(steps=[
            ('imputer', CustomImputer(strategy_dict=strategy_dict))
        ])
    else:
        # With nothing to impute, pass the numeric columns through unchanged
        # instead of building an imputer with an empty configuration.
        numeric_transformer = 'passthrough'

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols),
        ],
        # Always return a dense array: the one-hot encoder may emit a sparse
        # matrix, which cannot be wrapped in a labelled DataFrame below.
        sparse_threshold=0,
    )

    cleaning_steps = [('preprocessor', preprocessor)]

    if remove_outliers:
        cleaning_steps.append(('outliers', OutlierHandler()))

    cleaning_pipeline = Pipeline(steps=cleaning_steps)
    cleaned_values = cleaning_pipeline.fit_transform(df)

    # One-hot encoding expands each categorical column into one column per
    # category, so the output columns must be named from the fitted encoder
    # rather than from the input frame.
    feature_names = list(numeric_cols)
    if len(categorical_cols) > 0:
        fitted_preprocessor = cleaning_pipeline.named_steps['preprocessor']
        onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
        for column, categories in zip(categorical_cols, onehot.categories_):
            feature_names.extend(f"{column}_{category}" for category in categories)

    df_cleaned = pd.DataFrame(cleaned_values, columns=feature_names)

    df_cleaned.to_csv(cleaned_file_path, index=False)
    print("Dataset cleaned and saved to:", cleaned_file_path)

# Example usage
if __name__ == "__main__":
    # Impute 'Age' with the column mean and 'Salary' with the column median.
    strategy_dict = dict(Age='mean', Salary='median')
    clean_dataset(
        file_path="path/to/your/dataset.csv",
        cleaned_file_path="path/to/your/cleaned_dataset.csv",
        strategy_dict=strategy_dict,
        remove_outliers=True,
    )


Notes for Deployment:

Customization Required: Before deployment, review and customize the clean_dataset function, especially the handling of numeric and categorical columns, to match your dataset's specifics.

Strategy Dictionary: The strategy_dict parameter of CustomImputer maps column names to imputation strategies, so different columns can be imputed differently. The supported strategies are 'mean', 'median', and 'most_frequent'. Adjust the mapping to match your dataset's columns and characteristics.

Outlier Removal: The OutlierHandler uses Isolation Forest to detect outliers and removes the rows it flags. Its contamination parameter (default 0.01) sets the expected proportion of outliers and can be adjusted for your dataset.

Data Visualization: The plot_missing_values function provides a visual overview of missing data. This step can be skipped or enhanced with additional visualizations for a more in-depth analysis.

Parallel Processing: The code does not currently run anything in parallel; joblib's Parallel and delayed are imported but unused. For very large datasets, further optimization using parallel processing (e.g., with joblib) could be explored.