In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

In [None]:

class DataFrameTransform:
    """Imputation, skew-reducing transforms and IQR outlier removal for a DataFrame.

    The ``impute_*`` helpers are static: they take a DataFrame and return a
    (possibly copied) DataFrame without touching any instance state.

    The ``*_tf`` and ``rem_*_out`` methods are instance methods that operate on
    ``self.df``.  The transforms assign the new column back into ``self.df``;
    the outlier-removal methods rebind ``self.df`` to a filtered copy.  Use
    :meth:`get_dataframe` to retrieve the current frame.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        # Stored by reference (no copy): until an outlier-removal method
        # rebinds ``self.df``, the transforms write into the caller's object.
        self.df = df

    @staticmethod
    def _is_real_numeric(series: pd.Series) -> bool:
        """Return True for NumPy integer / unsigned / float columns of any width."""
        dtype = series.dtype
        # ``kind`` is checked only on NumPy dtypes so extension dtypes (nullable
        # ints, categoricals, ...) keep being skipped, as they were before.
        return isinstance(dtype, np.dtype) and dtype.kind in "iuf"

    def _plot_distribution(self, column_name: str, title: str) -> None:
        """Show a histogram with KDE overlay of ``self.df[column_name]``."""
        sns.histplot(self.df[column_name], kde=True)
        plt.title(title)
        plt.xlabel(column_name)
        plt.ylabel("Frequency")
        plt.show()

    @staticmethod
    def impute_mean(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
        """Fill nulls in a numeric column with the column mean.

        Returns a modified copy; ``df`` itself is returned unchanged when the
        column is missing or is not a NumPy int/uint/float column.
        """
        if column_name not in df.columns:
            return df
        if not DataFrameTransform._is_real_numeric(df[column_name]):
            return df
        mean_value = df[column_name].mean()
        df = df.copy()  # never mutate the caller's frame
        df.loc[df[column_name].isna(), column_name] = mean_value
        print(f"Imputed mean value {mean_value:.2f} into null values of '{column_name}'")
        return df

    @staticmethod
    def impute_median(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
        """Fill nulls in a numeric column with the column median.

        Returns a modified copy; ``df`` itself is returned unchanged when the
        column is missing or is not a NumPy int/uint/float column.
        """
        if column_name not in df.columns:
            return df
        if not DataFrameTransform._is_real_numeric(df[column_name]):
            return df
        median_value = df[column_name].median()
        df = df.copy()  # never mutate the caller's frame
        df.loc[df[column_name].isna(), column_name] = median_value
        print(f"Imputed median value {median_value:.2f} into null values of '{column_name}'")
        return df

    @staticmethod
    def impute_mode(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
        """Fill nulls in a column (any dtype) with its most frequent value.

        When several values tie, the first one returned by ``Series.mode`` is
        used.  ``df`` is returned unchanged when the column is missing or has
        no mode (e.g. it is entirely null).
        """
        if column_name not in df.columns:
            return df
        modes = df[column_name].mode()
        if modes.empty:
            return df
        mode_value = modes.iloc[0]
        df = df.copy()  # never mutate the caller's frame
        df.loc[df[column_name].isna(), column_name] = mode_value
        print(f"Imputed mode value '{mode_value}' into null values of '{column_name}'")
        return df

    def log_tf(self, column_name: str) -> None:
        """Replace the column with ``log(1 + x)`` and plot the result.

        Requires every value to be ``>= 0``.  A column containing NaN also fails
        this check, because comparisons against NaN evaluate to False.
        """
        if column_name in self.df.columns and (self.df[column_name] >= 0).all():
            # Vectorised log(1 + x); avoids a Python-level call per element.
            self.df[column_name] = np.log1p(self.df[column_name])
            print(f"Applied log transformation to '{column_name}'.")
            self._plot_distribution(column_name, f"Log Transform of {column_name}")
        else:
            print(f"Cannot apply log transformation to '{column_name}'. Ensure there are no negative values.")

    def sqrt_tf(self, column_name: str) -> None:
        """Replace the column with its square root and plot the result.

        Requires every value to be ``>= 0``.  A column containing NaN also fails
        this check, because comparisons against NaN evaluate to False.
        """
        if column_name in self.df.columns and (self.df[column_name] >= 0).all():
            self.df[column_name] = np.sqrt(self.df[column_name])
            print(f"Applied square root transformation to '{column_name}'.")
            self._plot_distribution(column_name, f"Square Root Transform of {column_name}")
        else:
            print(f"Cannot apply square root transformation to '{column_name}'. Ensure all values are non-negative.")

    def bxcx_tf(self, column_name: str) -> None:
        """Replace the column with its Box-Cox transform and plot the result.

        Requires every value to be strictly positive.  The fitted lambda
        returned by ``scipy.stats.boxcox`` is discarded.
        """
        if column_name in self.df.columns and (self.df[column_name] > 0).all():
            transformed, _ = stats.boxcox(self.df[column_name])
            self.df[column_name] = transformed
            print(f"Applied Box-Cox transformation to '{column_name}'.")
            self._plot_distribution(column_name, f"Box-Cox Transform of {column_name}")
        else:
            print(f"Cannot apply Box-Cox transformation to '{column_name}'. Ensure all values are positive.")

    def yeoj_tf(self, column_name: str) -> None:
        """Replace the column with its Yeo-Johnson transform and plot the result.

        No sign restriction is checked.  The fitted lambda returned by
        ``scipy.stats.yeojohnson`` is discarded.
        """
        if column_name in self.df.columns:
            transformed, _ = stats.yeojohnson(self.df[column_name])
            self.df[column_name] = transformed
            print(f"Applied Yeo-Johnson transformation to '{column_name}'.")
            self._plot_distribution(column_name, f"Yeo-Johnson Transform of {column_name}")
        else:
            print(f"Cannot apply Yeo-Johnson transformation to '{column_name}'.")

    def rem_num_out(self, iqr_multiplier: float = 1.5) -> None:
        """Drop rows holding an IQR outlier in any numeric column.

        For each numeric column a value is kept when it lies inside
        ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``.  Missing
        values are kept: they are not outliers, and a plain comparison would
        silently discard them.  ``self.df`` is rebound to the filtered copy.
        """
        keep = pd.Series(True, index=self.df.index)  # one flag per row, shared by all columns
        for col in self.df.select_dtypes(include=[np.number]).columns:
            values = self.df[col]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            spread = iqr_multiplier * (q3 - q1)
            in_range = (values >= q1 - spread) & (values <= q3 + spread)
            keep &= in_range | values.isna()
        # Explicit copy so later column assignments (the *_tf methods) do not
        # raise SettingWithCopyWarning on a filtered frame.
        self.df = self.df.loc[keep].copy()
        print("Removed numeric outliers.")

    def rem_dtme_out(self, iqr_multiplier: float = 1.5) -> None:
        """Drop rows holding an IQR outlier in any datetime column.

        Same rule as :meth:`rem_num_out`, with the IQR being a ``Timedelta``.
        Rows whose datetime is missing (NaT) are kept.  ``self.df`` is rebound
        to the filtered copy.
        """
        keep = pd.Series(True, index=self.df.index)
        for col in self.df.select_dtypes(include=["datetime64[ns]"]).columns:
            values = self.df[col]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            # Expressing the IQR in 30-day units and converting back to days
            # cancels out, so the bound offset is simply multiplier * IQR.
            spread = iqr_multiplier * (q3 - q1)
            in_range = (values >= q1 - spread) & (values <= q3 + spread)
            keep &= in_range | values.isna()
        self.df = self.df.loc[keep].copy()
        print("Removed datetime outliers using ~months (30 days) as the unit.")

    def get_dataframe(self) -> pd.DataFrame:
        """Return the current DataFrame, reflecting all transforms and filters so far."""
        return self.df


I have left notes in the code, but here is a more detailed explanation. Instance methods are used rather than @staticmethod functions, as I want the df held by a class instance to be modified directly. In each method a mask is created as a boolean Series. In its parameters, 'True' dictates that all values are initially set to True. Setting the mask's index to match the df index means the mask lines up row-for-row with every column, so a single mask can be updated by each column in turn rather than building a separate mask per column, which would complicate the code.

In rem_num_out, the NumPy number dtype (np.number) is used to select all numeric columns, whereas in rem_dtme_out only the pandas 'datetime64' dtype with nanosecond precision is used. The quantiles and the calculation of the IQR are self-explanatory. I used 1.5 × IQR as the definition of outliers for both functions. As for using 30 days for datetime, I thought this was more appropriate than using days or seconds, as all datetime values in the df are in the format of years and months. As months are of unequal length, 30 days was used as an approximation. (Note that because the IQR is divided by 30 days and then multiplied back by 30 days, the resulting bounds are the same as Q1 − 1.5 × IQR and Q3 + 1.5 × IQR.) After the &= operator is applied, both the lower-bound and upper-bound thresholds for each column have been combined into the mask. The mask is then applied to the df.