In [None]:
class UnmovedFlowV2:
    def __init__(self):
        """
        Reset every workflow attribute and load the libraries the other methods use.

        The imports run here instead of at module level, and each imported
        name is declared global so the rest of the class can reference it
        directly (pd, plt, PCA, train_test_split, ...). Python warnings are
        switched off as the final step.
        """
        # Names bound at module scope by the imports further down
        global pd, np
        global sns, plt, px, go
        global make_classification, make_regression
        global train_test_split
        global SimpleImputer, KNNImputer
        global PCA, TSNE, umap
        global StandardScaler, MinMaxScaler
        global LogisticRegression, LinearRegression
        global RandomForestClassifier, RandomForestRegressor
        global XGBClassifier, XGBRegressor
        global VotingClassifier, VotingRegressor
        global StackingClassifier, StackingRegressor
        global OneHotEncoder, TargetEncoder
        global PowerTransformer, FunctionTransformer
        global iqr, zscore, IsolationForest, LocalOutlierFactor
        global cross_val_score, GridSearchCV
        global mean_squared_error, mean_absolute_error, r2_score
        global accuracy_score, precision_score, recall_score, f1_score
        global roc_auc_score, roc_curve, auc, classification_report
        global warnings

        # Problem description and data
        self.problem_type = self.target = self.full_dataframe = None
        # Individual models and ensembles
        self.model1 = self.model2 = self.model3 = None
        self.Ensemble1 = self.Ensemble2 = None
        # Default seed used by methods that accept a random_state
        self.random_state = None
        # Train / test split
        self.X_train = self.X_test = self.y_train = self.y_test = None

        # Basic manipulation
        import pandas as pd
        import numpy as np

        # Visualization
        import seaborn as sns
        import matplotlib.pyplot as plt
        import plotly.express as px
        import plotly.graph_objects as go

        # Sample data generators
        from sklearn.datasets import make_classification, make_regression

        # Train / test splitting
        from sklearn.model_selection import train_test_split

        # Missing-value imputation
        from sklearn.impute import SimpleImputer, KNNImputer

        # Feature extraction / dimensionality reduction
        from sklearn.decomposition import PCA
        from sklearn.manifold import TSNE
        import umap

        # Scaling
        from sklearn.preprocessing import StandardScaler, MinMaxScaler

        # Models and ensembles
        from sklearn.linear_model import LogisticRegression, LinearRegression
        from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
        from xgboost import XGBClassifier, XGBRegressor
        from sklearn.ensemble import (
            VotingClassifier,
            VotingRegressor,
            StackingClassifier,
            StackingRegressor,
        )

        # One-hot and target encoding
        from category_encoders import TargetEncoder
        from sklearn.preprocessing import OneHotEncoder

        # Transforms
        from sklearn.preprocessing import PowerTransformer, FunctionTransformer

        # Outlier removal
        from scipy.stats import iqr, zscore
        from sklearn.ensemble import IsolationForest
        from sklearn.neighbors import LocalOutlierFactor

        # Cross-validation and hyperparameter search
        from sklearn.model_selection import cross_val_score, GridSearchCV

        # Evaluation metrics: regression first, then classification (including the ROC curve helpers)
        from sklearn.metrics import (
            mean_squared_error,
            mean_absolute_error,
            r2_score,
            accuracy_score,
            precision_score,
            recall_score,
            f1_score,
            roc_auc_score,
            roc_curve,
            auc,
            classification_report,
        )

        # Hide warnings
        import warnings
        warnings.filterwarnings('ignore')


    # Split the full dataframe into training and testing data and store the result in the training and testing data attributes
    def split_data(self, test_size=0.2, random_state=None, stratify=None, target=None):
        """
        Split the full dataframe into training and testing data.
    
        Parameters:
        - test_size: The proportion of the dataset to include in the test split.
        - random_state: The seed used by the random number generator. this should use the attribute of the object.
        - stratify: If not None, data is split in a stratified fashion, using this as the class labels.
        - target: The target variable to stratify by.
        """
        # If a random state is not provided, use the one from the object, which is itself None by default
        if random_state is None:
            random_state = self.random_state

        # If the target variable is not provided, use the one from the object, which is itself None by default
        if target is None:
            target = self.target

        # If stratify is None, split the data without stratification
        if stratify is None:
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.full_dataframe.drop(target, axis=1), self.full_dataframe[target], test_size=test_size, random_state=random_state)
        # Otherwise, split the data with stratification
        else:
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.full_dataframe.drop(target, axis=1), self.full_dataframe[target], test_size=test_size, random_state=random_state, stratify=self.full_dataframe[stratify])

        # Print message
        print("Data split successful into attributes X_train, X_test, y_train, and y_test")

    # Generate sample data with make_classification or make_regression, let the user set the common parameters of those functions, and update the full_dataframe attribute
    def use_sample_data(self, n_samples=1000, n_features=10, n_informative=5, n_redundant=2, random_state=None):
        """
        Fill the object with a synthetic dataset.

        The generator is chosen from the problem_type attribute:
        make_regression for 'regression', make_classification otherwise.
        The result is stored in full_dataframe with columns feature_1 ..
        feature_<n_features> plus a 'target' column, and the target
        attribute is set to 'target'.

        Parameters:
        - n_samples: The number of samples to generate.
        - n_features: The number of features to generate.
        - n_informative: The number of informative features to generate.
        - n_redundant: The number of redundant features to generate (only passed to make_classification).
        - random_state: The random state to use for reproducibility. When None, the object's random_state attribute is used.
        """
        # Prefer the explicit seed, otherwise the one stored on the object
        seed = self.random_state if random_state is None else random_state

        # Default the problem type to classification and tell the user about it
        if self.problem_type is None:
            self.problem_type = 'classification'
            print("Note: Problem type not provided, defaulting to classification (to specify, set classification or regression in the problem_type attribute)")

        # Arguments both generators accept
        shared_kwargs = dict(
            n_samples=n_samples,
            n_features=n_features,
            n_informative=n_informative,
            random_state=seed,
        )
        if self.problem_type != 'regression':
            features, labels = make_classification(n_redundant=n_redundant, **shared_kwargs)
        else:
            features, labels = make_regression(**shared_kwargs)

        # Wrap the arrays in a dataframe and record which column is the target
        feature_names = [f"feature_{number}" for number in range(1, n_features + 1)]
        self.full_dataframe = pd.DataFrame(features, columns=feature_names)
        self.full_dataframe['target'] = labels
        self.target = 'target'

    # Define a function to plot the correlation heatmap of the full dataframe and the training and testing data if they exist
    def relationship_heatmap(self, type='correlation', annot=True, cmap='seismic', center=0, figsize_x=10, figsize_y=8):
        """
        Display a heatmap of the correlation matrix, ensure each square shows a number.
    
        Parameters:
        - type: The type of relationship to plot, either 'correlation' or 'covariance'.
        - annot: If True, write the data value in each cell.
        - cmap: The mapping from data values to color space.
        - center: The value at which to center the colormap when plotting divergent data.
        - figsize_x: Width of the figure.
        - figsize_y: Height of the figure.
        """
        # If the full dataframe is not none plot the correlation heatmap for the full dataframe
        if self.full_dataframe is not None:
            if type == 'correlation':
                print("Correlation Heatmap for the full dataframe")
                plt.figure(figsize=(figsize_x, figsize_y))
                sns.heatmap(self.full_dataframe.corr(), annot=annot, cmap=cmap, center=center)
                plt.show()
            elif type == 'covariance':
                print("Covariance Heatmap for the full dataframe")
                plt.figure(figsize=(figsize_x, figsize_y))
                sns.heatmap(self.full_dataframe.cov(), annot=annot, cmap=cmap, center=center)
                plt.show()

        # If X_train is not none plot the correlation heatmap for X_train
        if self.X_train is not None:
            if type == 'correlation':
                print("Correlation Heatmap for X_train")
                plt.figure(figsize=(figsize_x, figsize_y))
                sns.heatmap(self.X_train.corr(), annot=annot, cmap=cmap, center=center)
                plt.show()
            elif type == 'covariance':
                print("Covariance Heatmap for X_train")
                plt.figure(figsize=(figsize_x, figsize_y))
                sns.heatmap(self.X_train.cov(), annot=annot, cmap=cmap, center=center)
                plt.show()

        # If X_test is not none plot the correlation heatmap for X_test
        if self.X_test is not None:
            if type == 'correlation':
                print("Correlation Heatmap for X_test")
                plt.figure(figsize=(figsize_x, figsize_y))
                sns.heatmap(self.X_test.corr(), annot=annot, cmap=cmap, center=center)
                plt.show()
            elif type == 'covariance':
                print("Covariance Heatmap for X_test")
                plt.figure(figsize=(figsize_x, figsize_y))
                sns.heatmap(self.X_test.cov(), annot=annot, cmap=cmap, center=center)
                plt.show()

    # Define a function for feature extraction / dimensionality reduction
    def extract(self, method='PCA', n_components=2, custom_params={}, exclude_columns=[]):
        """
        Extract the features of / reduce the dimensionality of the data.
    
        Parameters:
        - method: The extraction / dimensionality reduction method to use, either 'PCA', 'TSNE', or 'UMAP'.
        - n_components: The number of components to reduce the data to.
        - custom_params: A dictionary of custom parameters to use.
        - exclude_columns: A list of columns to exclude.
        """

        # Create a PCA, TSNE, and UMAP object
        pca_object = PCA(n_components=n_components, **custom_params)
        tsne_object = TSNE(n_components=n_components, **custom_params)
        umap_object = umap.UMAP(n_components=n_components, **custom_params)

        # Make a temp copy of the target column if it exists
        if self.target is not None:
            temp_target = self.full_dataframe[self.target].copy()

        # If the full dataframe is not none, extract features / reduce dimensionality
        if self.full_dataframe is not None:
            if self.target is None:
                if method == 'PCA':
                    self.full_dataframe = pd.DataFrame(pca_object.fit_transform(self.full_dataframe.drop(exclude_columns, axis=1)), columns=[f"component_{i}" for i in range(1, n_components+1)])
                elif method == 'TSNE':
                    self.full_dataframe = pd.DataFrame(tsne_object.fit_transform(self.full_dataframe.drop(exclude_columns, axis=1)), columns=[f"component_{i}" for i in range(1, n_components+1)])
                elif method == 'UMAP':
                    self.full_dataframe = pd.DataFrame(umap_object.fit_transform(self.full_dataframe.drop(exclude_columns, axis=1)), columns=[f"component_{i}" for i in range(1, n_components+1)])
            else:
                if method == 'PCA':
                    self.full_dataframe = pd.DataFrame(pca_object.fit_transform(self.full_dataframe.drop(exclude_columns+[self.target], axis=1)), columns=[f"component_{i}" for i in range(1, n_components+1)])
                elif method == 'TSNE':
                    self.full_dataframe = pd.DataFrame(tsne_object.fit_transform(self.full_dataframe.drop(exclude_columns+[self.target], axis=1)), columns=[f"component_{i}" for i in range(1, n_components+1)])
                elif method == 'UMAP':
                    self.full_dataframe = pd.DataFrame(umap_object.fit_transform(self.full_dataframe.drop(exclude_columns+[self.target], axis=1)), columns=[f"component_{i}" for i in range(1, n_components+1)])

                # Re-attach the target column
                self.full_dataframe[self.target] = temp_target

        # If X_train is not none, extract features / reduce dimensionality
        if self.X_train is not None:
            if method == 'PCA':
                self.X_train = pd.DataFrame(pca_object.fit_transform(self.X_train), columns=[f"component_{i}" for i in range(1, n_components+1)])
            elif method == 'TSNE':
                self.X_train = pd.DataFrame(tsne_object.fit_transform(self.X_train), columns=[f"component_{i}" for i in range(1, n_components+1)])
            elif method == 'UMAP':
                self.X_train = pd.DataFrame(umap_object.fit_transform(self.X_train), columns=[f"component_{i}" for i in range(1, n_components+1)])

        # If X_test is not none, extract features / reduce dimensionality
        if self.X_test is not None:
            if method == 'PCA':
                self.X_test = pd.DataFrame(pca_object.transform(self.X_test), columns=[f"component_{i}" for i in range(1, n_components+1)])
            elif method == 'TSNE':
                self.X_test = pd.DataFrame(tsne_object.transform(self.X_test), columns=[f"component_{i}" for i in range(1, n_components+1)])
            elif method == 'UMAP':
                self.X_test = pd.DataFrame(umap_object.transform(self.X_test), columns=[f"component_{i}" for i in range(1, n_components+1)])
    
    # Define a function for previewing PCA, TSNE, and UMAP info
    def preview(self, method='PCA'):
        """
        Preview the explained variances and 3D plots of the data.
    
        Parameters:
        - method: The method to use, either 'PCA', 'TSNE', or 'UMAP'.
        """
        # If the method id PCA, first plot the cumulative explained variances of the existing features over time, then into a preview of what a copy of the data reduced to 3D would look like when colored against the target if applicable
        if method == 'PCA':
            if self.full_dataframe is not None:
                if self.target is None:
                    pca_object = PCA()
                    pca_object.fit(self.full_dataframe)
                    explained_variances = pca_object.explained_variance_ratio_
                    cumulative_explained_variances = np.cumsum(explained_variances)
                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(explained_variances)+1)], y=explained_variances, mode='lines+markers', name='Explained Variance'))
                    fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(cumulative_explained_variances)+1)], y=cumulative_explained_variances, mode='lines+markers', name='Cumulative Explained Variance'))
                    fig.update_layout(title='Explained Variances of the Components For the Full Dataframe', xaxis_title='Component', yaxis_title='Explained Variance')
                    fig.show()
                else:
                    pca_object = PCA()
                    pca_object.fit(self.full_dataframe.drop(self.target, axis=1))
                    explained_variances = pca_object.explained_variance_ratio_
                    cumulative_explained_variances = np.cumsum(explained_variances)
                    fig = go.Figure()
                    fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(explained_variances)+1)], y=explained_variances, mode='lines+markers', name='Explained Variance'))
                    fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(cumulative_explained_variances)+1)], y=cumulative_explained_variances, mode='lines+markers', name='Cumulative Explained Variance'))
                    fig.update_layout(title='Explained Variances of the Components', xaxis_title='Component', yaxis_title='Explained Variance')
                    fig.show()
            
            # Now also for the X_train and X_test data if they exist
            if self.X_train is not None:
                pca_object = PCA()
                pca_object.fit(self.X_train)
                explained_variances = pca_object.explained_variance_ratio_
                cumulative_explained_variances = np.cumsum(explained_variances)
                fig = go.Figure()
                fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(explained_variances)+1)], y=explained_variances, mode='lines+markers', name='Explained Variance'))
                fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(cumulative_explained_variances)+1)], y=cumulative_explained_variances, mode='lines+markers', name='Cumulative Explained Variance'))
                fig.update_layout(title='Explained Variances of the Components For X_train', xaxis_title='Component', yaxis_title='Explained Variance')
                fig.show()
                
            if self.X_test is not None:
                pca_object = PCA()
                pca_object.fit(self.X_test)
                explained_variances = pca_object.explained_variance_ratio_
                cumulative_explained_variances = np.cumsum(explained_variances)
                fig = go.Figure()
                fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(explained_variances)+1)], y=explained_variances, mode='lines+markers', name='Explained Variance'))
                fig.add_trace(go.Scatter(x=[f"component_{i}" for i in range(1, len(cumulative_explained_variances)+1)], y=cumulative_explained_variances, mode='lines+markers', name='Cumulative Explained Variance'))
                fig.update_layout(title='Explained Variances of the Components For X_test', xaxis_title='Component', yaxis_title='Explained Variance')
                fig.show() 
                
            # Now for the 3D part of it
            if self.full_dataframe is not None:
                if self.target is not None:
                    pca_object = PCA(n_components=3)
                    pca_data = pca_object.fit_transform(self.full_dataframe.drop(self.target, axis=1))
                    pca_data = pd.DataFrame(pca_data, columns=['component_1', 'component_2', 'component_3'])
                    pca_data['target'] = self.full_dataframe[self.target]
                    fig = px.scatter_3d(pca_data, x='component_1', y='component_2', z='component_3', color='target', title='3D PCA Plot of the Full Dataframe')
                    fig.show()
                else:
                    pca_object = PCA(n_components=3)
                    pca_data = pca_object.fit_transform(self.full_dataframe)
                    pca_data = pd.DataFrame(pca_data, columns=['component_1', 'component_2', 'component_3'])
                    fig = px.scatter_3d(pca_data, x='component_1', y='component_2', z='component_3', title='3D PCA Plot of the Full Dataframe')
                    fig.show()
                    
            # Now also for the X_train and X_test data if they exist, while using y_train for the target coloring, make sure everything is properly aligned so I dont get grey null values on the plot
            if self.X_train is not None:
                pca_object = PCA(n_components=3)
                pca_data = pca_object.fit_transform(self.X_train)
                pca_data = pd.DataFrame(pca_data, columns=['component_1', 'component_2', 'component_3'])
                pca_data['target'] = self.y_train
                fig = px.scatter_3d(pca_data, x='component_1', y='component_2', z='component_3', color='target', title='3D PCA Plot of X_train')
                fig.show()
                
            if self.X_test is not None:
                pca_object = PCA(n_components=3)
                pca_data = pca_object.fit_transform(self.X_test)
                pca_data = pd.DataFrame(pca_data, columns=['component_1', 'component_2', 'component_3'])
                pca_data['target'] = self.y_test
                fig = px.scatter_3d(pca_data, x='component_1', y='component_2', z='component_3', color='target', title='3D PCA Plot of X_test')
                fig.show()
                
    # Define a function for encoding with different options
    def encode(self, method='onehot', columns=[], custom_params={}):
        """
        Encode the data using one-hot or target encoding.
    
        Parameters:
        - method: The encoding method to use, either 'onehot' or 'target'.
        - columns: A list of columns to encode.
        - custom_params: A dictionary of custom parameters to use when encoding the data.
        """
        # Create a one-hot and target encoder
        onehot_encoder = OneHotEncoder(**custom_params)
        target_encoder = TargetEncoder(**custom_params)

        # Save a list of original column names for later use
        temp_original_columns = self.full_dataframe.columns

        # If the full dataframe is not none, encode it
        if self.full_dataframe is not None:
            if method == 'onehot':
                self.full_dataframe = pd.get_dummies(self.full_dataframe, columns=columns)
            elif method == 'target':
                self.full_dataframe.loc[:, columns] = target_encoder.fit_transform(self.full_dataframe[columns], self.full_dataframe[self.target])

        # If X_train is not none, encode it
        if self.X_train is not None:
            if method == 'onehot':
                self.X_train = pd.get_dummies(self.X_train, columns=columns)
            elif method == 'target':
                self.X_train.loc[:, columns] = target_encoder.fit_transform(self.X_train[columns], self.y_train)

        # If X_test is not none, encode it
        if self.X_test is not None:
            if method == 'onehot':
                self.X_test = pd.get_dummies(self.X_test, columns=columns)
            elif method == 'target':
                self.X_test.loc[:, columns] = target_encoder.transform(self.X_test[columns])

        # Ensure if onehot is used that the new columns are integers.
        if method == 'onehot':
            temp_new_columns = self.full_dataframe.columns
            temp_new_columns = [x for x in temp_new_columns if x not in temp_original_columns]
            for col in temp_new_columns:
                self.full_dataframe[col] = self.full_dataframe[col].astype(int)
            if self.X_train is not None:
                temp_new_columns = self.X_train.columns
                temp_new_columns = [x for x in temp_new_columns if x not in temp_original_columns]
                for col in temp_new_columns:
                    self.X_train[col] = self.X_train[col].astype(int)
            if self.X_test is not None:
                temp_new_columns = self.X_test.columns
                temp_new_columns = [x for x in temp_new_columns if x not in temp_original_columns]
                for col in temp_new_columns:
                    self.X_test[col] = self.X_test[col].astype(int)

        # Ensure if onehot is used that the target column is moved back to the last column in the full_dataframe ahead of the new columns
        if method == 'onehot':
            if self.full_dataframe is not None:
                if self.target is not None:
                    temp_target = self.full_dataframe[self.target]
                    self.full_dataframe.drop(self.target, axis=1, inplace=True)
                    self.full_dataframe[self.target] = temp_target

    # Define a function to check the dtypes of the full dataframe and the training and testing data if they exist
    def dtypes(self):
        """
        Check the data types of the full dataframe and the training and testing data if they exist.
        """
        # If the full dataframe is not none, print its data types
        if self.full_dataframe is not None:
            print("Data Types of the Full Dataframe:")
            print(self.full_dataframe.dtypes)

        # If X_train is not none, print its data types
        if self.X_train is not None:
            print("\nData Types of X_train:")
            print(self.X_train.dtypes)

        # If X_test is not none, print its data types
        if self.X_test is not None:
            print("\nData Types of X_test:")
            print(self.X_test.dtypes)

        # If y_train is not none, print its data types
        if self.y_train is not None:
            print("\nData Types of y_train:")
            print(self.y_train.dtypes)

        # If y_test is not none, print its data types
        if self.y_test is not None:
            print("\nData Types of y_test:")
            print(self.y_test.dtypes)

    # Define a function to check the shapes of the full dataframe and the training and testing data if they exist
    def shapes(self):
        """
        Check the shapes of the full dataframe and the training and testing data if they exist.
        """
        # If the full dataframe is not none, print its shape
        if self.full_dataframe is not None:
            print(f"Shape of the Full Dataframe: {self.full_dataframe.shape[0]} rows, {self.full_dataframe.shape[1]} columns")

        # If X_train is not none, print its shape
        if self.X_train is not None:
            print(f"Shape of X_train: {self.X_train.shape[0]} rows, {self.X_train.shape[1]} columns")

        # If X_test is not none, print its shape
        if self.X_test is not None:
            print(f"Shape of X_test: {self.X_test.shape[0]} rows, {self.X_test.shape[1]} columns")

        # If y_train is not none, print its shape
        if self.y_train is not None:
            print(f"Shape of y_train: {self.y_train.shape[0]} rows, 1 column")

        # If y_test is not none, print its shape
        if self.y_test is not None:
            print(f"Shape of y_test: {self.y_test.shape[0]} rows, 1 column")

    # Define a function to make a scaled version of the full dataframe and the training and testing data if they exist
    def scale(self, scaler="standard", exclude_columns=[]):
        """
        Scale the data using either standard or minmax scaling.
    
        Parameters:
        - scaler: The type of scaling to use, either "standard" or "minmax".
        - exclude_columns: A list of columns to exclude from scaling.
        """

        # Create a standard and minmax scaler
        standard_scaler = StandardScaler()
        minmax_scaler = MinMaxScaler()

        # If the full dataframe is not none, scale it, but don't scale the columns in exclude_columns 
        if self.full_dataframe is not None:
            if self.target is None:
                if scaler == "standard":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)] = standard_scaler.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)])
                elif scaler == "minmax":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)] = minmax_scaler.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)])
            else:
                if scaler == "standard":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])] = standard_scaler.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])])
                elif scaler == "minmax":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])] = minmax_scaler.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])])


        # If X_train is not none, scale it, but don't scale the columns in exclude_columns
        if self.X_train is not None:
            if scaler == "standard":
                self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)] = standard_scaler.fit_transform(self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)])
            elif scaler == "minmax":
                self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)] = minmax_scaler.fit_transform(self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)])

        # If X_test is not none, scale it, but don't scale the columns in exclude_columns 
        if self.X_test is not None:
            if scaler == "standard":
                self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)] = standard_scaler.transform(self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)])
            elif scaler == "minmax":
                self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)] = minmax_scaler.transform(self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)])

    # Define a function to impute missing values in the full dataframe and the training and testing data if they exist
    def impute(self, imputer="simple", exclude_columns=[]):
        """
        Impute missing values using either simple or KNN imputation.
    
        Parameters:
        - imputer: The type of imputation to use, either "simple" or "knn".
        - exclude_columns: A list of columns to exclude from imputation.
        """

        # Create a simple and KNN imputer
        simple_imputer = SimpleImputer()
        knn_imputer = KNNImputer()

        # If the full dataframe is not none, impute it, but don't impute the columns in exclude_columns (however make sure they are still in the dataframe)
        if self.full_dataframe is not None:
            if self.target is None:
                if imputer == "simple":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)] = simple_imputer.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)])
                elif imputer == "knn":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)] = knn_imputer.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns)])
            else:
                if imputer == "simple":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])] = simple_imputer.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])])
                elif imputer == "knn":
                    self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])] = knn_imputer.fit_transform(self.full_dataframe.loc[:, self.full_dataframe.columns.difference(exclude_columns+[self.target])])

        # If X_train is not none, impute it, but don't impute the columns in exclude_columns (however make sure they are still in the dataframe)
        if self.X_train is not None:
            if imputer == "simple":
                self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)] = simple_imputer.fit_transform(self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)])
            elif imputer == "knn":
                self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)] = knn_imputer.fit_transform(self.X_train.loc[:, self.X_train.columns.difference(exclude_columns)])

        # If X_test is not none, impute it, but don't impute the columns in exclude_columns (however make sure they are still in the dataframe)
        if self.X_test is not None:
            if imputer == "simple":
                self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)] = simple_imputer.transform(self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)])
            elif imputer == "knn":
                self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)] = knn_imputer.transform(self.X_test.loc[:, self.X_test.columns.difference(exclude_columns)])

    # Define a function to fit model 1
    def fit_model_1(self, custom_params={}):
        """
        Fit a first model to the training data.
    
        Parameters:
        - custom_params: A dictionary of custom parameters to use when fitting the model.
        """
        if self.problem_type == 'classification':
            self.model1 = LogisticRegression(**custom_params)
        else:
            self.model1 = LinearRegression(**custom_params)
        self.model1.fit(self.X_train, self.y_train)

    # Define a function to fit model 2
    def fit_model_2(self, custom_params={}):
        """
        Fit a second model to the training data.
    
        Parameters:
        - custom_params: A dictionary of custom parameters to use when fitting the model.
        """
        if self.problem_type == 'classification':
            self.model2 = RandomForestClassifier(**custom_params)
        else:
            self.model2 = RandomForestRegressor(**custom_params)
        self.model2.fit(self.X_train, self.y_train)

    # Define a function to fit model 3
    def fit_model_3(self, custom_params={}):
        """
        Fit a third model to the training data.
    
        Parameters:
        - custom_params: A dictionary of custom parameters to use when fitting the model.
        """
        if self.problem_type == 'classification':
            self.model3 = XGBClassifier(**custom_params)
        else:
            self.model3 = XGBRegressor(**custom_params)
        self.model3.fit(self.X_train, self.y_train)

    # Define a function to fit model 1 but using cv, but not grid search
    def fit_model_1_cv(self, cv=5, scoring='accuracy'):
        """
        Cross-validate the first model on the training data and print the scores.

        The per-fold scores and their mean are printed; `self.model1` itself
        is not reassigned by this method.

        Parameters:
        - cv: The number of cross-validation folds to use.
        - scoring: The scoring metric to use for cross-validation.
        """
        fold_scores = cross_val_score(
            self.model1,
            self.X_train,
            self.y_train,
            scoring=scoring,
            cv=cv,
        )
        print(f"Cross-Validation Scores: {fold_scores}")
        average_score = fold_scores.mean()
        print(f"Mean Cross-Validation Score: {average_score}")

    # Define a function to fit model 2 but using cv, but not grid search
    def fit_model_2_cv(self, cv=5, scoring='accuracy'):
        """
        Cross-validate the second model on the training data and print the scores.

        The per-fold scores and their mean are printed; `self.model2` itself
        is not reassigned by this method.

        Parameters:
        - cv: The number of cross-validation folds to use.
        - scoring: The scoring metric to use for cross-validation.
        """
        fold_scores = cross_val_score(
            self.model2,
            self.X_train,
            self.y_train,
            scoring=scoring,
            cv=cv,
        )
        print(f"Cross-Validation Scores: {fold_scores}")
        average_score = fold_scores.mean()
        print(f"Mean Cross-Validation Score: {average_score}")

    # Define a function to fit model 3 but using cv, but not grid search
    def fit_model_3_cv(self, cv=5, scoring='accuracy'):
        """
        Cross-validate the third model on the training data and print the scores.

        The per-fold scores and their mean are printed; `self.model3` itself
        is not reassigned by this method.

        Parameters:
        - cv: The number of cross-validation folds to use.
        - scoring: The scoring metric to use for cross-validation.
        """
        fold_scores = cross_val_score(
            self.model3,
            self.X_train,
            self.y_train,
            scoring=scoring,
            cv=cv,
        )
        print(f"Cross-Validation Scores: {fold_scores}")
        average_score = fold_scores.mean()
        print(f"Mean Cross-Validation Score: {average_score}")

    # Define a function to fit model 1 but using cv and grid search to find the best scoring hyperparameters
    def fit_model_1_cv_gs(self, param_grid={}, cv=5, scoring='accuracy'):
        """
        Fit a first model to the training data using cross-validation and grid search.
    
        Parameters:
        - param_grid: A dictionary of hyperparameters to search over.
        - cv: The number of cross-validation folds to use.
        - scoring: The scoring metric to use for grid search.
        """
        grid_search = GridSearchCV(self.model1, param_grid, cv=cv, scoring=scoring)
        grid_search.fit(self.X_train, self.y_train)
        print(f"Best Score: {grid_search.best_score_}")
        print(f"Best Grid Search Hyperparameters: {grid_search.best_params_}")
        self.model1 = grid_search.best_estimator_

    # Define a function to fit model 2 but using cv and grid search to find the best scoring hyperparameters
    def fit_model_2_cv_gs(self, param_grid={}, cv=5, scoring='accuracy'):
        """
        Fit a second model to the training data using cross-validation and grid search.
    
        Parameters:
        - param_grid: A dictionary of hyperparameters to search over.
        - cv: The number of cross-validation folds to use.
        - scoring: The scoring metric to use for grid search.
        """
        grid_search = GridSearchCV(self.model2, param_grid, cv=cv, scoring=scoring)
        grid_search.fit(self.X_train, self.y_train)
        print(f"Best Score: {grid_search.best_score_}")
        print(f"Best Grid Search Hyperparameters: {grid_search.best_params_}")
        self.model2 = grid_search.best_estimator_

    # Define a function to fit model 3 but using cv and grid search to find the best scoring hyperparameters
    def fit_model_3_cv_gs(self, param_grid={}, cv=5, scoring='accuracy'):
        """
        Fit a third model to the training data using cross-validation and grid search.
    
        Parameters:
        - param_grid: A dictionary of hyperparameters to search over.
        - cv: The number of cross-validation folds to use.
        - scoring: The scoring metric to use for grid search.
        """
        grid_search = GridSearchCV(self.model3, param_grid, cv=cv, scoring=scoring)
        grid_search.fit(self.X_train, self.y_train)
        print(f"Best Score: {grid_search.best_score_}")
        print(f"Best Grid Search Hyperparameters: {grid_search.best_params_}")
        self.model3 = grid_search.best_estimator_

    # Define a function to evaluate model 1 on the testing data
    def evaluate_model_1(self):
        """
        Evaluate the first model on the testing data.

        Classification: prints accuracy, precision, recall, F1 and ROC AUC,
        then the classification report, and plots the ROC curve when the test
        target has exactly two classes. With more than two classes the
        precision, recall and F1 scores are weighted averages and the ROC AUC
        is computed one-vs-rest. ROC metrics use `predict_proba` scores when
        the model exposes them and fall back to the predicted labels otherwise.

        Any other problem type: prints the mean squared error, mean absolute
        error and R2 score.
        """
        model = self.model1
        y_pred = model.predict(self.X_test)

        if self.problem_type != 'classification':
            print(f"Mean Squared Error: {mean_squared_error(self.y_test, y_pred)}")
            print(f"Mean Absolute Error: {mean_absolute_error(self.y_test, y_pred)}")
            print(f"R2 Score: {r2_score(self.y_test, y_pred)}")
            return

        # Determine the number of classes first: it drives both the averaging
        # mode of the metrics and whether a ROC curve can be drawn.
        n_classes = len(np.unique(self.y_test))
        is_multiclass = n_classes > 2

        # The metrics default to average='binary', which raises on multiclass targets.
        average_kwargs = {'average': 'weighted'} if is_multiclass else {}

        # ROC analysis ranks continuous scores; hard labels only yield a single
        # operating point, so prefer class probabilities when available.
        y_proba = model.predict_proba(self.X_test) if hasattr(model, 'predict_proba') else None

        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")
        print(f"Precision: {precision_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"Recall: {recall_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"F1 Score: {f1_score(self.y_test, y_pred, **average_kwargs)}")

        roc_scores = y_pred
        if is_multiclass:
            if y_proba is not None:
                print(f"ROC AUC Score: {roc_auc_score(self.y_test, y_proba, multi_class='ovr')}")
            else:
                print("ROC AUC Score: unavailable (model has no predict_proba)")
        else:
            # Column 1 holds the probability of the second (positive) class.
            if y_proba is not None and np.ndim(y_proba) == 2 and np.shape(y_proba)[1] == 2:
                roc_scores = np.asarray(y_proba)[:, 1]
            print(f"ROC AUC Score: {roc_auc_score(self.y_test, roc_scores)}")

        # Show the classification report
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))

        # Plot the ROC curve (only defined here for binary targets)
        if n_classes == 2:
            fpr, tpr, _ = roc_curve(self.y_test, roc_scores)
            roc_auc = auc(fpr, tpr)
            plt.figure()
            lw = 2
            plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
            plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic Curve')
            plt.legend(loc="lower right")
            plt.show()

    # Define a function to evaluate model 2 on the testing data
    def evaluate_model_2(self):
        """
        Evaluate the second model on the testing data.

        Classification: prints accuracy, precision, recall, F1 and ROC AUC,
        then the classification report, and plots the ROC curve when the test
        target has exactly two classes. With more than two classes the
        precision, recall and F1 scores are weighted averages and the ROC AUC
        is computed one-vs-rest. ROC metrics use `predict_proba` scores when
        the model exposes them and fall back to the predicted labels otherwise.

        Any other problem type: prints the mean squared error, mean absolute
        error and R2 score.
        """
        model = self.model2
        y_pred = model.predict(self.X_test)

        if self.problem_type != 'classification':
            print(f"Mean Squared Error: {mean_squared_error(self.y_test, y_pred)}")
            print(f"Mean Absolute Error: {mean_absolute_error(self.y_test, y_pred)}")
            print(f"R2 Score: {r2_score(self.y_test, y_pred)}")
            return

        # Determine the number of classes first: it drives both the averaging
        # mode of the metrics and whether a ROC curve can be drawn.
        n_classes = len(np.unique(self.y_test))
        is_multiclass = n_classes > 2

        # The metrics default to average='binary', which raises on multiclass targets.
        average_kwargs = {'average': 'weighted'} if is_multiclass else {}

        # ROC analysis ranks continuous scores; hard labels only yield a single
        # operating point, so prefer class probabilities when available.
        y_proba = model.predict_proba(self.X_test) if hasattr(model, 'predict_proba') else None

        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")
        print(f"Precision: {precision_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"Recall: {recall_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"F1 Score: {f1_score(self.y_test, y_pred, **average_kwargs)}")

        roc_scores = y_pred
        if is_multiclass:
            if y_proba is not None:
                print(f"ROC AUC Score: {roc_auc_score(self.y_test, y_proba, multi_class='ovr')}")
            else:
                print("ROC AUC Score: unavailable (model has no predict_proba)")
        else:
            # Column 1 holds the probability of the second (positive) class.
            if y_proba is not None and np.ndim(y_proba) == 2 and np.shape(y_proba)[1] == 2:
                roc_scores = np.asarray(y_proba)[:, 1]
            print(f"ROC AUC Score: {roc_auc_score(self.y_test, roc_scores)}")

        # Show the classification report
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))

        # Plot the ROC curve (only defined here for binary targets)
        if n_classes == 2:
            fpr, tpr, _ = roc_curve(self.y_test, roc_scores)
            roc_auc = auc(fpr, tpr)
            plt.figure()
            lw = 2
            plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
            plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic Curve')
            plt.legend(loc="lower right")
            plt.show()

    # Define a function to evaluate model 3 on the testing data
    def evaluate_model_3(self):
        """
        Evaluate the third model on the testing data.

        Classification: prints accuracy, precision, recall, F1 and ROC AUC,
        then the classification report, and plots the ROC curve when the test
        target has exactly two classes. With more than two classes the
        precision, recall and F1 scores are weighted averages and the ROC AUC
        is computed one-vs-rest. ROC metrics use `predict_proba` scores when
        the model exposes them and fall back to the predicted labels otherwise.

        Any other problem type: prints the mean squared error, mean absolute
        error and R2 score.
        """
        model = self.model3
        y_pred = model.predict(self.X_test)

        if self.problem_type != 'classification':
            print(f"Mean Squared Error: {mean_squared_error(self.y_test, y_pred)}")
            print(f"Mean Absolute Error: {mean_absolute_error(self.y_test, y_pred)}")
            print(f"R2 Score: {r2_score(self.y_test, y_pred)}")
            return

        # Determine the number of classes first: it drives both the averaging
        # mode of the metrics and whether a ROC curve can be drawn.
        n_classes = len(np.unique(self.y_test))
        is_multiclass = n_classes > 2

        # The metrics default to average='binary', which raises on multiclass targets.
        average_kwargs = {'average': 'weighted'} if is_multiclass else {}

        # ROC analysis ranks continuous scores; hard labels only yield a single
        # operating point, so prefer class probabilities when available.
        y_proba = model.predict_proba(self.X_test) if hasattr(model, 'predict_proba') else None

        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")
        print(f"Precision: {precision_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"Recall: {recall_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"F1 Score: {f1_score(self.y_test, y_pred, **average_kwargs)}")

        roc_scores = y_pred
        if is_multiclass:
            if y_proba is not None:
                print(f"ROC AUC Score: {roc_auc_score(self.y_test, y_proba, multi_class='ovr')}")
            else:
                print("ROC AUC Score: unavailable (model has no predict_proba)")
        else:
            # Column 1 holds the probability of the second (positive) class.
            if y_proba is not None and np.ndim(y_proba) == 2 and np.shape(y_proba)[1] == 2:
                roc_scores = np.asarray(y_proba)[:, 1]
            print(f"ROC AUC Score: {roc_auc_score(self.y_test, roc_scores)}")

        # Show the classification report
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))

        # Plot the ROC curve (only defined here for binary targets)
        if n_classes == 2:
            fpr, tpr, _ = roc_curve(self.y_test, roc_scores)
            roc_auc = auc(fpr, tpr)
            plt.figure()
            lw = 2
            plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
            plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic Curve')
            plt.legend(loc="lower right")
            plt.show()

    # Define a function to fit ensemble 1 to the training data
    def fit_ensemble_1(self):
        """
        Build the first (voting) ensemble from the three models and fit it on the training data.

        A VotingClassifier is used when `problem_type` is 'classification',
        a VotingRegressor otherwise. The result is stored in `self.Ensemble1`.
        """
        members = [('lr', self.model1), ('rf', self.model2), ('xgb', self.model3)]
        ensemble_cls = VotingClassifier if self.problem_type == 'classification' else VotingRegressor
        self.Ensemble1 = ensemble_cls(estimators=members)
        self.Ensemble1.fit(self.X_train, self.y_train)

    # Define a function to fit ensemble 2 to the training data
    def fit_ensemble_2(self):
        """
        Build the second (stacking) ensemble from the three models and fit it on the training data.

        A StackingClassifier is used when `problem_type` is 'classification',
        a StackingRegressor otherwise. The result is stored in `self.Ensemble2`.
        """
        members = [('lr', self.model1), ('rf', self.model2), ('xgb', self.model3)]
        ensemble_cls = StackingClassifier if self.problem_type == 'classification' else StackingRegressor
        self.Ensemble2 = ensemble_cls(estimators=members)
        self.Ensemble2.fit(self.X_train, self.y_train)

    # Define a function to evaluate ensemble 1 on the testing data
    def evaluate_ensemble_1(self):
        """
        Evaluate the first ensemble on the testing data.

        Classification: prints accuracy, precision, recall, F1 and ROC AUC.
        With more than two classes in the test target the precision, recall
        and F1 scores are weighted averages and the ROC AUC is computed
        one-vs-rest. ROC AUC uses `predict_proba` scores when the ensemble
        exposes them and falls back to the predicted labels otherwise.

        Any other problem type: prints the mean squared error, mean absolute
        error and R2 score.
        """
        ensemble = self.Ensemble1
        y_pred = ensemble.predict(self.X_test)

        if self.problem_type != 'classification':
            print(f"Mean Squared Error: {mean_squared_error(self.y_test, y_pred)}")
            print(f"Mean Absolute Error: {mean_absolute_error(self.y_test, y_pred)}")
            print(f"R2 Score: {r2_score(self.y_test, y_pred)}")
            return

        is_multiclass = len(np.unique(self.y_test)) > 2

        # The metrics default to average='binary', which raises on multiclass targets.
        average_kwargs = {'average': 'weighted'} if is_multiclass else {}

        # ROC AUC ranks continuous scores; use probabilities when the ensemble
        # provides them (hasattr is False when predict_proba is not exposed).
        y_proba = ensemble.predict_proba(self.X_test) if hasattr(ensemble, 'predict_proba') else None

        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")
        print(f"Precision: {precision_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"Recall: {recall_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"F1 Score: {f1_score(self.y_test, y_pred, **average_kwargs)}")

        if is_multiclass:
            if y_proba is not None:
                print(f"ROC AUC Score: {roc_auc_score(self.y_test, y_proba, multi_class='ovr')}")
            else:
                print("ROC AUC Score: unavailable (model has no predict_proba)")
        else:
            roc_scores = y_pred
            # Column 1 holds the probability of the second (positive) class.
            if y_proba is not None and np.ndim(y_proba) == 2 and np.shape(y_proba)[1] == 2:
                roc_scores = np.asarray(y_proba)[:, 1]
            print(f"ROC AUC Score: {roc_auc_score(self.y_test, roc_scores)}")

    # Define a function to evaluate ensemble 2 on the testing data
    def evaluate_ensemble_2(self):
        """
        Evaluate the second ensemble on the testing data.

        Classification: prints accuracy, precision, recall, F1 and ROC AUC.
        With more than two classes in the test target the precision, recall
        and F1 scores are weighted averages and the ROC AUC is computed
        one-vs-rest. ROC AUC uses `predict_proba` scores when the ensemble
        exposes them and falls back to the predicted labels otherwise.

        Any other problem type: prints the mean squared error, mean absolute
        error and R2 score.
        """
        ensemble = self.Ensemble2
        y_pred = ensemble.predict(self.X_test)

        if self.problem_type != 'classification':
            print(f"Mean Squared Error: {mean_squared_error(self.y_test, y_pred)}")
            print(f"Mean Absolute Error: {mean_absolute_error(self.y_test, y_pred)}")
            print(f"R2 Score: {r2_score(self.y_test, y_pred)}")
            return

        is_multiclass = len(np.unique(self.y_test)) > 2

        # The metrics default to average='binary', which raises on multiclass targets.
        average_kwargs = {'average': 'weighted'} if is_multiclass else {}

        # ROC AUC ranks continuous scores; use probabilities when the ensemble
        # provides them (hasattr is False when predict_proba is not exposed).
        y_proba = ensemble.predict_proba(self.X_test) if hasattr(ensemble, 'predict_proba') else None

        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")
        print(f"Precision: {precision_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"Recall: {recall_score(self.y_test, y_pred, **average_kwargs)}")
        print(f"F1 Score: {f1_score(self.y_test, y_pred, **average_kwargs)}")

        if is_multiclass:
            if y_proba is not None:
                print(f"ROC AUC Score: {roc_auc_score(self.y_test, y_proba, multi_class='ovr')}")
            else:
                print("ROC AUC Score: unavailable (model has no predict_proba)")
        else:
            roc_scores = y_pred
            # Column 1 holds the probability of the second (positive) class.
            if y_proba is not None and np.ndim(y_proba) == 2 and np.shape(y_proba)[1] == 2:
                roc_scores = np.asarray(y_proba)[:, 1]
            print(f"ROC AUC Score: {roc_auc_score(self.y_test, roc_scores)}")

    # Define a function for checking for missing values
    def missing(self):
        """
        Check for missing values in the full dataframe and the training and testing data if they exist.
        """
        if self.full_dataframe is not None:
            print("Missing Values in Full Dataframe:")
            print(self.full_dataframe.isnull().sum())

        if self.X_train is not None:
            print("\nMissing Values in X_train:")
            print(self.X_train.isnull().sum())

        if self.X_test is not None:
            print("\nMissing Values in X_test:")
            print(self.X_test.isnull().sum())
            
        if self.y_train is not None:
            print("\nMissing Values in y_train:")
            print(self.y_train.isnull().sum())
            
        if self.y_test is not None:
            print("\nMissing Values in y_test:")
            print(self.y_test.isnull().sum())

    # Define a function for making a simple, interactive histogram.
    def histogram(self, variable):
        """
        Create a simple, interactive histogram with different colored bars using plotly.
    
        Parameters:
        - variable: The variable to plot the histogram for.
        """
        # Create a histogram for the variable in the full dataframe
        if self.full_dataframe is not None:
            fig = px.histogram(self.full_dataframe, x=variable, title=f"Histogram of {variable} for full dataframe")
            fig.show()

        # Create a histogram for the variable in the training data
        if self.X_train is not None:
            fig = px.histogram(self.X_train, x=variable, title=f"Histogram of {variable} for training data")
            fig.show()

        # Create a histogram for the variable in the testing data
        if self.X_test is not None:
            fig = px.histogram(self.X_test, x=variable, title=f"Histogram of {variable} for testing data")
            fig.show()


    # Define a function for making a simple, interactive scatter plot.
    def scatter(self, x, y):
        """
        Create a simple, interactive scatter plot with different colored points using plotly.
    
        Parameters:
        - x: The variable to plot on the x-axis.
        - y: The variable to plot on the y-axis.
        """
        # Create a scatter plot for the full dataframe
        if self.full_dataframe is not None:
            if self.target is not None:
                fig = px.scatter(self.full_dataframe, x=x, y=y, color=self.target, title=f"Scatter plot of {x} vs {y} for full dataframe")
            else:
                fig = px.scatter(self.full_dataframe, x=x, y=y, title=f"Scatter plot of {x} vs {y} for full dataframe")
            fig.show()

        # Create a scatter plot for the training data
        if self.X_train is not None:
            if self.target is not None:
                fig = px.scatter(self.X_train, x=x, y=y, color=self.y_train, title=f"Scatter plot of {x} vs {y} for training data")
            else:
                fig = px.scatter(self.X_train, x=x, y=y, title=f"Scatter plot of {x} vs {y} for training data")
            fig.show()

        # Create a scatter plot for the testing data
        if self.X_test is not None:
            if self.target is not None:
                fig = px.scatter(self.X_test, x=x, y=y, color=self.y_test, title=f"Scatter plot of {x} vs {y} for testing data")
            else:
                fig = px.scatter(self.X_test, x=x, y=y, title=f"Scatter plot of {x} vs {y} for testing data")
            fig.show()

    # Define a function to round all feature values in the full dataframe to a specified number of decimal places
    def round(self, decimals=2):
        """
        Round all values in the full dataframe to a specified number of decimal places.
    
        Parameters:
        - decimals: The number of decimal places to round to.
        """
        if self.full_dataframe is not None:
            self.full_dataframe = self.full_dataframe.round(decimals)

        if self.X_train is not None:
            self.X_train = self.X_train.round(decimals)

        if self.X_test is not None:
            self.X_test = self.X_test.round(decimals)

        if self.y_train is not None:
            self.y_train = self.y_train.round(decimals)

        if self.y_test is not None:
            self.y_test = self.y_test.round(decimals)

    # Define a function for common transforms like boxcox, log, sqrt, etc.
    def transform(self, method='boxcox', columns=[], custom_params={}):
        """
        Transform the data using a specified method.
    
        Parameters:
        - method: The transformation method to use.
        - columns: A list of columns to transform.
        - custom_params: A dictionary of custom parameters to use when transforming the data.
        """

        if method == 'boxcox':
            transformer = PowerTransformer(method='box-cox', **custom_params)
        elif method == 'log':
            transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=True)
        elif method == 'sqrt':
            transformer = FunctionTransformer(func=np.sqrt, inverse_func=np.square, validate=True)
        elif method == 'cube':
            transformer = FunctionTransformer(func=np.cbrt, inverse_func=lambda x: x**3, validate=True)
        elif method == 'yeo-johnson':
            transformer = PowerTransformer(method='yeo-johnson', **custom_params)

        # Apply the transformation to the columns in the full dataframe
        if self.full_dataframe is not None:
            self.full_dataframe.loc[:, columns] = transformer.fit_transform(self.full_dataframe.loc[:, columns])

        # Apply the transformation to the columns in the training data
        if self.X_train is not None:
            self.X_train.loc[:, columns] = transformer.fit_transform(self.X_train.loc[:, columns])

        # Apply the transformation to the columns in the testing data
        if self.X_test is not None:
            self.X_test.loc[:, columns] = transformer.transform(self.X_test.loc[:, columns])

    # Define a function to remove outliers
    def outliers(self, method='zscore', apply_to_test_data=True, custom_params={}):
        """
        Remove outliers from the data using a specified method, while ensuring corresponding rows in target data are also removed.
        """
        # Define a helper function for filtering using IQR and capturing indices
        def filter_iqr(df, Q1, Q3, IQR):
            # Ensure alignment before condition calculation
            aligned_df, _ = df.align(Q1, axis=1, join='inner', copy=False)
            condition = ((aligned_df < (Q1 - 1.5 * IQR)) | (aligned_df > (Q3 + 1.5 * IQR)))
            filtered_df = df[~condition.any(axis=1)]
            return filtered_df, filtered_df.index

        # Log the number of rows before outlier removal
        temp_n_rows_full = self.full_dataframe.shape[0] if self.full_dataframe is not None else 0
        temp_n_rows_x_train = self.X_train.shape[0] if self.X_train is not None else 0
        temp_n_rows_x_test = self.X_test.shape[0] if self.X_test is not None else 0
        temp_n_rows_y_train = self.y_train.shape[0] if self.y_train is not None else 0
        temp_n_rows_y_test = self.y_test.shape[0] if self.y_test is not None else 0

        # Z-Score method
        if method == 'zscore':
            if custom_params == {}:
                custom_params = {'threshold': 3}
            # Full dataframe
            if self.full_dataframe is not None:
                # if the target is none
                if self.target is None:
                    z = np.abs(zscore(self.full_dataframe))
                    self.full_dataframe = self.full_dataframe[(z < custom_params['threshold']).all(axis=1)]
                    # if the target exists
                else:
                    z = np.abs(zscore(self.full_dataframe.drop(columns=[self.target]) if self.target else self.full_dataframe))
                    self.full_dataframe = self.full_dataframe[(z < custom_params['threshold']).all(axis=1)]
            # X_train
            if self.X_train is not None:
                z = np.abs(zscore(self.X_train))
                retained_indices = self.X_train.index[(z < custom_params['threshold']).all(axis=1)]
                self.X_train = self.X_train.loc[retained_indices]
                self.y_train = self.y_train.loc[retained_indices]
            # X_test
            if self.X_test is not None and apply_to_test_data:
                z = np.abs(zscore(self.X_test))
                retained_indices = self.X_test.index[(z < custom_params['threshold']).all(axis=1)]
                self.X_test = self.X_test.loc[retained_indices]
                if hasattr(self, 'y_test'):  # Check if y_test exists
                    self.y_test = self.y_test.loc[retained_indices]

        # IQR method
        elif method == 'iqr':
            if custom_params == {}:
                custom_params = {'q1': 0.25, 'q3': 0.75}
            # Full dataframe
            if self.full_dataframe is not None:
                # Temporarily remove target column if it exists
                df_temp = self.full_dataframe.drop(columns=[self.target]) if self.target else self.full_dataframe.copy()
                Q1 = df_temp.quantile(custom_params['q1'])
                Q3 = df_temp.quantile(custom_params['q3'])
                IQR = Q3 - Q1
                # Filter using modified dataframe but ensure the original dataframe is updated correctly
                filtered_df, _ = filter_iqr(df_temp, Q1, Q3, IQR)
                # If the target was removed, use the indices to select rows from the original dataframe
                self.full_dataframe = self.full_dataframe.loc[filtered_df.index] if self.target else filtered_df
            # X_train
            if self.X_train is not None:
                Q1 = self.X_train.quantile(custom_params['q1'])
                Q3 = self.X_train.quantile(custom_params['q3'])
                IQR = Q3 - Q1
                self.X_train, retained_indices = filter_iqr(self.X_train, Q1, Q3, IQR)
                self.y_train = self.y_train.loc[retained_indices]
            # X_test
            if self.X_test is not None and apply_to_test_data:
                Q1 = self.X_test.quantile(custom_params['q1'])
                Q3 = self.X_test.quantile(custom_params['q3'])
                IQR = Q3 - Q1
                self.X_test, retained_indices = filter_iqr(self.X_test, Q1, Q3, IQR)
                if hasattr(self, 'y_test'):  # Check if y_test exists
                    self.y_test = self.y_test.loc[retained_indices]

        # Isolation Forest method
        elif method == 'isolation':
            if custom_params == {}:
                custom_params = {'contamination': 0.1}
            # Full dataframe
            if self.full_dataframe is not None:
                # if the target is none
                if self.target is None:
                    isolation_forest = IsolationForest(contamination=custom_params['contamination'])
                    self.full_dataframe = self.full_dataframe[isolation_forest.fit_predict(self.full_dataframe) == 1]
                # if the target exists
                else:
                    isolation_forest = IsolationForest(contamination=custom_params['contamination'])
                    self.full_dataframe = self.full_dataframe[isolation_forest.fit_predict(self.full_dataframe.drop(columns=[self.target])) == 1]
            # X_train
            if self.X_train is not None:
                isolation_forest = IsolationForest(contamination=custom_params['contamination'])
                retained_indices = isolation_forest.fit_predict(self.X_train) == 1
                self.X_train = self.X_train.loc[retained_indices]
                self.y_train = self.y_train.loc[retained_indices]
            # X_test
            if self.X_test is not None and apply_to_test_data:
                isolation_forest = IsolationForest(contamination=custom_params['contamination'])
                retained_indices = isolation_forest.fit_predict(self.X_test) == 1
                self.X_test = self.X_test.loc[retained_indices]
                if hasattr(self, 'y_test'):  # Check if y_test exists
                    self.y_test = self.y_test.loc[retained_indices]

        # Local Outlier Factor method
        elif method == 'lof':
            if custom_params == {}:
                custom_params = {'contamination': 0.1}
            # Full dataframe
            if self.full_dataframe is not None:
                # if the target is none
                if self.target is None:
                    lof = LocalOutlierFactor(contamination=custom_params['contamination'])
                    self.full_dataframe = self.full_dataframe[lof.fit_predict(self.full_dataframe) == 1]
                # if the target exists
                else:
                    lof = LocalOutlierFactor(contamination=custom_params['contamination'])
                    self.full_dataframe = self.full_dataframe[lof.fit_predict(self.full_dataframe.drop(columns=[self.target])) == 1]
            # X_train
            if self.X_train is not None:
                lof = LocalOutlierFactor(contamination=custom_params['contamination'])
                retained_indices = lof.fit_predict(self.X_train) == 1
                self.X_train = self.X_train.loc[retained_indices]
                self.y_train = self.y_train.loc[retained_indices]
            # X_test
            if self.X_test is not None and apply_to_test_data:
                lof = LocalOutlierFactor(contamination=custom_params['contamination'])
                retained_indices = lof.fit_predict(self.X_test) == 1
                self.X_test = self.X_test.loc[retained_indices]
                if hasattr(self, 'y_test'):  # Check if y_test exists
                    self.y_test = self.y_test.loc[retained_indices]

        # Determine and print the number of rows removed from each dataset
        print(f"Number of Rows Removed from Full Dataframe: {temp_n_rows_full - self.full_dataframe.shape[0] if self.full_dataframe is not None else 0}")
        print(f"Number of Rows Removed from X_train: {temp_n_rows_x_train - self.X_train.shape[0] if self.X_train is not None else 0}")
        print(f"Number of Rows Removed from X_test: {temp_n_rows_x_test - self.X_test.shape[0] if self.X_test is not None else 0}")
        print(f"Number of Rows Removed from y_train: {temp_n_rows_y_train - self.y_train.shape[0] if self.y_train is not None else 0}")
        print(f"Number of Rows Removed from y_test: {temp_n_rows_y_test - self.y_test.shape[0] if self.y_test is not None else 0}")