# Notebook cell In [2]
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
def eda_process(df, empty_threshold=50000, extra_columns_to_drop=None):
    """
    Report empty-string counts per column and drop unwanted columns.

    This function performs the following tasks:
    1. Counts and prints the number of empty strings ('') in each column of the DataFrame.
    2. Drops any column whose empty-string count is strictly greater than
       ``empty_threshold``.
    3. Additionally, drops a predefined set of columns from the DataFrame.
       Columns from that set which are not present in ``df`` are skipped
       instead of raising ``KeyError``.

    Parameters:
    df (pandas.DataFrame): The DataFrame to be processed.
    empty_threshold (int): A column is dropped when it holds more than this
                           many empty strings. Defaults to 50,000.
    extra_columns_to_drop (iterable of str, optional): Column names to drop
                           regardless of their empty-string count. When None,
                           the built-in list below is used.

    Returns:
    pandas.DataFrame: The same DataFrame object that was passed in, with the
                      selected columns removed.

    Note:
    This function modifies the input DataFrame in-place. Therefore, the original DataFrame
    passed to this function will be altered.

    Example usage:
    >>> processed_df = eda_process(your_dataframe)
    """
    default_columns_to_drop = (
        'echoBuffer',
        'merchantCity',
        'merchantZip',
        'posOnPremises',
        'recurringAuthInd',
        'merchantState',
    )
    if extra_columns_to_drop is None:
        extra_columns_to_drop = default_columns_to_drop

    # Count empty strings in each column
    empty_string_counts = df.apply(lambda column: (column == '').sum())

    # Print the counts of empty strings
    print(empty_string_counts)

    # Columns whose empty-string count exceeds the threshold
    columns_to_drop = [
        col for col, count in empty_string_counts.items() if count > empty_threshold
    ]

    # Add the explicitly requested columns
    columns_to_drop.extend(extra_columns_to_drop)

    # errors='ignore' lets the function run on frames that lack some of the
    # listed columns; labels that are absent are simply skipped.
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    return df


def create_feature_plots(df):
    """
    Draw one overview plot per numerical and categorical column of ``df``.

    Columns with dtype int64/float64 get a 30-bin histogram. Columns with
    dtype object/bool get a bar chart of their 10 most frequent values.
    The plots are laid out in a two-column grid and shown with ``plt.show()``.

    Parameters:
    df (pandas.DataFrame): The DataFrame whose columns are plotted.

    Returns:
    None. If ``df`` has no column of the dtypes above, nothing is drawn.
    """
    # Selecting numerical and categorical features
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object', 'bool']).columns

    num_numerical = len(numerical_features)
    total_plots = num_numerical + len(categorical_features)

    # plt.subplots cannot build a grid with zero rows, so bail out early.
    if total_plots == 0:
        return

    num_rows = (total_plots + 1) // 2

    # squeeze=False always yields a 2-D array, so ravel() is safe for any grid size
    fig, axes = plt.subplots(num_rows, 2, figsize=(15, 5 * num_rows), squeeze=False)
    axes = axes.ravel()  # Flatten the axes array

    # Histograms occupy the first slots of the grid
    for ax, col in zip(axes, numerical_features):
        sns.histplot(df[col], ax=ax, kde=False, bins=30)
        ax.set_title(f'Histogram of {col}')
        ax.set_ylabel('Count')

    # Bar charts follow directly after the histograms
    for ax, col in zip(axes[num_numerical:], categorical_features):
        counts = df[col].value_counts().nlargest(10)  # Top 10 categories
        sns.barplot(x=counts.index, y=counts.values, ax=ax)
        ax.set_title(f'Frequency of Top 10 {col}')
        # Rotate labels without re-setting them, which avoids the
        # fixed-locator warning raised by set_xticklabels(get_xticklabels()).
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_ylabel('Count')

    # With an odd number of plots the last grid cell is unused; hide it
    for ax in axes[total_plots:]:
        ax.set_visible(False)

    # Adjusting the layout
    fig.tight_layout()
    plt.show()