In [1]:
!pip install tabulate



In [2]:
import pandas as pd

def aggregate_abm_data(file_path):
    """
    Reads ABM data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the ABM data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        # Read the ABM data into a pandas DataFrame. Adjust the 'sep' parameter if your file uses a different delimiter.
        abm_data = pd.read_csv(file_path, sep=',')  # Assuming CSV, change if needed

        # Ensure required columns exist. Handle cases where columns might be named slightly differently.
        if not all(col in abm_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # Aggregate transactions by customer ID
        aggregated_data = abm_data.groupby('customer_id').agg(
            abm_credit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            abm_debit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        # Handle potential missing values after aggregation
        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_card_data(file_path):
    """
    Reads Card data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Card data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        card_data = pd.read_csv(file_path, sep=',')

        if not all(col in card_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = card_data.groupby('customer_id').agg(
            card_credit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            card_debit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_eft_data(file_path):
    """
    Reads EFT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EFT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        eft_data = pd.read_csv(file_path, sep=',')

        if not all(col in eft_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = eft_data.groupby('customer_id').agg(
            eft_credit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            eft_debit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_cheque_data(file_path):
    """
    Reads Cheque data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Cheque data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        cheque_data = pd.read_csv(file_path, sep=',')

        if not all(col in cheque_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = cheque_data.groupby('customer_id').agg(
            cheque_credit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            cheque_debit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_emt_data(file_path):
    """
    Reads EMT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EMT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        emt_data = pd.read_csv(file_path, sep=',')

        if not all(col in emt_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # For EMT data, the debit/credit column might be represented differently (e.g., 'C' for credit, 'D' for debit)
        aggregated_data = emt_data.groupby('customer_id').agg(
            emt_credit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'C'].sum()),
            emt_debit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'D'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_wire_data(file_path):
    """
    Reads Wire data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Wire data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        wire_data = pd.read_csv(file_path, sep=',')

        if not all(col in wire_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = wire_data.groupby('customer_id').agg(
            wire_credit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            wire_debit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

# Example usage:
file_path_abm = 'abm.csv'  # Replace with your actual file path
file_path_card = 'card.csv'
file_path_eft = 'eft.csv'
file_path_cheque = 'cheque.csv'
file_path_emt = 'emt.csv'
file_path_wire = 'wire.csv'
df_kyc=pd.read_csv('kyc.csv')
df_kyc_industry_codes=pd.read_csv('kyc_industry_codes.csv')

aggregated_abm_df = aggregate_abm_data(file_path_abm)
aggregated_card_df = aggregate_card_data(file_path_card)
aggregated_eft_df = aggregate_eft_data(file_path_eft)
aggregated_cheque_df = aggregate_cheque_data(file_path_cheque)
aggregated_emt_df = aggregate_emt_data(file_path_emt)
aggregated_wire_df = aggregate_wire_data(file_path_wire)

# Combine the dataframes
from functools import reduce

if all(df is not None for df in [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df]):
    combined_df = reduce(lambda left, right: pd.merge(left, right, on=['customer_id'], how='outer'),
                         [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df])
    combined_df.fillna(0, inplace=True)  # Handle any missing values after merging

    # Convert industry_code in df_kyc to numeric
    df_kyc['industry_code'] = pd.to_numeric(df_kyc['industry_code'], errors='coerce')

    # Merge the KYC and industry information into the aggregated dataframe
    combined_df = pd.merge(combined_df, df_kyc, on='customer_id', how='left')
    combined_df = pd.merge(combined_df, df_kyc_industry_codes, on='industry_code', how='left')

    print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
    print(combined_df.info())

| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | country   | province   | city       | industry_code   | employee_count   | sales   | established_date   | onboard_date   | industry                          |
|:-----------------|:-------------|:------------|:--------------|:-------------|:-------------|:------------|:----------------|:---------------|:-------------|:------------|:--------------|:-------------|:----------|:-----------|:-----------|:----------------|:-----------------|:--------|:-------------------|:---------------|:----------------------------------|
| SYNCID0000000000 | 0            | 0           | 0             | 0            | 0            | 0           | 0               | 415.24         | 0            | 0           | 0.03          | 6316.04      | CA        | ON         | NORTH YORK | 7292            | 0          

In [3]:
# Payment methods for iteration
payment_methods = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# Iterate through payment methods to handle the removal of rows where both credit and debit are zero
for method in payment_methods:
    credit_col = f'{method}_credit'
    debit_col = f'{method}_debit'

    # Identify rows where both credit and debit are 0 for the current method
    rows_to_remove = combined_df[(combined_df[credit_col] == 0) & (combined_df[debit_col] == 0)]

    # Remove these rows from combined_df
    combined_df = combined_df.drop(rows_to_remove.index)

# Calculate credit debit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_credit_debit_ratio'] = combined_df['abm_credit'] / (combined_df['abm_debit'] )
combined_df['card_credit_debit_ratio'] = combined_df['card_credit'] / (combined_df['card_debit'] )
combined_df['cheque_credit_debit_ratio'] = combined_df['cheque_credit'] / (combined_df['cheque_debit'] )
combined_df['eft_credit_debit_ratio'] = combined_df['eft_credit'] / (combined_df['eft_debit'] )
combined_df['emt_credit_debit_ratio'] = combined_df['emt_credit'] / (combined_df['emt_debit'] )
combined_df['wire_credit_debit_ratio'] = combined_df['wire_credit'] / (combined_df['wire_debit'] )

# Calculate debit credit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_debit_credit_ratio'] = combined_df['abm_debit'] / (combined_df['abm_credit'])
combined_df['card_debit_credit_ratio'] = combined_df['card_debit'] / (combined_df['card_credit'])
combined_df['cheque_debit_credit_ratio'] = combined_df['cheque_debit'] / (combined_df['cheque_credit'])
combined_df['eft_debit_credit_ratio'] = combined_df['eft_debit'] / (combined_df['eft_credit'])
combined_df['emt_debit_credit_ratio'] = combined_df['emt_debit'] / (combined_df['emt_credit'])
combined_df['wire_debit_credit_ratio'] = combined_df['wire_debit'] / (combined_df['wire_credit'])

# Replace infinite values with 0 (this line is still good as a general cleanup)
combined_df.replace([float('inf'), float('-inf')], 0, inplace=True)

In [4]:
print(combined_df)

            customer_id  abm_credit  abm_debit  card_credit  card_debit  \
34     SYNCID0000000038        0.00     104.55       -71.46    31861.18   
58     SYNCID0000000063     1050.38      52.66         0.00      106.98   
262    SYNCID0000000276    12626.19    5069.09         0.00     5726.39   
845    SYNCID0000000890    55053.74    2047.86      -200.42    21480.28   
870    SYNCID0000000917      103.71     691.12      9633.76    26585.04   
...                 ...         ...        ...          ...         ...   
15366  SYNCID0000016272      526.68     215.34      3436.52    12320.55   
15370  SYNCID0000016276      214.91     344.76      1713.94    60157.64   
15373  SYNCID0000016281      820.47    1454.74      -177.71     7627.44   
15520  SYNCID0000016435        0.00     898.44     -1041.23    18895.42   
15604  SYNCID0000016523        0.00     106.52      -118.30    17443.51   

       eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
34       10910.03   1

In [5]:
print(combined_df[combined_df['customer_id'] == 'SYNCID0000005244'])

           customer_id  abm_credit  abm_debit  card_credit  card_debit  \
4953  SYNCID0000005244     4235.09     886.93     22699.75    79740.08   

      eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
4953     1523.06    3984.26       10109.06       2750.65    31901.87  ...   

      cheque_credit_debit_ratio  eft_credit_debit_ratio  \
4953                   3.675153                0.382269   

      emt_credit_debit_ratio wire_credit_debit_ratio abm_debit_credit_ratio  \
4953                0.648714                0.471693               0.209424   

     card_debit_credit_ratio  cheque_debit_credit_ratio  \
4953                3.512818                   0.272098   

      eft_debit_credit_ratio  emt_debit_credit_ratio wire_debit_credit_ratio  
4953                2.615957                 1.54151                2.120021  

[1 rows x 34 columns]


In [6]:
import pandas as pd

def aggregate_abm_data(file_path):
    """
    Reads ABM data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the ABM data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        # Read the ABM data into a pandas DataFrame. Adjust the 'sep' parameter if your file uses a different delimiter.
        abm_data = pd.read_csv(file_path, sep=',')  # Assuming CSV, change if needed

        # Ensure required columns exist. Handle cases where columns might be named slightly differently.
        if not all(col in abm_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # Aggregate transactions by customer ID
        aggregated_data = abm_data.groupby('customer_id').agg(
            abm_credit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            abm_debit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        # Handle potential missing values after aggregation
        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_card_data(file_path):
    """
    Reads Card data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Card data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        card_data = pd.read_csv(file_path, sep=',')

        if not all(col in card_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = card_data.groupby('customer_id').agg(
            card_credit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            card_debit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_eft_data(file_path):
    """
    Reads EFT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EFT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        eft_data = pd.read_csv(file_path, sep=',')

        if not all(col in eft_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = eft_data.groupby('customer_id').agg(
            eft_credit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            eft_debit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_cheque_data(file_path):
    """
    Reads Cheque data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Cheque data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        cheque_data = pd.read_csv(file_path, sep=',')

        if not all(col in cheque_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = cheque_data.groupby('customer_id').agg(
            cheque_credit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            cheque_debit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_emt_data(file_path):
    """
    Reads EMT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EMT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        emt_data = pd.read_csv(file_path, sep=',')

        if not all(col in emt_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # For EMT data, the debit/credit column might be represented differently (e.g., 'C' for credit, 'D' for debit)
        aggregated_data = emt_data.groupby('customer_id').agg(
            emt_credit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'C'].sum()),
            emt_debit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'D'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_wire_data(file_path):
    """
    Reads Wire data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Wire data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        wire_data = pd.read_csv(file_path, sep=',')

        if not all(col in wire_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = wire_data.groupby('customer_id').agg(
            wire_credit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            wire_debit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [7]:
def calculate_abm_stats(df):
    """
    Calculates ABM statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing ABM transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added ABM statistics columns.
    """
    df_abm = pd.read_csv('abm.csv')  # Reload ABM data
    df_abm['transaction_date'] = pd.to_datetime(df_abm['transaction_date'])

    # Group by customer_id and calculate ABM statistics
    df_abm_grouped = df_abm.groupby('customer_id').agg(
        abm_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        abm_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        abm_average_credit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        abm_average_debit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_abm_credit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_abm_debit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_abm_credit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_abm_debit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    # Calculate account active days
    df_abm_grouped_days = df_abm.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='abm_account_active_days')

    # Merge the calculated statistics into the combined dataframe
    df = pd.merge(df, df_abm_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_abm_grouped_days, on='customer_id', how='left')

    return df


def calculate_card_stats(df):
    """
    Calculates Card statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing Card transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added Card statistics columns.
    """
    df_card = pd.read_csv('card.csv')
    df_card['transaction_date'] = pd.to_datetime(df_card['transaction_date'])

    df_card_grouped = df_card.groupby('customer_id').agg(
        card_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        card_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        card_average_credit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        card_average_debit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_card_credit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_card_debit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_card_credit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_card_debit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_card_grouped_days = df_card.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='card_account_active_days')

    df = pd.merge(df, df_card_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_card_grouped_days, on='customer_id', how='left')

    return df


def calculate_eft_stats(df):
    """
    Calculates EFT statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing EFT transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added EFT statistics columns.
    """
    df_eft = pd.read_csv('eft.csv')
    df_eft['transaction_date'] = pd.to_datetime(df_eft['transaction_date'])

    df_eft_grouped = df_eft.groupby('customer_id').agg(
        eft_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        eft_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        eft_average_credit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        eft_average_debit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_eft_credit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_eft_debit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_eft_credit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_eft_debit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_eft_grouped_days = df_eft.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='eft_account_active_days')

    df = pd.merge(df, df_eft_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_eft_grouped_days, on='customer_id', how='left')

    return df


def calculate_cheque_stats(df):
    """
    Calculates Cheque statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing Cheque transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added Cheque statistics columns.
    """
    df_cheque = pd.read_csv('cheque.csv')
    df_cheque['transaction_date'] = pd.to_datetime(df_cheque['transaction_date'])

    df_cheque_grouped = df_cheque.groupby('customer_id').agg(
        cheque_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        cheque_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        cheque_average_credit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        cheque_average_debit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_cheque_credit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_cheque_debit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_cheque_credit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_cheque_debit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_cheque_grouped_days = df_cheque.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='cheque_account_active_days')

    df = pd.merge(df, df_cheque_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_cheque_grouped_days, on='customer_id', how='left')

    return df

def calculate_emt_stats(df):
    """
    Calculates EMT statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing EMT transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added EMT statistics columns.
    """
    df_emt = pd.read_csv('emt.csv')
    df_emt['transaction_date'] = pd.to_datetime(df_emt['transaction_date'])

    # For EMT data, the debit/credit column might be represented differently (e.g., 'C' for credit, 'D' for debit)
    df_emt_grouped = df_emt.groupby('customer_id').agg(
        emt_credit_transaction_count=('debit_credit', lambda x: (x == 'C').sum()),
        emt_debit_transaction_count=('debit_credit', lambda x: (x == 'D').sum()),
        emt_average_credit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'C'].mean()),
        emt_average_debit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'D'].mean()),
        max_emt_credit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'C'].max()),
        max_emt_debit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'D'].max()),
        min_emt_credit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'C'].min()),
        min_emt_debit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'D'].min())
    ).reset_index()

    df_emt_grouped_days = df_emt.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='emt_account_active_days')

    df = pd.merge(df, df_emt_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_emt_grouped_days, on='customer_id', how='left')

    return df


def calculate_wire_stats(df):
    """
    Calculates Wire statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing Wire transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added Wire statistics columns.
    """
    df_wire = pd.read_csv('wire.csv')
    df_wire['transaction_date'] = pd.to_datetime(df_wire['transaction_date'])

    df_wire_grouped = df_wire.groupby('customer_id').agg(
        wire_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        wire_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        wire_average_credit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        wire_average_debit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_wire_credit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_wire_debit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_wire_credit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_wire_debit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_wire_grouped_days = df_wire.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='wire_account_active_days')

    df = pd.merge(df, df_wire_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_wire_grouped_days, on='customer_id', how='left')

    return df


def calculate_rolling_stats(df):
    """
    Calculates rolling statistics for each transaction type:
    - Rolling average for 30 days and 7 days
    - Rolling sum ratio for 7 days
    - Active days ratio

    Args:
        df (pandas.DataFrame): The combined DataFrame containing all transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added rolling statistics columns.
    """

    # ABM Rolling Stats
    df['abm_amount_rolling_avg_30d'] = (
        df['abm_credit'] + df['abm_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['abm_amount_rolling_avg_7d'] = (
        df['abm_credit'] + df['abm_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_abm_amount'] = (df['abm_credit'] + df['abm_debit']) / (
        df['abm_credit_transaction_count'] + df['abm_debit_transaction_count']
    )
    df['abm_rolling_sum_7d_ratio'] = (
        df['abm_credit'] + df['abm_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['abm_credit'] + df['abm_debit'])
    df['abm_active_days_ratio'] = df['abm_account_active_days'] / (
        df['abm_credit_transaction_count'] + df['abm_debit_transaction_count']
    )

    # Card Rolling Stats
    df['card_amount_rolling_avg_30d'] = (
        df['card_credit'] + df['card_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['card_amount_rolling_avg_7d'] = (
        df['card_credit'] + df['card_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_card_amount'] = (df['card_credit'] + df['card_debit']) / (
        df['card_credit_transaction_count'] + df['card_debit_transaction_count']
    )
    df['card_rolling_sum_7d_ratio'] = (
        df['card_credit'] + df['card_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['card_credit'] + df['card_debit'])
    df['card_active_days_ratio'] = df['card_account_active_days'] / (
        df['card_credit_transaction_count'] + df['card_debit_transaction_count']
    )

    # EFT Rolling Stats
    df['eft_amount_rolling_avg_30d'] = (
        df['eft_credit'] + df['eft_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['eft_amount_rolling_avg_7d'] = (
        df['eft_credit'] + df['eft_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_eft_amount'] = (df['eft_credit'] + df['eft_debit']) / (
        df['eft_credit_transaction_count'] + df['eft_debit_transaction_count']
    )
    df['eft_rolling_sum_7d_ratio'] = (
        df['eft_credit'] + df['eft_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['eft_credit'] + df['eft_debit'])
    df['eft_active_days_ratio'] = df['eft_account_active_days'] / (
        df['eft_credit_transaction_count'] + df['eft_debit_transaction_count']
    )

    # Cheque Rolling Stats
    df['cheque_amount_rolling_avg_30d'] = (
        df['cheque_credit'] + df['cheque_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['cheque_amount_rolling_avg_7d'] = (
        df['cheque_credit'] + df['cheque_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_cheque_amount'] = (df['cheque_credit'] + df['cheque_debit']) / (
        df['cheque_credit_transaction_count'] + df['cheque_debit_transaction_count']
    )
    df['cheque_rolling_sum_7d_ratio'] = (
        df['cheque_credit'] + df['cheque_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['cheque_credit'] + df['cheque_debit'])
    df['cheque_active_days_ratio'] = df['cheque_account_active_days'] / (
        df['cheque_credit_transaction_count'] + df['cheque_debit_transaction_count']
    )

    # EMT Rolling Stats
    df['emt_amount_rolling_avg_30d'] = (
        df['emt_credit'] + df['emt_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['emt_amount_rolling_avg_7d'] = (
        df['emt_credit'] + df['emt_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_emt_amount'] = (df['emt_credit'] + df['emt_debit']) / (
        df['emt_credit_transaction_count'] + df['emt_debit_transaction_count']
    )
    df['emt_rolling_sum_7d_ratio'] = (
        df['emt_credit'] + df['emt_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['emt_credit'] + df['emt_debit'])
    df['emt_active_days_ratio'] = df['emt_account_active_days'] / (
        df['emt_credit_transaction_count'] + df['emt_debit_transaction_count']
    )

    # Wire Rolling Stats
    df['wire_amount_rolling_avg_30d'] = (
        df['wire_credit'] + df['wire_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['wire_amount_rolling_avg_7d'] = (
        df['wire_credit'] + df['wire_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_wire_amount'] = (df['wire_credit'] + df['wire_debit']) / (
        df['wire_credit_transaction_count'] + df['wire_debit_transaction_count']
    )
    df['wire_rolling_sum_7d_ratio'] = (
        df['wire_credit'] + df['wire_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['wire_credit'] + df['wire_debit'])
    df['wire_active_days_ratio'] = df['wire_account_active_days'] / (
        df['wire_credit_transaction_count'] + df['wire_debit_transaction_count']
    )

    return df

# Example usage:
file_path_abm = 'abm.csv'  # Replace with your actual file path
file_path_card = 'card.csv'
file_path_eft = 'eft.csv'
file_path_cheque = 'cheque.csv'
file_path_emt = 'emt.csv'
file_path_wire = 'wire.csv'


In [8]:

aggregated_abm_df = aggregate_abm_data(file_path_abm)
aggregated_card_df = aggregate_card_data(file_path_card)
aggregated_eft_df = aggregate_eft_data(file_path_eft)
aggregated_cheque_df = aggregate_cheque_data(file_path_cheque)
aggregated_emt_df = aggregate_emt_data(file_path_emt)
aggregated_wire_df = aggregate_wire_data(file_path_wire)

In [9]:
# Combine the dataframes
from functools import reduce

if all(df is not None for df in [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df]):
    combined_df = reduce(lambda left, right: pd.merge(left, right, on=['customer_id'], how='outer'),
                         [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df])
    combined_df.fillna(0, inplace=True)  # Handle any missing values after merging

    # Calculate statistics for all transaction types
    combined_df = calculate_abm_stats(combined_df)
    combined_df = calculate_card_stats(combined_df)
    combined_df = calculate_eft_stats(combined_df)
    combined_df = calculate_cheque_stats(combined_df)
    combined_df = calculate_emt_stats(combined_df)
    combined_df = calculate_wire_stats(combined_df)

    # Calculate rolling statistics
    combined_df = calculate_rolling_stats(combined_df)

    print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
    print(combined_df.info())

| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | abm_credit_transaction_count   | abm_debit_transaction_count   | abm_average_credit   | abm_average_debit   | max_abm_credit   | max_abm_debit   | min_abm_credit   | min_abm_debit   | abm_account_active_days   | card_credit_transaction_count   | card_debit_transaction_count   | card_average_credit   | card_average_debit   | max_card_credit   | max_card_debit   | min_card_credit   | min_card_debit   | card_account_active_days   | eft_credit_transaction_count   | eft_debit_transaction_count   | eft_average_credit   | eft_average_debit   | max_eft_credit   | max_eft_debit   | min_eft_credit   | min_eft_debit   | eft_account_active_days   | cheque_credit_transaction_count   | cheque_debit_transaction_count   | cheque_average_credit   | cheque_average_debit   | max_cheque_credit   | max_ch

In [10]:
import pandas as pd
import numpy as np

def add_kyc_data(df):
    """
    Adds KYC and industry information to the combined dataframe.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added KYC and industry information.
    """
    df_kyc = pd.read_csv('kyc.csv')
    df_kyc_industry_codes = pd.read_csv('kyc_industry_codes.csv')
    # Check if 'industry_code' exists in df before merging
    if 'industry_code' in df.columns:
        # Ensure 'industry_code' in both dataframes is of the same type
        df['industry_code'] = df['industry_code'].astype(float)
        df_kyc_industry_codes['industry_code'] = df_kyc_industry_codes['industry_code'].astype(float)

        # Merge the industry information into the aggregated dataframe
        df = pd.merge(df, df_kyc_industry_codes, on='industry_code', how='left')
    else:
        print("Warning: 'industry_code' not found in combined_df. Skipping industry code merge.")

    # Merge the KYC  information into the aggregated dataframe
    df = pd.merge(df, df_kyc, on='customer_id', how='left') #This line was missing
    return df

In [11]:
# Add KYC data to combined_df
combined_df = add_kyc_data(combined_df)

# Print the first 5 rows of the combined dataframe with KYC data
print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(combined_df.info())

| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | abm_credit_transaction_count   | abm_debit_transaction_count   | abm_average_credit   | abm_average_debit   | max_abm_credit   | max_abm_debit   | min_abm_credit   | min_abm_debit   | abm_account_active_days   | card_credit_transaction_count   | card_debit_transaction_count   | card_average_credit   | card_average_debit   | max_card_credit   | max_card_debit   | min_card_credit   | min_card_debit   | card_account_active_days   | eft_credit_transaction_count   | eft_debit_transaction_count   | eft_average_credit   | eft_average_debit   | max_eft_credit   | max_eft_debit   | min_eft_credit   | min_eft_debit   | eft_account_active_days   | cheque_credit_transaction_count   | cheque_debit_transaction_count   | cheque_average_credit   | cheque_average_debit   | max_cheque_credit   | max_ch

In [12]:
import pandas as pd
import numpy as np

payment_methods = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# --- 1. Calculate log1p(account_active_days) for each transaction type ---

for method in payment_methods:
    active_days_col = f'{method}_account_active_days'
    log1p_active_days_col = f'log1p_{method}_account_active_days'

    # Apply log1p transformation to the 'account_active_days' column
    # Handle potential missing values (NaN) by filling with 0 before log1p
    combined_df[log1p_active_days_col] = combined_df[active_days_col].fillna(0).apply(np.log1p)


# --- 2. Interaction terms: log1p(account_active_days) * (total_transaction_amount / kyc_sales) ---

for method in payment_methods:
    recency_proxy_col = f'log1p_{method}_account_active_days' # Using active days as proxy for recency
    # Fix: Correctly concatenate column names for total volume
    volume_col = f'{method}_credit'  # Start with the credit column
    volume_col_debit = f'{method}_debit'
    interaction_col = f'interaction_active_days_volume_kyc_sales_{method}' # Descriptive interaction column name

    combined_df[interaction_col] = 0.0 # Initialize interaction column

    for index, row in combined_df.iterrows():
        recency_proxy_value = row[recency_proxy_col]
        # Fix: Access credit and debit columns separately and sum them
        volume_value = row[volume_col] + row[volume_col_debit]
        kyc_sales_value = row['sales'] # Assuming 'kyc_sales' column exists

        if pd.notna(recency_proxy_value) and pd.notna(volume_value) and pd.notna(kyc_sales_value):
            if kyc_sales_value != 0:
                interaction_value = recency_proxy_value * (volume_value / kyc_sales_value)
                combined_df.at[index, interaction_col] = interaction_value
            else:
                combined_df.at[index, interaction_col] = 0.0 # If kyc_sales is zero, interaction is zero



# Display the newly created columns (first few rows)
print(combined_df[[col for col in combined_df.columns if 'log1p_' in col or 'interaction_active_days' in col]].head())

   log1p_abm_account_active_days  log1p_card_account_active_days  \
0                            0.0                        0.000000   
1                            0.0                        3.891820   
2                            0.0                        0.000000   
3                            0.0                        4.465908   
4                            0.0                        0.000000   

   log1p_cheque_account_active_days  log1p_eft_account_active_days  \
0                          0.000000                       0.000000   
1                          0.000000                       0.000000   
2                          4.477337                       0.000000   
3                          0.000000                       0.000000   
4                          0.000000                       4.521789   

   log1p_emt_account_active_days  log1p_wire_account_active_days  \
0                            0.0                         3.78419   
1                            0.0  

In [13]:
import pandas as pd
import numpy as np

def calculate_card_stats(df):
    """
    Calculates card-specific statistics and adds them to the DataFrame.
    """
    # Read the card data again to access merchant_category and ecommerce_ind
    card_data = pd.read_csv('card.csv')

    # Calculate num_unique_card_merchant_categories
    # Group by 'customer_id' and get the number of unique merchant categories
    num_unique_categories = card_data.groupby('customer_id')['merchant_category'].nunique().reset_index()
    # Rename the column to 'num_unique_card_merchant_categories'
    num_unique_categories = num_unique_categories.rename(columns={'merchant_category': 'num_unique_card_merchant_categories'})

    # Merge num_unique_categories into df
    df = pd.merge(df, num_unique_categories, on='customer_id', how='left')

    # Calculate pct_card_ecommerce
    # Group by 'customer_id' and calculate the percentage of ecommerce transactions
    pct_ecommerce = card_data.groupby('customer_id')['ecommerce_ind'].mean().reset_index()
    # Rename the column to 'pct_card_ecommerce'
    pct_ecommerce = pct_ecommerce.rename(columns={'ecommerce_ind': 'pct_card_ecommerce'})

    # Merge pct_ecommerce into df
    df = pd.merge(df, pct_ecommerce, on='customer_id', how='left')

    return df

# Assuming 'combined_df' is already defined and populated
combined_df = calculate_card_stats(combined_df)

In [14]:
import numpy as np

# 1. log1p(kyc_employee_count)
combined_df['log1p_kyc_employee_count'] = np.log1p(combined_df['employee_count'])

# Display the updated dataframe
print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(combined_df.info())


| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | abm_credit_transaction_count   | abm_debit_transaction_count   | abm_average_credit   | abm_average_debit   | max_abm_credit   | max_abm_debit   | min_abm_credit   | min_abm_debit   | abm_account_active_days   | card_credit_transaction_count   | card_debit_transaction_count   | card_average_credit   | card_average_debit   | max_card_credit   | max_card_debit   | min_card_credit   | min_card_debit   | card_account_active_days   | eft_credit_transaction_count   | eft_debit_transaction_count   | eft_average_credit   | eft_average_debit   | max_eft_credit   | max_eft_debit   | min_eft_credit   | min_eft_debit   | eft_account_active_days   | cheque_credit_transaction_count   | cheque_debit_transaction_count   | cheque_average_credit   | cheque_average_debit   | max_cheque_credit   | max_ch

In [15]:
# Payment methods for iteration
payment_methods = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# Calculate credit debit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_credit_debit_ratio'] = combined_df['abm_credit'] / (combined_df['abm_debit'])
combined_df['card_credit_debit_ratio'] = combined_df['card_credit'] / (combined_df['card_debit'])
combined_df['cheque_credit_debit_ratio'] = combined_df['cheque_credit'] / (combined_df['cheque_debit'])
combined_df['eft_credit_debit_ratio'] = combined_df['eft_credit'] / (combined_df['eft_debit'])
combined_df['emt_credit_debit_ratio'] = combined_df['emt_credit'] / (combined_df['emt_debit'])
combined_df['wire_credit_debit_ratio'] = combined_df['wire_credit'] / (combined_df['wire_debit'])

# Calculate debit credit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_debit_credit_ratio'] = combined_df['abm_debit'] / (combined_df['abm_credit'])
combined_df['card_debit_credit_ratio'] = combined_df['card_debit'] / (combined_df['card_credit'])
combined_df['cheque_debit_credit_ratio'] = combined_df['cheque_debit'] / (combined_df['cheque_credit'])
combined_df['eft_debit_credit_ratio'] = combined_df['eft_debit'] / (combined_df['eft_credit'])
combined_df['emt_debit_credit_ratio'] = combined_df['emt_debit'] / (combined_df['emt_credit'])
combined_df['wire_debit_credit_ratio'] = combined_df['wire_debit'] / (combined_df['wire_credit'])

# Replace infinite values with 0 (this line is still good as a general cleanup)
combined_df.replace([float('inf'), float('-inf')], 0, inplace=True)

In [16]:
print(combined_df)

            customer_id  abm_credit  abm_debit  card_credit  card_debit  \
0      SYNCID0000000000         0.0        0.0         0.00        0.00   
1      SYNCID0000000001         0.0        0.0         0.00      291.39   
2      SYNCID0000000002         0.0        0.0         0.00        0.00   
3      SYNCID0000000004         0.0        0.0      8805.86     7534.12   
4      SYNCID0000000005         0.0        0.0         0.00        0.00   
...                 ...         ...        ...          ...         ...   
16221  SYNCID0000017178         0.0        0.0         0.00        0.00   
16222  SYNCID0000017179         0.0        0.0         0.00        0.00   
16223  SYNCID0000017180         0.0        0.0         0.00        0.00   
16224  SYNCID0000017181         0.0        0.0         0.00        0.00   
16225  SYNCID0000017182         0.0        0.0         0.00        0.00   

       eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
0            0.00    

In [17]:
# Replace NaN values with 0 in specified columns
combined_df['num_unique_card_merchant_categories'].fillna(0, inplace=True)
combined_df['pct_card_ecommerce'].fillna(0, inplace=True)
combined_df['log1p_kyc_employee_count'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['num_unique_card_merchant_categories'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['pct_card_ecommerce'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat

In [18]:
print(combined_df[combined_df['customer_id'] == 'SYNCID0000005244'])

           customer_id  abm_credit  abm_debit  card_credit  card_debit  \
4953  SYNCID0000005244     4235.09     886.93     22699.75    79740.08   

      eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
4953     1523.06    3984.26       10109.06       2750.65    31901.87  ...   

      cheque_credit_debit_ratio  eft_credit_debit_ratio  \
4953                   3.675153                0.382269   

      emt_credit_debit_ratio  wire_credit_debit_ratio  abm_debit_credit_ratio  \
4953                0.648714                 0.471693                0.209424   

      card_debit_credit_ratio  cheque_debit_credit_ratio  \
4953                 3.512818                   0.272098   

      eft_debit_credit_ratio  emt_debit_credit_ratio  wire_debit_credit_ratio  
4953                2.615957                 1.54151                 2.120021  

[1 rows x 132 columns]


In [19]:
combined_df.to_csv('combined_df.csv', index=False)

In [20]:


print(list(combined_df.columns))


['customer_id', 'abm_credit', 'abm_debit', 'card_credit', 'card_debit', 'eft_credit', 'eft_debit', 'cheque_credit', 'cheque_debit', 'emt_credit', 'emt_debit', 'wire_credit', 'wire_debit', 'abm_credit_transaction_count', 'abm_debit_transaction_count', 'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit', 'abm_account_active_days', 'card_credit_transaction_count', 'card_debit_transaction_count', 'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit', 'card_account_active_days', 'eft_credit_transaction_count', 'eft_debit_transaction_count', 'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit', 'eft_account_active_days', 'cheque_credit_transaction_count', 'cheque_debit_transaction_count', 'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_de

In [21]:
print(combined_df)

            customer_id  abm_credit  abm_debit  card_credit  card_debit  \
0      SYNCID0000000000         0.0        0.0         0.00        0.00   
1      SYNCID0000000001         0.0        0.0         0.00      291.39   
2      SYNCID0000000002         0.0        0.0         0.00        0.00   
3      SYNCID0000000004         0.0        0.0      8805.86     7534.12   
4      SYNCID0000000005         0.0        0.0         0.00        0.00   
...                 ...         ...        ...          ...         ...   
16221  SYNCID0000017178         0.0        0.0         0.00        0.00   
16222  SYNCID0000017179         0.0        0.0         0.00        0.00   
16223  SYNCID0000017180         0.0        0.0         0.00        0.00   
16224  SYNCID0000017181         0.0        0.0         0.00        0.00   
16225  SYNCID0000017182         0.0        0.0         0.00        0.00   

       eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
0            0.00    

In [22]:
combined_df.fillna(0, inplace=True)

In [23]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}

# Dictionary to store anomalies for each group
all_group_anomalies = {}

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())


    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    silhouette_scores = []
    cluster_range = range(2, 21)  # Increased range
    for k in cluster_range:
        model = KMeans(n_clusters=k, n_init=20, random_state=1, max_iter=500)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    optimal_k = cluster_range[np.argmax(silhouette_scores)]
    print(f"  Optimal number of clusters: {optimal_k}")

    model = KMeans(n_clusters=optimal_k, n_init=20, random_state=4, max_iter=500)
    cluster_numbers = model.fit_predict(data_scaled)
    df_group['Cluster'] = cluster_numbers

    group_anomalies = []  # Store anomalies *for this group*
    for i in range(optimal_k):
        cluster_data = df_group[df_group['Cluster'] == i].drop(columns=['Cluster'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies[group_name] = group_anomalies
    print(f"Anomalies for {group_name}: {len(group_anomalies)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups ---
all_anomalies_combined = []
for anomalies in all_group_anomalies.values():
    all_anomalies_combined.extend(anomalies)

unique_anomalies = list(set(all_anomalies_combined))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups: {len(unique_anomalies)}")
print(f"Unique anomaly customer IDs: {unique_anomalies}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts):")
for group_name, anomalies in all_group_anomalies.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_transaction_counts
  Optimal number of clusters: 2
Anomalies for group1_transaction_counts: 163

Processing group: group2_transaction_amounts
  Optimal number of clusters: 2
Anomalies for group2_transaction_amounts: 163

Processing group: group3_transaction_timing
  Optimal number of clusters: 20
Anomalies for group3_transaction_timing: 174

Processing group: group4_credit_debit_ratios
  Optimal number of clusters: 3
Anomalies for group4_credit_debit_ratios: 165

Processing group: group5_overall_amount_averages
  Optimal number of clusters: 3
Anomalies for group5_overall_amount_averages: 163

Processing group: group7_log_active_days
  Optimal number of clusters: 20
Anomalies for group7_log_active_days: 170

Total number of unique anomalies across all groups: 788
Unique anomaly customer IDs: ['SYNCID0000013895', 'SYNCID0000009082', 'SYNCID0000000871', 'SYNCID0000007656', 'SYNCID0000015917', 'SYNCID0000003382', 'SYNCID0000003100', 'SYNCID0000014090', 'SYNCID0000

In [24]:
len(unique_anomalies)

788

In [25]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

variable_groups_ml = {
    'group1_high_value_transactions': [
        'max_abm_credit', 'max_abm_debit', 'max_card_credit', 'max_card_debit',
        'max_eft_credit', 'max_eft_debit', 'max_cheque_credit', 'max_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'max_wire_credit', 'max_wire_debit'
    ],
    'group2_frequent_transactions': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group3_rapid_velocity': [
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d',
        'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d',
        'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio',
        'eft_rolling_sum_7d_ratio', 'cheque_rolling_sum_7d_ratio',
        'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group4_inconsistent_activity': [
         'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio',
         'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
         'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio',
         'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio',
        'abm_active_days_ratio', 'card_active_days_ratio',
        'eft_active_days_ratio', 'cheque_active_days_ratio',
        'emt_active_days_ratio', 'wire_active_days_ratio'
    ],
    'group5_round_number_transactions': [  # Placeholder - Requires feature engineering
        'abm_average_credit', 'abm_average_debit',
        'card_average_credit', 'card_average_debit',
    ],
     'group6_international_transactions' : [
         'wire_credit_transaction_count', 'wire_debit_transaction_count',
         'wire_average_credit', 'wire_average_debit', 'max_wire_credit',
         'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
         'wire_amount_rolling_avg_30d','wire_amount_rolling_avg_7d',
         'wire_rolling_sum_7d_ratio','wire_active_days_ratio',
         'wire_credit_debit_ratio', 'wire_debit_credit_ratio',
         'avg_wire_amount','wire_account_active_days',
         'log1p_wire_account_active_days'

     ]
}

# Dictionary to store anomalies for each group
all_group_anomalies = {}

# Process each group
for group_name, columns in variable_groups_ml.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())


    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    silhouette_scores = []
    cluster_range = range(2, 21)  # Increased range
    for k in cluster_range:
        model = KMeans(n_clusters=k, n_init=20, random_state=1, max_iter=500)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    optimal_k = cluster_range[np.argmax(silhouette_scores)]
    print(f"  Optimal number of clusters: {optimal_k}")

    model = KMeans(n_clusters=optimal_k, n_init=20, random_state=4, max_iter=500)
    cluster_numbers = model.fit_predict(data_scaled)
    df_group['Cluster'] = cluster_numbers

    group_anomalies = []  # Store anomalies *for this group*
    for i in range(optimal_k):
        cluster_data = df_group[df_group['Cluster'] == i].drop(columns=['Cluster'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies[group_name] = group_anomalies
    print(f"Anomalies for {group_name}: {len(group_anomalies)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups ---
all_anomalies_combined = []
for anomalies in all_group_anomalies.values():
    all_anomalies_combined.extend(anomalies)

unique_anomalies_strategy_2 = list(set(all_anomalies_combined))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups: {len(unique_anomalies)}")
print(f"Unique anomaly customer IDs: {unique_anomalies}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts):")
for group_name, anomalies in all_group_anomalies.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_high_value_transactions
  Optimal number of clusters: 2
Anomalies for group1_high_value_transactions: 163

Processing group: group2_frequent_transactions
  Optimal number of clusters: 2
Anomalies for group2_frequent_transactions: 163

Processing group: group3_rapid_velocity
  Optimal number of clusters: 3
Anomalies for group3_rapid_velocity: 163

Processing group: group4_inconsistent_activity
  Optimal number of clusters: 7
Anomalies for group4_inconsistent_activity: 166

Processing group: group5_round_number_transactions
  Optimal number of clusters: 2
Anomalies for group5_round_number_transactions: 163

Processing group: group6_international_transactions
  Optimal number of clusters: 2
Anomalies for group6_international_transactions: 163

Total number of unique anomalies across all groups: 788
Unique anomaly customer IDs: ['SYNCID0000013895', 'SYNCID0000009082', 'SYNCID0000000871', 'SYNCID0000007656', 'SYNCID0000015917', 'SYNCID0000003382', 'SYNCID0000003100

In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

variable_groups_v3 = {
    'group_abm_profile': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'abm_account_active_days', 'avg_abm_amount', 'abm_credit', 'abm_debit',
        'abm_credit_debit_ratio', 'abm_debit_credit_ratio'
    ],
    'group_card_profile': [
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'card_account_active_days', 'avg_card_amount', 'card_credit', 'card_debit',
        'card_credit_debit_ratio', 'card_debit_credit_ratio'
    ],
    'group_eft_profile': [
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'eft_account_active_days', 'avg_eft_amount', 'eft_credit', 'eft_debit',
        'eft_credit_debit_ratio', 'eft_debit_credit_ratio'
    ],
    'group_cheque_profile': [
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'cheque_account_active_days', 'avg_cheque_amount', 'cheque_credit', 'cheque_debit',
        'cheque_credit_debit_ratio', 'cheque_debit_credit_ratio'
    ],
    'group_emt_profile': [
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'emt_account_active_days', 'avg_emt_amount', 'emt_credit', 'emt_debit',
        'emt_credit_debit_ratio', 'emt_debit_credit_ratio'
    ],
    'group_wire_profile': [
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'wire_account_active_days', 'avg_wire_amount', 'wire_credit', 'wire_debit',
        'wire_credit_debit_ratio', 'wire_debit_credit_ratio'
    ],
    'group_transaction_intensity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_account_duration': [
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days',
        'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days',
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ],
    'group_amount_rolling_ratios': [
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_extreme_amounts': [
        'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit'
    ],
    'group_low_activity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_active_days_ratio', 'card_active_days_ratio', 'eft_active_days_ratio',
        'cheque_active_days_ratio', 'emt_active_days_ratio', 'wire_active_days_ratio',
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount'
    ]
}


# Dictionary to store anomalies for each group
all_group_anomalies = {}

# Process each group
for group_name, columns in variable_groups_v3.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())


    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    silhouette_scores = []
    cluster_range = range(2, 21)  # Increased range
    for k in cluster_range:
        model = KMeans(n_clusters=k, n_init=20, random_state=1, max_iter=500)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    optimal_k = cluster_range[np.argmax(silhouette_scores)]
    print(f"  Optimal number of clusters: {optimal_k}")

    model = KMeans(n_clusters=optimal_k, n_init=20, random_state=4, max_iter=500)
    cluster_numbers = model.fit_predict(data_scaled)
    df_group['Cluster'] = cluster_numbers

    group_anomalies = []  # Store anomalies *for this group*
    for i in range(optimal_k):
        cluster_data = df_group[df_group['Cluster'] == i].drop(columns=['Cluster'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies[group_name] = group_anomalies
    print(f"Anomalies for {group_name}: {len(group_anomalies)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups ---
all_anomalies_combined = []
for anomalies in all_group_anomalies.values():
    all_anomalies_combined.extend(anomalies)

unique_anomalies_strategy_3 = list(set(all_anomalies_combined))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups: {len(unique_anomalies)}")
print(f"Unique anomaly customer IDs: {unique_anomalies}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts):")
for group_name, anomalies in all_group_anomalies.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group_abm_profile
  Optimal number of clusters: 2
Anomalies for group_abm_profile: 163

Processing group: group_card_profile
  Optimal number of clusters: 2
Anomalies for group_card_profile: 163

Processing group: group_eft_profile
  Optimal number of clusters: 2
Anomalies for group_eft_profile: 163

Processing group: group_cheque_profile
  Optimal number of clusters: 2
Anomalies for group_cheque_profile: 163

Processing group: group_emt_profile
  Optimal number of clusters: 2
Anomalies for group_emt_profile: 163

Processing group: group_wire_profile
  Optimal number of clusters: 3
Anomalies for group_wire_profile: 164

Processing group: group_transaction_intensity
  Optimal number of clusters: 2
Anomalies for group_transaction_intensity: 163

Processing group: group_account_duration
  Optimal number of clusters: 20
Anomalies for group_account_duration: 173

Processing group: group_amount_rolling_ratios
  Optimal number of clusters: 3
Anomalies for group_amount_rolli

In [27]:
len(unique_anomalies)+len(unique_anomalies_strategy_2)+len(unique_anomalies_strategy_3)

3049

In [28]:
anamolies = (unique_anomalies)+(unique_anomalies_strategy_2)+(unique_anomalies_strategy_3)

In [29]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in anamolies else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")


Contingency Table (Anomaly Status vs. industry_code):

industry_code     0  0112  0115  0119  0122  0129  0131  0132  0133  0139  \
anomaly_status                                                              
Anomaly          50     4     0     3     2     3     3     0     1     2   
Non-Anomaly     337    57    15    12    29    20    10    11    13    23   

industry_code   ...  9931  9942  9949  9951  9952  9953  9959  9961  9999  \
anomaly_status  ...                                                         
Anomaly         ...     5     6     3     1     1    16     9     4    11   
Non-Anomaly     ...    32    30    35    15    10   126    55    17    83   

industry_code   other  
anomaly_status         
Anomaly           239  
Non-Anomaly      1692  

[2 rows x 253 columns]

Chi-squared statistic: 364.8448
P-value: 0.0000
Degrees of freedom: 252

Significance level (alpha): 0.05
Reject null hypothesis (p < 0.05).
There is a statistically significant association between Anomaly

In [30]:
# prompt: find commaon anomolies in both unique_anomalies_strategy_2 and unique_anomalies. also add uuique anomolies in each

common_anomalies = list(set(unique_anomalies) & set(unique_anomalies_strategy_2)& set(unique_anomalies_strategy_3))
unique_to_strategy_1 = list(set(unique_anomalies) - set(unique_anomalies_strategy_2)-set(unique_anomalies_strategy_3))
unique_to_strategy_2 = list(set(unique_anomalies_strategy_2) - set(unique_anomalies)-set(unique_anomalies_strategy_3))
unique_to_strategy_3 = list(set(unique_anomalies_strategy_3)-set(unique_anomalies_strategy_2) - set(unique_anomalies))
print(f"Common Anomalies: {common_anomalies}")
print(f"Number of Common Anomalies: {len(common_anomalies)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3)}")


Common Anomalies: ['SYNCID0000001064', 'SYNCID0000000871', 'SYNCID0000006429', 'SYNCID0000015688', 'SYNCID0000007029', 'SYNCID0000003100', 'SYNCID0000014090', 'SYNCID0000012732', 'SYNCID0000002736', 'SYNCID0000010770', 'SYNCID0000008299', 'SYNCID0000007031', 'SYNCID0000010138', 'SYNCID0000002396', 'SYNCID0000004994', 'SYNCID0000005538', 'SYNCID0000002923', 'SYNCID0000013984', 'SYNCID0000007082', 'SYNCID0000003966', 'SYNCID0000013957', 'SYNCID0000009260', 'SYNCID0000005967', 'SYNCID0000011150', 'SYNCID0000013392', 'SYNCID0000008051', 'SYNCID0000004446', 'SYNCID0000013129', 'SYNCID0000005809', 'SYNCID0000005779', 'SYNCID0000011167', 'SYNCID0000007519', 'SYNCID0000001070', 'SYNCID0000013011', 'SYNCID0000008509', 'SYNCID0000005730', 'SYNCID0000000545', 'SYNCID0000001889', 'SYNCID0000015466', 'SYNCID0000005498', 'SYNCID0000014308', 'SYNCID0000011409', 'SYNCID0000005244', 'SYNCID0000005293', 'SYNCID0000002220', 'SYNCID0000015228', 'SYNCID0000002810', 'SYNCID0000004428', 'SYNCID0000007646', '

In [31]:
len(common_anomalies)+len(unique_to_strategy_1)+len(unique_to_strategy_2)+len(unique_to_strategy_3)

1525

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}


# Dictionary to store anomalies for each group
all_group_anomalies_birch = {}

# Custom Birch Clustering class and silhouette scorer (from user's notebook)
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    # Parameter grid for Birch
    param_grid = {
        'threshold': [0.1, 0.3, 0.5, 0.7],
        'branching_factor': [5, 10, 25, 50, 75],
        'n_clusters': [None, 3, 4, 5]}

    # Grid search for optimal Birch parameters
    grid_search = GridSearchCV(BirchClustering(), param_grid, cv=3, scoring=silhouette_scorer, n_jobs=-1) # Reduced cv for speed
    grid_search.fit(data_scaled)

    best_params_birch = grid_search.best_params_
    best_birch_model = grid_search.best_estimator_
    print("  Best Birch Parameters:", best_params_birch)

    # Use Birch with optimal parameters
    birch_model = BirchClustering(**best_params_birch) # Use best parameters
    cluster_numbers_birch = birch_model.fit_predict(data_scaled)
    df_group['ClusterBIRCH'] = cluster_numbers_birch

    group_anomalies_birch = []  # Store anomalies *for this group*
    unique_clusters_birch = df_group['ClusterBIRCH'].unique() # Get unique cluster labels
    for i in unique_clusters_birch: # Iterate through unique cluster labels
        cluster_data = df_group[df_group['ClusterBIRCH'] == i].drop(columns=['ClusterBIRCH'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue # This line was missing proper indentation causing the error.
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies_birch.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies_birch[group_name] = group_anomalies_birch
    print(f"Anomalies for {group_name} using Birch: {len(group_anomalies_birch)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups using Birch ---
all_anomalies_combined_birch = []
for anomalies in all_group_anomalies_birch.values():
    all_anomalies_combined_birch.extend(anomalies)

unique_anomalies_birch = list(set(all_anomalies_combined_birch))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups using Birch: {len(unique_anomalies_birch)}")
print(f"Unique anomaly customer IDs (Birch): {unique_anomalies_birch}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts) using Birch:")
for group_name, anomalies in all_group_anomalies_birch.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_transaction_counts
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group1_transaction_counts using Birch: 164

Processing group: group2_transaction_amounts
  Best Birch Parameters: {'branching_factor': 10, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group2_transaction_amounts using Birch: 164

Processing group: group3_transaction_timing


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_high_value_transactions': [
        'max_abm_credit', 'max_abm_debit', 'max_card_credit', 'max_card_debit',
        'max_eft_credit', 'max_eft_debit', 'max_cheque_credit', 'max_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'max_wire_credit', 'max_wire_debit'
    ],
    'group2_frequent_transactions': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group3_rapid_velocity': [
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d',
        'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d',
        'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio',
        'eft_rolling_sum_7d_ratio', 'cheque_rolling_sum_7d_ratio',
        'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group4_inconsistent_activity': [
         'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio',
         'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
         'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio',
         'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio',
        'abm_active_days_ratio', 'card_active_days_ratio',
        'eft_active_days_ratio', 'cheque_active_days_ratio',
        'emt_active_days_ratio', 'wire_active_days_ratio'
    ],
    'group5_round_number_transactions': [  # Placeholder - Requires feature engineering
        'abm_average_credit', 'abm_average_debit',
        'card_average_credit', 'card_average_debit',
    ],
     'group6_international_transactions' : [
         'wire_credit_transaction_count', 'wire_debit_transaction_count',
         'wire_average_credit', 'wire_average_debit', 'max_wire_credit',
         'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
         'wire_amount_rolling_avg_30d','wire_amount_rolling_avg_7d',
         'wire_rolling_sum_7d_ratio','wire_active_days_ratio',
         'wire_credit_debit_ratio', 'wire_debit_credit_ratio',
         'avg_wire_amount','wire_account_active_days',
         'log1p_wire_account_active_days'

     ]
}


# Dictionary to store anomalies for each group
all_group_anomalies_birch = {}

# Custom Birch Clustering class and silhouette scorer (from user's notebook)
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    # Parameter grid for Birch
    param_grid = {
        'threshold': [0.1, 0.3, 0.5, 0.7],
        'branching_factor': [5, 10, 25, 50, 75],
        'n_clusters': [None, 3, 4, 5]}

    # Grid search for optimal Birch parameters
    grid_search = GridSearchCV(BirchClustering(), param_grid, cv=3, scoring=silhouette_scorer, n_jobs=-1) # Reduced cv for speed
    grid_search.fit(data_scaled)

    best_params_birch = grid_search.best_params_
    best_birch_model = grid_search.best_estimator_
    print("  Best Birch Parameters:", best_params_birch)

    # Use Birch with optimal parameters
    birch_model = BirchClustering(**best_params_birch) # Use best parameters
    cluster_numbers_birch = birch_model.fit_predict(data_scaled)
    df_group['ClusterBIRCH'] = cluster_numbers_birch

    group_anomalies_birch = []  # Store anomalies *for this group*
    unique_clusters_birch = df_group['ClusterBIRCH'].unique() # Get unique cluster labels
    for i in unique_clusters_birch: # Iterate through unique cluster labels
        cluster_data = df_group[df_group['ClusterBIRCH'] == i].drop(columns=['ClusterBIRCH'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue # This line was missing proper indentation causing the error.
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies_birch.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies_birch[group_name] = group_anomalies_birch
    print(f"Anomalies for {group_name} using Birch: {len(group_anomalies_birch)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups using Birch ---
all_anomalies_combined_birch = []
for anomalies in all_group_anomalies_birch.values():
    all_anomalies_combined_birch.extend(anomalies)

unique_anomalies_birch_2 = list(set(all_anomalies_combined_birch))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups using Birch: {len(unique_anomalies_birch)}")
print(f"Unique anomaly customer IDs (Birch): {unique_anomalies_birch}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts) using Birch:")
for group_name, anomalies in all_group_anomalies_birch.items():
    print(f"{group_name}: {len(anomalies)}")

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()


variable_groups = {
    'group_abm_profile': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'abm_account_active_days', 'avg_abm_amount', 'abm_credit', 'abm_debit',
        'abm_credit_debit_ratio', 'abm_debit_credit_ratio'
    ],
    'group_card_profile': [
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'card_account_active_days', 'avg_card_amount', 'card_credit', 'card_debit',
        'card_credit_debit_ratio', 'card_debit_credit_ratio'
    ],
    'group_eft_profile': [
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'eft_account_active_days', 'avg_eft_amount', 'eft_credit', 'eft_debit',
        'eft_credit_debit_ratio', 'eft_debit_credit_ratio'
    ],
    'group_cheque_profile': [
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'cheque_account_active_days', 'avg_cheque_amount', 'cheque_credit', 'cheque_debit',
        'cheque_credit_debit_ratio', 'cheque_debit_credit_ratio'
    ],
    'group_emt_profile': [
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'emt_account_active_days', 'avg_emt_amount', 'emt_credit', 'emt_debit',
        'emt_credit_debit_ratio', 'emt_debit_credit_ratio'
    ],
    'group_wire_profile': [
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'wire_account_active_days', 'avg_wire_amount', 'wire_credit', 'wire_debit',
        'wire_credit_debit_ratio', 'wire_debit_credit_ratio'
    ],
    'group_transaction_intensity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_account_duration': [
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days',
        'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days',
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ],
    'group_amount_rolling_ratios': [
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_extreme_amounts': [
        'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit'
    ],
    'group_low_activity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_active_days_ratio', 'card_active_days_ratio', 'eft_active_days_ratio',
        'cheque_active_days_ratio', 'emt_active_days_ratio', 'wire_active_days_ratio',
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount'
    ]
}


# Dictionary to store anomalies for each group
all_group_anomalies_birch = {}

# Custom Birch Clustering class and silhouette scorer (from user's notebook)
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    # Parameter grid for Birch
    param_grid = {
        'threshold': [0.1, 0.3, 0.5, 0.7],
        'branching_factor': [5, 10, 25, 50, 75],
        'n_clusters': [None, 3, 4, 5]}

    # Grid search for optimal Birch parameters
    grid_search = GridSearchCV(BirchClustering(), param_grid, cv=3, scoring=silhouette_scorer, n_jobs=-1) # Reduced cv for speed
    grid_search.fit(data_scaled)

    best_params_birch = grid_search.best_params_
    best_birch_model = grid_search.best_estimator_
    print("  Best Birch Parameters:", best_params_birch)

    # Use Birch with optimal parameters
    birch_model = BirchClustering(**best_params_birch) # Use best parameters
    cluster_numbers_birch = birch_model.fit_predict(data_scaled)
    df_group['ClusterBIRCH'] = cluster_numbers_birch

    group_anomalies_birch = []  # Store anomalies *for this group*
    unique_clusters_birch = df_group['ClusterBIRCH'].unique() # Get unique cluster labels
    for i in unique_clusters_birch: # Iterate through unique cluster labels
        cluster_data = df_group[df_group['ClusterBIRCH'] == i].drop(columns=['ClusterBIRCH'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue # This line was missing proper indentation causing the error.
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies_birch.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies_birch[group_name] = group_anomalies_birch
    print(f"Anomalies for {group_name} using Birch: {len(group_anomalies_birch)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups using Birch ---
all_anomalies_combined_birch = []
for anomalies in all_group_anomalies_birch.values():
    all_anomalies_combined_birch.extend(anomalies)

unique_anomalies_birch_3 = list(set(all_anomalies_combined_birch))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups using Birch: {len(unique_anomalies_birch)}")
print(f"Unique anomaly customer IDs (Birch): {unique_anomalies_birch}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts) using Birch:")
for group_name, anomalies in all_group_anomalies_birch.items():
    print(f"{group_name}: {len(anomalies)}")

In [None]:
len(unique_anomalies_birch)+len(unique_anomalies_birch_2)+len(unique_anomalies_birch_3)

In [None]:
anamolies_birch = (unique_anomalies_birch)+(unique_anomalies_birch_2)+(unique_anomalies_birch_3)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in anamolies_birch else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")

In [None]:
# prompt: find commaon anomolies in both unique_anomalies_strategy_2 and unique_anomalies. also add uuique anomolies in each

common_anomalies_birch = list(set(unique_anomalies_birch) & set(unique_anomalies_birch_2)& set(unique_anomalies_birch_3))
unique_to_strategy_1_birch = list(set(unique_anomalies_birch) - set(unique_anomalies_birch_2)-set(unique_anomalies_birch_3))
unique_to_strategy_2_birch = list(set(unique_anomalies_birch_2) - set(unique_anomalies_birch)-set(unique_anomalies_birch_3))
unique_to_strategy_3_birch = list(set(unique_anomalies_birch_3)-set(unique_anomalies_birch_2) - set(unique_anomalies_birch))
print(f"Common Anomalies: {common_anomalies_birch}")
print(f"Number of Common Anomalies: {len(common_anomalies_birch)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_birch}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_birch)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_birch}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_birch)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3_birch}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3_birch)}")


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}

# --- Autoencoder Anomaly Detection Function ---
def detect_anomalies_autoencoder(df, variable_groups):
    all_group_anomalies_ae = {}

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2) # Adjusted hidden dimension
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder
        for epoch in range(50): # Reduced epochs
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()
            #print(f'Epoch {epoch+1}, Loss: {loss.item()}') # Optional: Print epoch loss

        # Calculate reconstruction error
        with torch.no_grad():
            reconstructed = model_ae(X_tensor)
            reconstruction_error = torch.mean((X_tensor - reconstructed) ** 2, dim=1)

        threshold_ae = torch.quantile(reconstruction_error, 0.995) # 95th percentile threshold
        outliers_ae = reconstruction_error > threshold_ae
        anomaly_customer_ids_ae = df_group['customer_id'][outliers_ae.numpy()].tolist()

        all_group_anomalies_ae[group_name] = anomaly_customer_ids_ae
        print(f"Anomalies for {group_name} using Autoencoder: {len(anomaly_customer_ids_ae)}")

    # Combine and find unique anomalies across ALL groups for Autoencoder
    all_anomalies_combined_ae = []
    for anomalies in all_group_anomalies_ae.values():
        all_anomalies_combined_ae.extend(anomalies)
    unique_anomalies_ae = list(set(all_anomalies_combined_ae))

    print(f"\nTotal number of unique anomalies across all groups using Autoencoder: {len(unique_anomalies_ae)}")
    print(f"Unique anomaly customer IDs (Autoencoder): {unique_anomalies_ae}")

    print("\nAnomalies per group (counts) using Autoencoder:")
    for group_name, anomalies in all_group_anomalies_ae.items():
        print(f"{group_name}: {len(anomalies)}")

    return unique_anomalies_ae, all_group_anomalies_ae


class Autoencoder(nn.Module): # Autoencoder class - keep it outside the function for reusability if needed later
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# --- Run Autoencoder Anomaly Detection ---
unique_anomalies_ae, all_group_anomalies_ae = detect_anomalies_autoencoder(df, variable_groups)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_high_value_transactions': [
        'max_abm_credit', 'max_abm_debit', 'max_card_credit', 'max_card_debit',
        'max_eft_credit', 'max_eft_debit', 'max_cheque_credit', 'max_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'max_wire_credit', 'max_wire_debit'
    ],
    'group2_frequent_transactions': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group3_rapid_velocity': [
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d',
        'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d',
        'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio',
        'eft_rolling_sum_7d_ratio', 'cheque_rolling_sum_7d_ratio',
        'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group4_inconsistent_activity': [
         'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio',
         'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
         'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio',
         'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio',
        'abm_active_days_ratio', 'card_active_days_ratio',
        'eft_active_days_ratio', 'cheque_active_days_ratio',
        'emt_active_days_ratio', 'wire_active_days_ratio'
    ],
    'group5_round_number_transactions': [  # Placeholder - Requires feature engineering
        'abm_average_credit', 'abm_average_debit',
        'card_average_credit', 'card_average_debit',
    ],
     'group6_international_transactions' : [
         'wire_credit_transaction_count', 'wire_debit_transaction_count',
         'wire_average_credit', 'wire_average_debit', 'max_wire_credit',
         'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
         'wire_amount_rolling_avg_30d','wire_amount_rolling_avg_7d',
         'wire_rolling_sum_7d_ratio','wire_active_days_ratio',
         'wire_credit_debit_ratio', 'wire_debit_credit_ratio',
         'avg_wire_amount','wire_account_active_days',
         'log1p_wire_account_active_days'

     ]
}

# --- Autoencoder Anomaly Detection Function ---
def detect_anomalies_autoencoder(df, variable_groups):
    all_group_anomalies_ae = {}

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2) # Adjusted hidden dimension
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder
        for epoch in range(50): # Reduced epochs
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()
            #print(f'Epoch {epoch+1}, Loss: {loss.item()}') # Optional: Print epoch loss

        # Calculate reconstruction error
        with torch.no_grad():
            reconstructed = model_ae(X_tensor)
            reconstruction_error = torch.mean((X_tensor - reconstructed) ** 2, dim=1)

        threshold_ae = torch.quantile(reconstruction_error, 0.995) # 95th percentile threshold
        outliers_ae = reconstruction_error > threshold_ae
        anomaly_customer_ids_ae = df_group['customer_id'][outliers_ae.numpy()].tolist()

        all_group_anomalies_ae[group_name] = anomaly_customer_ids_ae
        print(f"Anomalies for {group_name} using Autoencoder: {len(anomaly_customer_ids_ae)}")

    # Combine and find unique anomalies across ALL groups for Autoencoder
    all_anomalies_combined_ae = []
    for anomalies in all_group_anomalies_ae.values():
        all_anomalies_combined_ae.extend(anomalies)
    unique_anomalies_ae = list(set(all_anomalies_combined_ae))

    print(f"\nTotal number of unique anomalies across all groups using Autoencoder: {len(unique_anomalies_ae)}")
    print(f"Unique anomaly customer IDs (Autoencoder): {unique_anomalies_ae}")

    print("\nAnomalies per group (counts) using Autoencoder:")
    for group_name, anomalies in all_group_anomalies_ae.items():
        print(f"{group_name}: {len(anomalies)}")

    return unique_anomalies_ae, all_group_anomalies_ae


class Autoencoder(nn.Module): # Autoencoder class - keep it outside the function for reusability if needed later
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# --- Run Autoencoder Anomaly Detection ---
unique_anomalies_ae_2, all_group_anomalies_ae_2 = detect_anomalies_autoencoder(df, variable_groups)


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group_abm_profile': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'abm_account_active_days', 'avg_abm_amount', 'abm_credit', 'abm_debit',
        'abm_credit_debit_ratio', 'abm_debit_credit_ratio'
    ],
    'group_card_profile': [
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'card_account_active_days', 'avg_card_amount', 'card_credit', 'card_debit',
        'card_credit_debit_ratio', 'card_debit_credit_ratio'
    ],
    'group_eft_profile': [
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'eft_account_active_days', 'avg_eft_amount', 'eft_credit', 'eft_debit',
        'eft_credit_debit_ratio', 'eft_debit_credit_ratio'
    ],
    'group_cheque_profile': [
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'cheque_account_active_days', 'avg_cheque_amount', 'cheque_credit', 'cheque_debit',
        'cheque_credit_debit_ratio', 'cheque_debit_credit_ratio'
    ],
    'group_emt_profile': [
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'emt_account_active_days', 'avg_emt_amount', 'emt_credit', 'emt_debit',
        'emt_credit_debit_ratio', 'emt_debit_credit_ratio'
    ],
    'group_wire_profile': [
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'wire_account_active_days', 'avg_wire_amount', 'wire_credit', 'wire_debit',
        'wire_credit_debit_ratio', 'wire_debit_credit_ratio'
    ],
    'group_transaction_intensity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_account_duration': [
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days',
        'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days',
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ],
    'group_amount_rolling_ratios': [
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_extreme_amounts': [
        'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit'
    ],
    'group_low_activity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_active_days_ratio', 'card_active_days_ratio', 'eft_active_days_ratio',
        'cheque_active_days_ratio', 'emt_active_days_ratio', 'wire_active_days_ratio',
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount'
    ]
}

# --- Autoencoder Anomaly Detection Function ---
def detect_anomalies_autoencoder(df, variable_groups):
    all_group_anomalies_ae = {}

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2) # Adjusted hidden dimension
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder
        for epoch in range(50): # Reduced epochs
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()
            #print(f'Epoch {epoch+1}, Loss: {loss.item()}') # Optional: Print epoch loss

        # Calculate reconstruction error
        with torch.no_grad():
            reconstructed = model_ae(X_tensor)
            reconstruction_error = torch.mean((X_tensor - reconstructed) ** 2, dim=1)

        threshold_ae = torch.quantile(reconstruction_error, 0.995) # 95th percentile threshold
        outliers_ae = reconstruction_error > threshold_ae
        anomaly_customer_ids_ae = df_group['customer_id'][outliers_ae.numpy()].tolist()

        all_group_anomalies_ae[group_name] = anomaly_customer_ids_ae
        print(f"Anomalies for {group_name} using Autoencoder: {len(anomaly_customer_ids_ae)}")

    # Combine and find unique anomalies across ALL groups for Autoencoder
    all_anomalies_combined_ae = []
    for anomalies in all_group_anomalies_ae.values():
        all_anomalies_combined_ae.extend(anomalies)
    unique_anomalies_ae = list(set(all_anomalies_combined_ae))

    print(f"\nTotal number of unique anomalies across all groups using Autoencoder: {len(unique_anomalies_ae)}")
    print(f"Unique anomaly customer IDs (Autoencoder): {unique_anomalies_ae}")

    print("\nAnomalies per group (counts) using Autoencoder:")
    for group_name, anomalies in all_group_anomalies_ae.items():
        print(f"{group_name}: {len(anomalies)}")

    return unique_anomalies_ae, all_group_anomalies_ae


class Autoencoder(nn.Module): # Autoencoder class - keep it outside the function for reusability if needed later
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# --- Run Autoencoder Anomaly Detection ---
unique_anomalies_ae_3, all_group_anomalies_ae_3 = detect_anomalies_autoencoder(df, variable_groups)


In [None]:
len(unique_anomalies_ae)+len(unique_anomalies_ae_2)+len(unique_anomalies_ae_3)

In [None]:
anamolies_ae = (unique_anomalies_ae)+(unique_anomalies_ae_2)+(unique_anomalies_ae_3)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in anamolies_ae else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")

In [None]:
# prompt: find commaon anomolies in both unique_anomalies_strategy_2 and unique_anomalies. also add uuique anomolies in each

common_anomalies_ae = list(set(unique_anomalies_ae) & set(unique_anomalies_ae_2)& set(unique_anomalies_ae_3))
unique_to_strategy_1_ae = list(set(unique_anomalies_ae) - set(unique_anomalies_ae_2)-set(unique_anomalies_ae_3))
unique_to_strategy_2_ae = list(set(unique_anomalies_ae_2) - set(unique_anomalies_ae)-set(unique_anomalies_ae_3))
unique_to_strategy_3_ae = list(set(unique_anomalies_ae_3)-set(unique_anomalies_ae_2) - set(unique_anomalies_ae))
print(f"Common Anomalies: {common_anomalies_ae}")
print(f"Number of Common Anomalies: {len(common_anomalies_ae)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_ae}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_ae)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_ae}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_ae)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3_ae}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3_ae)}")


In [None]:
all_anomolies=[]

In [None]:
all_anomolies=common_anomalies_ae+unique_to_strategy_1_ae+unique_to_strategy_2_ae+unique_to_strategy_3_ae+common_anomalies_birch+unique_to_strategy_1_birch+unique_to_strategy_2_birch+unique_to_strategy_3_birch+common_anomalies+unique_to_strategy_1+unique_to_strategy_2+unique_to_strategy_3

In [None]:
kmeans_anomolies=common_anomalies+unique_to_strategy_1+unique_to_strategy_2+unique_to_strategy_3

In [None]:
birch_anomalies=common_anomalies_birch+unique_to_strategy_1_birch+unique_to_strategy_2_birch+unique_to_strategy_3_birch

In [None]:
ae_anomolies=common_anomalies_ae+unique_to_strategy_1_ae+unique_to_strategy_2_ae+unique_to_strategy_3_ae

In [None]:
print(all_anomolies)

In [None]:
len(all_anomolies)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in all_anomolies else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")

In [None]:
common_anomalies_final = list(set(kmeans_anomolies) & set(birch_anomalies)& set(ae_anomolies))
unique_to_strategy_1_final = list(set(kmeans_anomolies) - set(birch_anomalies)-set(ae_anomolies))
unique_to_strategy_2_final = list(set(birch_anomalies) - set(kmeans_anomolies)-set(ae_anomolies))
unique_to_strategy_3_final = list(set(ae_anomolies)-set(birch_anomalies) - set(kmeans_anomolies))
print(f"Common Anomalies: {common_anomalies_ae}")
print(f"Number of Common Anomalies: {len(common_anomalies_final)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_final}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_final)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_final}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_final)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3_final}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3_final)}")


In [None]:
final_anomolies=common_anomalies_final+unique_to_strategy_1_final+unique_to_strategy_2_final+unique_to_strategy_3_final

In [None]:
print(len(final_anomolies))

In [None]:
# Create a DataFrame from the list
df_anomalies = pd.DataFrame({'customer_id': final_anomolies})

# Save the DataFrame to a CSV file
df_anomalies.to_csv('final_anomalies.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}


In [None]:
class Autoencoder(nn.Module): # Autoencoder class - keep it outside for reusability
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim), # Output dimension should match input dimension
            nn.ReLU() #added a relu layer here as well
            # Output layer - adjust activation if needed
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded # Return decoded output for reconstruction loss

    def get_embedding(self, x): # Function to get embeddings
        return self.encoder(x)

In [None]:
# --- Function to Get Autoencoder Embeddings ---
def get_autoencoder_embeddings(df, variable_groups):
    all_group_embeddings = {}
    trained_models = {} # Store trained autoencoder models for later embedding extraction

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder Embeddings: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2)
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder (same training as before)
        for epoch in range(50):
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()

        trained_models[group_name] = model_ae # Store trained model

        # Get embeddings
        with torch.no_grad():
            embeddings_tensor = model_ae.get_embedding(X_tensor) # Use get_embedding function
            embeddings = embeddings_tensor.numpy() # Convert to numpy array

        all_group_embeddings[group_name] = pd.DataFrame(embeddings, index=df_group['customer_id'], columns=[f'emb_{i}' for i in range(hidden_dim)]) # Store embeddings as DataFrame with customer_ids

    return all_group_embeddings, trained_models

In [None]:
# --- Custom Birch Clustering class and silhouette scorer (from previous code, keep it) ---
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)


def detect_anomalies_clustering_embeddings(all_group_embeddings, clustering_method='KMeans'): # Generic function for clustering on embeddings
    all_group_anomalies_embedding_cluster = {}

    for group_name, embeddings_df in all_group_embeddings.items():
        print(f"\nProcessing group for {clustering_method} Clustering on Embeddings: {group_name}")
        data_scaled_embeddings = embeddings_df.copy() # Embeddings are already scaled by AE, no need to scale again

        silhouette_scores = []
        cluster_range = range(2, 11) # Reduced cluster range for embeddings (experiment)

        #Added this to bypass the silhouette score computation if only one cluster is detected.
        #This is likely to happen due to Birch being more sensitive to density and potentially finding a single cluster in the embedding space.
        num_clusters_found = len(set(BirchClustering(n_clusters=None).fit_predict(data_scaled_embeddings)))
        if num_clusters_found < 2 and clustering_method == 'Birch':
            print(f"Only 1 cluster found by Birch. Skipping silhouette score computation and using n_clusters=2")
            optimal_k = 2
        else: #Proceed with normal silhouette analysis if more than one cluster is found.
            for k in cluster_range:
                if clustering_method == 'KMeans':
                    model = KMeans(n_clusters=k, n_init=10, random_state=1, max_iter=300) # Reduced n_init and max_iter for speed
                elif clustering_method == 'Birch':
                    model = BirchClustering(n_clusters=k) # Use Birch directly with n_clusters
                else:
                    raise ValueError("Invalid clustering_method. Choose 'KMeans' or 'Birch'.")

                cluster_labels = model.fit_predict(data_scaled_embeddings)
                silhouette_avg = silhouette_score(data_scaled_embeddings, cluster_labels)
                silhouette_scores.append(silhouette_avg)

            optimal_k = cluster_range[np.argmax(silhouette_scores)]
            print(f"  Optimal number of clusters (Embeddings - {clustering_method}): {optimal_k}")

        if clustering_method == 'KMeans':
            model = KMeans(n_clusters=optimal_k, n_init=10, random_state=4, max_iter=300)
        elif clustering_method == 'Birch':
            model = BirchClustering(n_clusters=optimal_k)

        cluster_numbers = model.fit_predict(data_scaled_embeddings)
        embeddings_df['Cluster'] = cluster_numbers # Add cluster labels to embeddings dataframe

        group_anomalies_embedding_cluster = [] # Anomaly list for this group and clustering method
        unique_clusters = embeddings_df['Cluster'].unique()
        for i in unique_clusters:
            cluster_data = embeddings_df[embeddings_df['Cluster'] == i].drop(columns=['Cluster'])
            if cluster_data.empty:
                print(f"  Cluster {i} is empty. Skipping.")
                continue
            clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=100) # Reduced n_estimators for speed
            clf.fit(cluster_data) # Fit IsolationForest on embeddings
            y_pred = clf.predict(cluster_data)
            cluster_anomalies = cluster_data.index[y_pred == -1].tolist() # Get customer IDs from index
            group_anomalies_embedding_cluster.extend(cluster_anomalies)

        all_group_anomalies_embedding_cluster[group_name] = group_anomalies_embedding_cluster
        print(f"Anomalies for {group_name} using {clustering_method} on Embeddings: {len(group_anomalies_embedding_cluster)}")

    return all_group_anomalies_embedding_cluster

In [None]:
# --- 1. Get Autoencoder Embeddings ---
all_group_embeddings_ae, trained_models_ae = get_autoencoder_embeddings(df, variable_groups)

# --- 2. Anomaly Detection using K-Means on Embeddings ---
all_group_anomalies_kmeans_embeddings = detect_anomalies_clustering_embeddings(all_group_embeddings_ae, clustering_method='KMeans')

# --- 3. Anomaly Detection using Birch on Embeddings ---
all_group_anomalies_birch_embeddings = detect_anomalies_clustering_embeddings(all_group_embeddings_ae, clustering_method='Birch')

In [None]:
print(len(all_group_anomalies_kmeans_embeddings))


In [None]:

# Combine unique values from all_group_anomalies_kmeans_embeddings into a single list
all_anomalies_list_kmeans_embeddings = []
for group_name, anomalies in all_group_anomalies_kmeans_embeddings.items():
    all_anomalies_list_kmeans_embeddings.extend(anomalies)

unique_anomalies_list_kmeans_embeddings = list(set(all_anomalies_list_kmeans_embeddings))
print(f"Total number of unique anomalies across groups: {len(unique_anomalies_list_kmeans_embeddings)}")
print(unique_anomalies_list_kmeans_embeddings)


In [None]:
print(all_group_anomalies_birch_embeddings)

In [None]:
# Combine unique values from all_group_anomalies_kmeans_embeddings into a single list
all_anomalies_list_birch_embeddings = []
for group_name, anomalies in all_group_anomalies_birch_embeddings.items():
    all_anomalies_list_birch_embeddings.extend(anomalies)

unique_anomalies_list_birch_embeddings = list(set(all_anomalies_list_birch_embeddings))
print(f"Total number of unique anomalies across groups: {len(unique_anomalies_list_birch_embeddings)}")
print(unique_anomalies_list_birch_embeddings)

In [None]:
embedding_clustering_anomalies = list(set(unique_anomalies_list_kmeans_embeddings) | set(unique_anomalies_list_birch_embeddings))

In [None]:
len(embedding_clustering_anomalies)

In [None]:
common_anomalies_final_after_embedding = list(set(final_anomolies) & set(embedding_clustering_anomalies))
unique_to_strategy_1_final_after_embedding = list(set(final_anomolies) - set(embedding_clustering_anomalies))
unique_to_strategy_2_final_after_embedding = list(set(embedding_clustering_anomalies) - set(final_anomolies))
print(f"Common Anomalies: {common_anomalies_final_after_embedding}")
print(f"Number of Common Anomalies: {len(common_anomalies_final_after_embedding)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_final_after_embedding}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_final_after_embedding)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_final_after_embedding}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_final_after_embedding)}")

In [None]:
anamolies_after_embedding=common_anomalies_final_after_embedding+unique_to_strategy_1_final_after_embedding+unique_to_strategy_2_final_after_embedding

In [None]:
import pandas as pd

# Assuming anamolies_after_embedding is your list of anomaly IDs
anamolies_after_embedding_df = pd.DataFrame(anamolies_after_embedding, columns=['customer_id'])
# Create a DataFrame with the list

anamolies_after_embedding_df.to_csv('final_anomalies_after_embedding.csv', index=False)
# Save the DataFrame to CSV