# Summary.
--------------------------

This notebook analyses the dataset prior to LLM training for fraud detection.

In [None]:
# Reload imported modules automatically before each cell execution, so edits
# to the project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from itables import init_notebook_mode
# Switch on itables rendering for every DataFrame displayed in this notebook.
init_notebook_mode(all_interactive=True)
import seaborn as sns
# Set Seaborn style (optional)
sns.set(style="whitegrid")


In [None]:
# Settings handed to load_cross_validation_path() to locate the split files.
# The fold id is arbitrary here: the train and test parts of any fold are
# combined below, and their union is the entire dataset.
CONFIG = dict(
    dataset_version="v4",
    fold_id=1,
)

# EXTENDED FIN DATASET

In [None]:
from researchpkg.anomaly_detection.config import FINANCIALS_DIR_EXTENDED, PREPROCESSED_PATH_EXTENDED, SEED_TRAINING
from researchpkg.anomaly_detection.models.utils import load_cross_validation_path
from researchpkg.anomaly_detection.preprocessing.extended.sec_financial_preprocessing_quarterly_extended import EXTENDED_FINANCIAL_FEATURES, EXTENDED_FINANCIAL_FEATURES_COUNT_COLS
import pandas as pd

# CSV with the quarterly financial features; joined onto the splits in load_data().
FULL_FINANCIAL_PATH = FINANCIALS_DIR_EXTENDED / "sec_financials_quarterly.csv"

def load_data(train_path=None, test_path=None):
    """
    Load the train and test splits and attach the extended financial features.

    Each split CSV is read, any column listed in
    ``EXTENDED_FINANCIAL_FEATURES_COUNT_COLS`` is dropped, and the result is
    left-joined with ``FULL_FINANCIAL_PATH`` on (cik, year, quarter), keeping
    only the ``EXTENDED_FINANCIAL_FEATURES`` columns from the financial file.

    Args:
        train_path (Path, optional): Path to the training CSV. When omitted,
            the train path returned by ``load_cross_validation_path(CONFIG)``
            is used.
        test_path (Path, optional): Path to the test CSV. When omitted, the
            test path returned by ``load_cross_validation_path(CONFIG)`` is
            used.

    Returns:
        tuple: (train_df, test_df)
    """
    merge_keys = ["cik", "year", "quarter"]

    def merge_with_financials(df, full_df):
        """Merges the financial data with the given DataFrame."""
        df = df.drop(columns=EXTENDED_FINANCIAL_FEATURES_COUNT_COLS, errors="ignore")
        return df.merge(full_df, on=merge_keys, how="left")

    # Only fall back to the configured fold for paths the caller did not
    # supply; explicitly passed paths used to be silently overwritten.
    if train_path is None or test_path is None:
        default_train_path, default_test_path = load_cross_validation_path(CONFIG)
        if train_path is None:
            train_path = default_train_path
        if test_path is None:
            test_path = default_test_path

    full_df = pd.read_csv(FULL_FINANCIAL_PATH)
    full_df = full_df[merge_keys + EXTENDED_FINANCIAL_FEATURES]

    train_df = merge_with_financials(pd.read_csv(train_path), full_df)
    test_df = merge_with_financials(pd.read_csv(test_path), full_df)

    return train_df, test_df


In [None]:
# Load both splits of the configured fold, each joined with the financial features.
train_df, test_df = load_data()

In [None]:
# Inspect the merged training split.
train_df

In [None]:
import os
from transformers import AutoTokenizer
# Environment switch read by the Hugging Face tokenizers library.
# NOTE(review): worker processes are created later with multiprocessing.Pool;
# confirm that tokenizer parallelism and forked workers get along on this setup.
os.environ['TOKENIZERS_PARALLELISM']='1'
# Checkpoint whose tokenizer is used for every token count in this notebook.
model= "unsloth/Llama-3.1-8B-Instruct-unsloth-bnb-4bit"
TOKENIZER = AutoTokenizer.from_pretrained(model)

In [None]:
from researchpkg.anomaly_detection.config import PREPROCESSED_PATH,MDA_DATASET_PATH

# True: analyse the raw MD&A texts; False: analyse their summarized versions.
USE_RAW_MDA = True

# Directory from which "<mda_quarter_id>.txt" files are read.
MDA_PATH = (
    MDA_DATASET_PATH / "quarterly"
    if USE_RAW_MDA
    else PREPROCESSED_PATH / "SEC_MDA_SUMMARIZED" / "quarterly"
)

In [None]:

import re
from researchpkg.anomaly_detection.models.utils import drop_random_keys
from researchpkg.anomaly_detection.preprocessing.extended.sec_financial_preprocessing_quarterly_extended import AGGREGATE_FEATURES, BENEISH_PROBM, DIFF_FEATURES, EXTENDED_FEATURES_SHORT_DESCRIPTION_DICT, EXTENDED_FINANCIAL_FEATURES, EXTENDED_FEATURES_DESCRIPTION_DICT, IMPORTANT_TAGS, RATIO_FEATURES
from researchpkg.anomaly_detection.preprocessing.utils import clean_mda_content

def load_mda_content(mda_quarter_id, mda_dir=None):
    """
    Load the content of an MDA file.

    Args:
        mda_quarter_id: Identifier of the document; the text is read from
            ``<mda_dir>/<mda_quarter_id>.txt``.
        mda_dir (path-like, optional): Directory containing the MDA text
            files. Defaults to the module-level ``MDA_PATH``.

    Returns:
        str: The file content, decoded as UTF-8.

    Raises:
        FileNotFoundError: If no file exists for ``mda_quarter_id``.
    """
    from pathlib import Path

    # MDA_PATH is looked up at call time so that re-running the configuration
    # cell (raw vs. summarized) is honoured without redefining this function.
    base_dir = MDA_PATH if mda_dir is None else Path(mda_dir)
    mda_file = base_dir / f"{mda_quarter_id}.txt"

    if not mda_file.exists():
        raise FileNotFoundError(f"MDA file {mda_file} does not exist.")

    return mda_file.read_text(encoding="utf-8")



def _tokenize_roundtrip(text, max_tokens=None):
    """Tokenize ``text`` with TOKENIZER, optionally cap its length, and decode it back to text."""
    encoded = TOKENIZER(
        text,
        return_tensors="pt",
        padding=False,
        # Truncation is only switched on when a limit is actually requested.
        truncation=max_tokens is not None,
        max_length=max_tokens,
    )
    # return_tensors="pt" yields a batch of one sequence; take that sequence.
    return TOKENIZER.decode(encoded["input_ids"][0], skip_special_tokens=True)


def truncate_prompt(
    mda_text: str,
    financials_str: str,
    max_mda_tokens=None,
    max_financials_tokens=None,
) -> "tuple[str, str]":
    """
    Pass both prompt parts through the tokenizer and decode them back to text.

    With the default limits (``None``) nothing is cut: each part is only
    tokenized and decoded again with special tokens skipped, which is what
    this function has always done despite its name. Pass a limit to really
    truncate a part to that many tokens.

    Args:
        mda_text (str): First prompt part (the MD&A text).
        financials_str (str): Second prompt part (the formatted financials).
        max_mda_tokens (int, optional): Token limit for the first part.
        max_financials_tokens (int, optional): Token limit for the second part.

    Returns:
        tuple[str, str]: (decoded first part, decoded second part).
    """
    # The financials part is processed first, as in the original implementation.
    truncated_part2 = _tokenize_roundtrip(financials_str, max_financials_tokens)
    truncated_part1 = _tokenize_roundtrip(mda_text, max_mda_tokens)

    return truncated_part1, truncated_part2

# Features that are never written into the prompt.
EXCLUDED_FINANCIALS_FEATURES = {
    BENEISH_PROBM,  # Too much biasing the model as it a probability of earnings manipulation
}


# Features whose values are displayed with a currency symbol.
CURRENCY_FEATURES = {*AGGREGATE_FEATURES, *DIFF_FEATURES, *IMPORTANT_TAGS}


def is_with_currency(feature):
    """Return True when ``feature`` belongs to CURRENCY_FEATURES."""
    return feature in CURRENCY_FEATURES


# Features that are multiplied by 100 and displayed with a percent sign.
PERCENTAGE_FEATURES = {*RATIO_FEATURES}


def format_financials(financials, drop_rate=0):
    """
    Format financial data dictionary into a string for the prompt.
    Handles dropping features, formatting numbers, and adding units.

    Features listed in ``EXCLUDED_FINANCIALS_FEATURES`` and features whose
    value is missing, infinite or exactly zero are left out. The remaining
    features are emitted one per line, sorted by feature name, as
    ``- <short description>: <value>``.

    Args:
        financials (dict): Dictionary of financial features {feature_name: value}.
        drop_rate (float): Probability (0 to 1) of dropping each feature during formatting.

    Returns:
        str: Formatted string representation of the financials, or
        "No financial data available." when nothing is left to show.
    """
    # Imported locally: the notebook imports numpy only in a later cell, so
    # relying on a global `np` here made this function depend on cell order.
    import numpy as np

    def display_financial_value(value):
        """Formats financial values for display."""
        if pd.isna(value):
            return "N/A"  # Handle missing values explicitly
        try:
            value = float(value)
        except (ValueError, TypeError):
            return str(value)  # Return as string if not convertible to float
        if value == 0:
            return "0"
        magnitude = abs(value)
        if magnitude < 0.01:  # Small non-zero values
            return f"{value:.2e}"
        if magnitude < 10:
            return f"{value:.2f}"
        # Format with commas, no decimal places for large numbers
        return "{:,.0f}".format(value)

    # Filter out excluded features and unusable values (NaN, Inf, zero)
    processed_financials = {
        key: value
        for key, value in financials.items()
        if key not in EXCLUDED_FINANCIALS_FEATURES
        and pd.notna(value)
        and np.isfinite(value)
        and value != 0
    }

    # Apply feature dropout if requested
    if drop_rate > 0:
        processed_financials = drop_random_keys(processed_financials, drop_rate)

    financial_lines = []
    # Sorted so the same input always produces the same text
    for key in sorted(processed_financials.keys()):
        value = processed_financials[key]
        # Fall back to the raw feature name when no short description exists
        description = EXTENDED_FEATURES_SHORT_DESCRIPTION_DICT.get(key, key)

        if key in PERCENTAGE_FEATURES:
            # Ratios are shown as percentages, unit as suffix
            display_str = f"{display_financial_value(value * 100)}%"
        elif is_with_currency(key):
            # Assume USD, unit as prefix
            display_str = f"${display_financial_value(value)}"
        else:
            display_str = display_financial_value(value)

        financial_lines.append(f"- {description}: {display_str}")

    return "\n".join(financial_lines) if financial_lines else "No financial data available."




def get_sample_data(row):
    """
    Build the prompt texts for one dataset row and count their tokens.

    The MD&A text is loaded from the row's ``mda_quarter_id`` and the
    financial features of the row are formatted with ``format_financials``.

    Returns a dict with, in this order:
    - prompt_part1 / prompt_part2 / full_text:
        the MD&A text, the formatted financials, and their concatenation.
    - prompt_part1_tokens_count / prompt_part2_tokens_count / full_text_tokens_count:
        number of ``input_ids`` TOKENIZER produces for each of those texts.
    - prompt_part1_truncated / prompt_part2_truncated:
        the two parts as returned by ``truncate_prompt``.
    - full_text_truncated:
        both returned parts, followed by a model-turn marker and the label
        ("Fraud" or "Not Fraud").
    - prompt_part1_truncated_tokens_count / prompt_part2_truncated_tokens_count /
      full_text_truncated_tokens_count:
        token counts of the three texts above.
    """

    def count_tokens(text):
        """Number of token ids TOKENIZER produces for ``text``."""
        return len(TOKENIZER(text)["input_ids"])

    # Part 1: the MD&A text of the row
    prompt_part1 = load_mda_content(row["mda_quarter_id"])
    label = "Fraud" if row["is_fraud"] else "Not Fraud"

    # Part 2: the row's financial features rendered as text
    prompt_part2 = format_financials(row[EXTENDED_FINANCIAL_FEATURES].to_dict())

    full_text = prompt_part1 + prompt_part2

    truncated_part1, truncated_part2 = truncate_prompt(prompt_part1, prompt_part2)

    # NOTE(review): "<start_of_turn>" looks like a Gemma-style turn marker while
    # TOKENIZER is loaded from a Llama checkpoint - confirm this is intended.
    full_text_truncated = "".join(
        [truncated_part1, truncated_part2, "<start_of_turn>model\n", label]
    )

    return {
        "prompt_part1": prompt_part1,
        "prompt_part2": prompt_part2,
        "full_text": full_text,
        "prompt_part1_tokens_count": count_tokens(prompt_part1),
        "prompt_part2_tokens_count": count_tokens(prompt_part2),
        "full_text_tokens_count": count_tokens(full_text),
        "prompt_part1_truncated": truncated_part1,
        "prompt_part2_truncated": truncated_part2,
        "full_text_truncated": full_text_truncated,
        "prompt_part1_truncated_tokens_count": count_tokens(truncated_part1),
        "prompt_part2_truncated_tokens_count": count_tokens(truncated_part2),
        "full_text_truncated_tokens_count": count_tokens(full_text_truncated),
    }
    

## Dataframe with the entire dataset

In [None]:
import pandas as pd
from multiprocessing import Pool, cpu_count
import numpy as np  # Import numpy


def process_row(row, subset):
    """
    Build the statistics record for one row.

    The record holds the row's identifying columns, the name of the subset
    it belongs to, and every entry returned by ``get_sample_data``.
    """
    sample_data = get_sample_data(row)

    # Identifying columns first, so they lead the resulting DataFrame
    record = {
        column: row[column]
        for column in ("cik", "year", "quarter", "is_fraud", "mda_quarter_id")
    }
    record["subset"] = subset
    record.update(sample_data)
    return record


def process_partition(partition, subset):
    """Run ``process_row`` on every row of one DataFrame chunk and return the records as a list."""
    import tqdm.notebook as tqdm

    rows_with_progress = tqdm.tqdm(
        partition.iterrows(),
        desc=f"Processing {subset} partition",
        total=len(partition),
    )
    return [process_row(row, subset) for _, row in rows_with_progress]


def parallel_process_dataframe(df, subset, num_processes=None):
    """
    Parallelizes the processing of a single DataFrame by partitioning it.

    The rows are split into contiguous chunks of near-equal size, each chunk
    is handed to ``process_partition`` in a worker process, and the returned
    records are concatenated in the original row order.

    Args:
        df: The DataFrame to process.
        subset: The subset name ("train" or "test").
        num_processes: Number of processes to use. If None, uses the number of
            CPU cores. The value is clamped to the range [1, len(df)], so a
            non-positive request or more workers than rows cannot fail.

    Returns:
        A DataFrame containing the combined results (empty when ``df`` is empty).
    """
    if num_processes is None:
        num_processes = cpu_count()

    n_rows = len(df)
    if n_rows == 0:
        # Nothing to do: avoid starting a pool just to process empty chunks.
        return pd.DataFrame()

    # Pool() rejects values below 1, and a worker without rows is pure overhead.
    num_processes = max(1, min(int(num_processes), n_rows))

    # Split row positions rather than the frame itself: calling np.array_split
    # directly on a DataFrame goes through deprecated pandas internals on
    # recent versions, while the resulting chunk sizes are the same.
    position_chunks = np.array_split(np.arange(n_rows), num_processes)
    partitions = [df.iloc[positions] for positions in position_chunks]

    with Pool(num_processes) as pool:
        results = pool.starmap(
            process_partition, [(partition, subset) for partition in partitions]
        )

    # Flatten the per-partition lists into one list of records
    all_results = [record for result_list in results for record in result_list]

    return pd.DataFrame(all_results)


def parallel_process_data(train_df, test_df, num_processes=None):
    """
    Compute the token statistics of both splits, one split after the other,
    each of them processed in parallel.

    Args:
        train_df: DataFrame containing training data.
        test_df: DataFrame containing testing data.
        num_processes: Number of processes to use. If None, uses the number of CPU cores.

    Returns:
        DataFrame with the train records followed by the test records,
        re-indexed from 0.
    """
    worker_count = cpu_count() if num_processes is None else num_processes

    named_splits = (("train", train_df), ("test", test_df))
    per_split_stats = [
        parallel_process_dataframe(split_df, split_name, worker_count)
        for split_name, split_df in named_splits
    ]

    return pd.concat(per_split_stats, ignore_index=True)

# Use all but two cores, but never request fewer than one worker:
# cpu_count() - 2 is zero or negative on machines with one or two cores.
df_tokens_stats = parallel_process_data(
    train_df, test_df, num_processes=max(1, cpu_count() - 2)
)
df_tokens_stats

In [None]:
# Display the combined statistics again (one record per train/test row).
df_tokens_stats

In [None]:
# Summary statistics of the MD&A (prompt part 1) token counts.
df_tokens_stats["prompt_part1_tokens_count"].describe()

In [None]:
# Apply the whitegrid style before plotting (the setup cell already sets it via sns.set).
sns.set_style("whitegrid") 

In [None]:
import matplotlib.pyplot as plt

# Distribution of the MD&A (prompt part 1) token counts over all rows.
# An explicit Axes is used instead of the implicit pyplot state, and
# plt.show() ends the cell so no tick-label repr is dumped as cell output.
fig, ax = plt.subplots(figsize=(20, 6))
sns.histplot(
    data=df_tokens_stats,
    x="prompt_part1_tokens_count",
    bins=50,
    kde=True,
    color="purple",
    ax=ax,
)
ax.set_xlabel("Number of Tokens in raw quarterly MD&A sections", fontsize=18)
ax.set_ylabel("Frequency", fontsize=18)
ax.tick_params(axis="both", labelsize=18)
plt.show()


In [None]:
# Distribution of the formatted-financials (prompt part 2) token counts.
# An explicit Axes is used instead of the implicit pyplot state, and
# plt.show() ends the cell so no tick-label repr is dumped as cell output.
fig, ax = plt.subplots(figsize=(20, 6))
sns.histplot(
    data=df_tokens_stats,
    x="prompt_part2_tokens_count",
    bins=50,
    kde=True,
    color="darkblue",
    ax=ax,
)
ax.set_xlabel("Number of Tokens of financial features_prompts", fontsize=18)
ax.set_ylabel("Frequency", fontsize=18)
ax.tick_params(axis="both", labelsize=18)
plt.show()


In [None]:
# Summary statistics of the formatted-financials (prompt part 2) token counts.
df_tokens_stats["prompt_part2_tokens_count"].describe()

In [None]:
# Persist the statistics; the file name records which MD&A variant was analysed.
token_stats_file = (
    "EXTENDED/mda_raw_token_stats.csv"
    if USE_RAW_MDA
    else "EXTENDED/mda_summarized_token_stats.csv"
)
df_tokens_stats.to_csv(PREPROCESSED_PATH / token_stats_file, index=False)

# Plotting the tokens stats 

In [None]:
# Token counts of the concatenated prompt (MD&A text followed by the financials).
df_tokens_stats["full_text_tokens_count"].describe()

In [None]:
# Token counts of the text built from truncate_prompt's output plus the
# appended model-turn marker and label.
df_tokens_stats["full_text_truncated_tokens_count"].describe()

# Out of scope

In [None]:
# Full prompt text of the row with the most MD&A tokens (over train and test).
rows_by_mda_length = df_tokens_stats.sort_values(
    ["prompt_part1_tokens_count"], ascending=False
)
longest_test = rows_by_mda_length["full_text"].iloc[0]
longest_test

## Industry sector distribution of the final dataset

In [None]:
import yaml

# Global statistics file written for dataset version v4.
index_file = PREPROCESSED_PATH_EXTENDED / "v4/global_stats.yaml"

# NOTE(review): yaml.Loader can construct arbitrary Python objects. It is kept
# because the file may contain Python-specific tags - confirm; prefer
# yaml.safe_load if the file only holds plain mappings, lists and scalars.
# The context manager closes the file handle, which was previously left open.
with open(index_file) as stats_file:
    dataset_config = yaml.load(stats_file, Loader=yaml.Loader)

In [None]:
# Inspect the loaded statistics.
dataset_config

In [None]:
# Distribution stored under "sic_distribution"; its 'global' and 'fraud'
# entries are used for the sector plot.
per_sector_distribution  = dataset_config["sic_distribution"]

In [None]:
# Inspect the per-sector distribution.
per_sector_distribution

In [None]:
# Mappings from sector name to a count, iterated with .items() when plotting.
# presumably 'global' counts all samples and 'fraud' only the fraud cases -
# TODO confirm against the code that writes global_stats.yaml
global_raw = per_sector_distribution['global']
fraud_raw = per_sector_distribution['fraud']
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
import textwrap

wrapper = textwrap.TextWrapper(width=14)


def do_wrap(x):
    """Break a sector name into lines of at most 14 characters for use as a tick label."""
    return "\n".join(wrapper.wrap(x))


# Every sector, largest count first, keyed by its wrapped display name.
global_ = {
    do_wrap(name): total
    for name, total in sorted(global_raw.items(), key=lambda item: item[1], reverse=True)
}

# Fraud counts under the same wrapped names, restricted to sectors that
# appear in both mappings and ordered like global_.
wrapped_fraud = {do_wrap(name): cases for name, cases in fraud_raw.items()}
fraud_ = {name: wrapped_fraud[name] for name in global_ if name in wrapped_fraud}

# Filter only sectors present in fraud
sectors = list(fraud_)

print("sectors", fraud_)

fraud_counts = [fraud_[sector] for sector in sectors]
global_counts = [global_[sector] for sector in sectors]
fraud_percentages = [
    f"{fraud_[sector]/global_[sector]*100:.1f}%" for sector in sectors
]

# Plot
plt.figure(figsize=(13, 5))
palette = sns.color_palette("hls", len(sectors))
bars = sns.barplot(x=sectors, y=global_counts, palette=palette)

# Hand-tuned label heights: bars whose fraud count is exactly 1 or 16 get
# their annotation placed at a multiple of the bar height instead of inside
# the bar - presumably because those bars are too short for the text; these
# values need revisiting whenever the underlying counts change.
label_height_factor = {1: 4.5, 16: 2}
font_style = dict(fontfamily="Nimbus Sans", fontweight=600)

# Add annotations
for bar, count, fraud_num, pct in zip(bars.patches, global_counts, fraud_counts, fraud_percentages):
    bar_x = bar.get_x() + bar.get_width() / 2
    bar_y = bar.get_height()

    if fraud_num in label_height_factor:
        y_pos = bar_y * label_height_factor[fraud_num]
    else:
        y_pos = bar_y / 1.5

    label = f"{fraud_num} F.\n({pct})"
    plt.text(bar_x, y_pos, label, ha='center', va='center', fontsize=20, color='black', **font_style)

plt.ylabel("")
plt.xticks(rotation=0, ha='center', fontsize=18, **font_style)
plt.yticks(fontsize=20, **font_style)
# plt.title("Fraud cases per sector (count and percentage over total)")
plt.tight_layout()
plt.show()