# Text Preprocessing for Fraud Detection

This notebook demonstrates text cleaning techniques for NLP tasks in fraud detection.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK corpora used below: the stopword lists and the WordNet data
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

## 1. Create Sample Financial Email Data

In [None]:
# Five short example emails used as the toy corpus for this notebook
emails = [
    "Meeting scheduled regarding Q3 financial results. Please bring all documentation.",
    "The stock options will vest next month. Remember to check your account.",
    "We need to sell company stock before the quarterly report is released.",
    "Please review the attached financial projections for the board meeting.",
    "Confidential: Company XYZ stock will drop after the announcement tomorrow.",
]

# One row per email, stored in a single 'content' column
df = pd.DataFrame(emails, columns=['content'])
df

## 2. Define Stopwords and Punctuation

In [None]:
# Extra words to drop on top of NLTK's list (common email filler in this sample)
custom_stopwords = ['please', 'regarding', 'attached', 'meeting', 'scheduled']

# Full stopword set: NLTK's English list plus the custom additions
stop = set(stopwords.words('english')).union(custom_stopwords)

# Set of ASCII punctuation characters
exclude = set(string.punctuation)

# WordNet-based lemmatizer instance shared by the cleaning step
lemma = WordNetLemmatizer()

## 3. Create Text Cleaning Function

In [None]:
def clean_text(text, stop_words, exclude_chars, lemmatizer):
    """Clean text by removing stopwords, numbers, and punctuation, then lemmatizing.

    The text is lowercased and split on whitespace. Each token is then:

    1. dropped if it is a stopword or consists only of digits;
    2. stripped of every character found in ``exclude_chars``;
    3. dropped if nothing is left, or if the stripped form is a stopword
       or consists only of digits (e.g. ``"meeting."`` or ``"2023,"``);
    4. passed through ``lemmatizer.lemmatize``.

    Args:
        text: The string to clean.
        stop_words: Container of lowercase words to remove.
        exclude_chars: Container of single characters (typically
            punctuation) to delete from every token.
        lemmatizer: Object exposing ``lemmatize(word)`` that returns the
            normalized form of a word.

    Returns:
        The surviving, lemmatized tokens joined by single spaces. An empty
        or all-whitespace input yields an empty string.
    """
    normalized = []

    # split() with no arguments already ignores leading/trailing whitespace.
    for raw_word in text.lower().split():
        # Test the raw token first so stopwords that contain punctuation
        # (e.g. "don't") still match before the apostrophe is stripped.
        if raw_word in stop_words or raw_word.isdigit():
            continue

        # Remove punctuation character by character; comparing the whole
        # token against the character set would miss "results." or "xyz:".
        word = "".join(ch for ch in raw_word if ch not in exclude_chars)

        # Re-check after stripping: the token may have been pure punctuation,
        # or a stopword/number with punctuation attached.
        if not word or word in stop_words or word.isdigit():
            continue

        normalized.append(lemmatizer.lemmatize(word))

    return " ".join(normalized)

## 4. Apply Cleaning to Sample Data

In [None]:
# Show full cell text instead of truncating long strings
pd.set_option('display.max_colwidth', None)

# Run clean_text over every email, using the stopword set, punctuation set,
# and lemmatizer defined earlier
df['clean_content'] = df['content'].map(
    lambda email: clean_text(email, stop, exclude, lemma)
)

# Original and cleaned text side by side
df[['content', 'clean_content']]

## 5. Analyzing Cleaning Results

In [None]:
# Whitespace-delimited word counts for the raw and the cleaned text
df['original_word_count'] = df['content'].map(lambda text: len(text.split()))
df['cleaned_word_count'] = df['clean_content'].map(lambda text: len(text.split()))

# Share of words dropped by cleaning, expressed as a percentage of the original count
words_removed = df['original_word_count'].sub(df['cleaned_word_count'])
df['pct_removed'] = words_removed.div(df['original_word_count']).mul(100)

df[['original_word_count', 'cleaned_word_count', 'pct_removed']]

## 6. Tokenizing Cleaned Text

In [None]:
# Split each cleaned string on whitespace to get a list of tokens per email
df['tokens'] = df['clean_content'].map(str.split)

# Cleaned text next to its token list
df[['clean_content', 'tokens']]

## 7. Summarize Processed Data for Next Steps

In [None]:
# Report how many emails were processed and the mean percentage of words removed
email_count = len(df)
avg_pct_removed = df['pct_removed'].mean()
print(f"We've cleaned {email_count} emails and reduced the word count by {avg_pct_removed:.1f}% on average.")
print("The cleaned text is now ready for further analysis.")