# Flagging Suspicious Terms in Financial Communications

This notebook demonstrates how to create a fraud dictionary and flag documents containing suspicious terms.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom module
import sys
sys.path.append('../src/')
from fraud_detector import FraudDetector

## 1. Create Sample Financial Communications

In [None]:
# Ten example messages: a mix of routine updates and messages that hint at
# trading on non-public information or concealing losses.
communications = [
    "The quarterly results are looking positive. Revenue is up 15%.",
    "Need to sell company stock immediately before the announcement.",
    "Your stock options will vest next month as scheduled.",
    "Please review the budget projections for Q3.",
    "Confidential: Our stock will likely drop after the poor earnings report.",
    "The merger with XYZ Corp is proceeding as planned.",
    "We should unload these stock options before the audit results.",
    "Your bonus structure has been updated in the system.",
    "The stock price has been stable for the past quarter.",
    "Internal memo: We need to hide these losses from the investors.",
]

# One row per message, stored in a single 'content' column
df = pd.DataFrame(communications, columns=['content'])
df

## 2. Define Fraud Dictionary

In [None]:
# Phrases that make up the fraud dictionary handed to the detector
suspicious_terms = [
    'sell stock',
    'unload stock',
    'hide losses',
    'before announcement',
    'sell immediately',
    'confidential',
    'stock will drop',
    'poor earnings',
]

# Report the dictionary size, then list every phrase as a bullet (one per line)
print(
    f"Our fraud dictionary contains {len(suspicious_terms)} terms.",
    "These terms might indicate insider trading or financial fraud:",
    *(f"- {term}" for term in suspicious_terms),
    sep="\n",
)

## 3. Flag Communications with Suspicious Terms

In [None]:
# Build the detector and give it the phrase list to look for
detector = FraudDetector()
detector.set_fraud_dictionary(suspicious_terms)

# Hand the detector a deep copy of `df` rather than `df` itself
flagged_df = detector.flag_suspicious_content(df.copy(deep=True), 'content')

# Remove the column-width cap so long messages are displayed in full
pd.options.display.max_colwidth = None
flagged_df

## 4. Analyze Flagged Communications

In [None]:
# Ask the detector for the suspicious subset of the flagged frame
suspicious_content = detector.get_suspicious_content(flagged_df)

# Report how many messages were returned relative to the full sample, then show them
print(
    f"Found {len(suspicious_content)} suspicious communications "
    f"out of {len(df)} total."
)
suspicious_content

## 5. Visualize Results

In [None]:
# Count flagged vs non-flagged
flag_counts = flagged_df['flag'].value_counts()

# value_counts() orders its result by frequency (most common first), not by
# flag value, and omits classes that never occur. Derive each wedge's label and
# colour from the flag value itself so labels cannot be swapped and their
# number always matches the number of wedges.
# NOTE(review): assumes 'flag' holds 0/1 or booleans (falsy = normal) — confirm
# against FraudDetector.flag_suspicious_content.
wedge_is_suspicious = [bool(flag) for flag in flag_counts.index]
wedge_labels = ['Suspicious' if hit else 'Normal' for hit in wedge_is_suspicious]
wedge_colors = ['#ff9999' if hit else '#66b3ff' for hit in wedge_is_suspicious]

# Create pie chart
plt.figure(figsize=(8, 6))
plt.pie(flag_counts, labels=wedge_labels, autopct='%1.1f%%', startangle=90,
        colors=wedge_colors)
plt.title('Distribution of Suspicious vs. Normal Communications')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
plt.show()

## 6. Term-Specific Analysis

In [None]:
# Check which terms triggered flags
def find_matching_terms(text, term_list):
    """Return the terms from ``term_list`` that occur in ``text``.

    Matching is a case-insensitive substring test: a term matches when its
    lower-cased form appears anywhere in the lower-cased text.

    Args:
        text: The message to search. Non-string values (for example ``None``
            or a NaN from a missing DataFrame cell) are treated as having no
            matches.
        term_list: Iterable of string terms to look for.

    Returns:
        A list of the matching terms, in ``term_list`` order and with their
        original casing, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        # Missing cells have no .lower(); report "no matches" instead of raising.
        return None

    # Lower-case the text once rather than once per term.
    lowered_text = text.lower()
    matches = [term for term in term_list if term.lower() in lowered_text]
    return matches if matches else None

# Attach the matching terms as a new column. assign() builds a new frame
# instead of writing into the object returned by get_suspicious_content(), so
# this cell cannot trigger SettingWithCopyWarning and gives the same result
# when re-run.
suspicious_content = suspicious_content.assign(
    matching_terms=suspicious_content['content'].apply(
        lambda x: find_matching_terms(x, suspicious_terms)
    )
)

# Display results
suspicious_content[['content', 'matching_terms']]

## 7. Summary

In [None]:
# Print the headline figures followed by the suggested follow-up work,
# one item per line.
print(
    "Summary of Fraud Detection Results:",
    f"- Total communications analyzed: {len(df)}",
    f"- Suspicious communications detected: {len(suspicious_content)}",
    f"- Detection rate: {len(suspicious_content)/len(df)*100:.1f}%",
    "\nNext steps would include:",
    "1. Refining the fraud dictionary with more specific terms",
    "2. Applying topic modeling to find hidden patterns",
    "3. Investigating the flagged communications in detail",
    sep="\n",
)