**PROJECT ARCHITECTURE**

faiss_indexer.py: FAISS index creation and management.

eml_extractor.py: Functions to extract EML content.

classifier.py: LLM classification logic.

main.py: The main script for querying and displaying results.

eml_extractor.py       # EML extraction functions


In [None]:
import os
from email import message_from_file

def extract_eml_content(file_path):
    """Extract the subject and plain-text body from an EML file.

    The file is parsed in binary mode. Parsing it as UTF-8 text fails on
    messages that are not valid UTF-8 on disk and corrupts 8-bit bodies,
    because the email package expects the original bytes. Each text part is
    decoded with the charset declared in its own headers (UTF-8 when none is
    declared or the declared one is unknown).

    Args:
        file_path: Path to the ``.eml`` file to read.

    Returns:
        A string made of ``Subject: <subject>``, a blank line, and the
        concatenated ``text/plain`` content of the message.
    """
    # Local imports: these names are not part of the file-level import block.
    from email import message_from_binary_file
    from email.header import Header, decode_header

    def _decode_part(part):
        """Return the decoded text of one message part ('' if it has none)."""
        payload = part.get_payload(decode=True)
        if payload is None:  # e.g. a multipart container
            return ""
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="ignore")
        except LookupError:  # charset name Python does not know
            return payload.decode("utf-8", errors="ignore")

    with open(file_path, "rb") as f:
        msg = message_from_binary_file(f)

    subject = msg.get("Subject", "")
    if isinstance(subject, Header):
        # A header containing raw 8-bit bytes comes back as a Header object;
        # recover the original bytes and read them as UTF-8.
        subject = "".join(
            chunk.decode("utf-8", errors="ignore") if isinstance(chunk, bytes) else chunk
            for chunk, _ in decode_header(subject)
        )

    if msg.is_multipart():
        body = "".join(
            _decode_part(part)
            for part in msg.walk()
            if part.get_content_type() == "text/plain"
        )
    else:
        body = _decode_part(msg)

    return f"Subject: {subject}\n\n{body}"


faiss_indexer.py       # FAISS indexing and search functions


In [None]:
!pip install faiss-cpu



In [None]:
import os
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# === Configuration ===
# Path the serialized FAISS index is written to and read back from.
FAISS_INDEX_FILE = "/content/FAISS/faiss_index_classifications.idx"
# CSV mapping EML files to classifications; the code below reads its
# 'File', 'request_type' and 'sub_request_type' columns.
EML_CSV_FILE = "/content/eml_classification_mapping_cleaned.csv"
# Sentence-embedding model, instantiated once when this cell runs and reused
# for both indexing and querying.
EMBED_MODEL = SentenceTransformer('all-MiniLM-L6-v2')


# ✅ Function to create FAISS index if it doesn't exist
def create_faiss_index(eml_df, eml_dir="/content/synthetic_eml_files"):
    """Create and save a FAISS index from the EML classification CSV.

    Every row of ``eml_df`` whose EML file exists under ``eml_dir`` is
    embedded and stored in the index under that row's position in ``eml_df``.
    Storing explicit ids keeps lookups by position (``eml_df.iloc[id]``)
    correct even when some files are missing; with purely sequential ids a
    single skipped file would shift every later vector onto the wrong row.

    Args:
        eml_df: DataFrame with a 'File' column holding EML file names.
        eml_dir: Directory containing the EML files.

    Returns:
        The populated FAISS index, or None if no file could be embedded or
        an error occurred (the error is printed).
    """
    print("\n🛠️ Creating FAISS index...")

    try:
        # ✅ Extract email contents, remembering which CSV row each came from
        texts = []
        row_ids = []

        for row_pos, eml_file in enumerate(eml_df['File'].tolist()):
            eml_path = os.path.join(eml_dir, str(eml_file))

            # Skip if the file doesn't exist
            if not os.path.exists(eml_path):
                print(f"❌ Skipping {eml_file} (file not found)")
                continue

            texts.append(extract_eml_content(eml_path))
            row_ids.append(row_pos)

        if not texts:
            # Nothing to index: without this guard the shape lookup below
            # fails with an unhelpful IndexError.
            print("❌ Error creating FAISS index: no EML files were found")
            return None

        # ✅ Convert embeddings to FAISS format (2-D float32, one row per email)
        embeddings = np.ascontiguousarray(EMBED_MODEL.encode(texts), dtype='float32')
        dimension = embeddings.shape[1]

        # ✅ Create and save the FAISS index, keyed by CSV row position
        faiss_index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
        faiss_index.add_with_ids(embeddings, np.asarray(row_ids, dtype='int64'))

        os.makedirs(os.path.dirname(FAISS_INDEX_FILE), exist_ok=True)
        faiss.write_index(faiss_index, FAISS_INDEX_FILE)

        print("\n✅ FAISS index created and saved successfully!")
        return faiss_index

    except Exception as e:
        # Best-effort by design: callers treat None as "index unavailable".
        print(f"❌ Error creating FAISS index: {e}")
        return None


# ✅ Function to load FAISS index (or create it)
def load_faiss():
    """Load the FAISS index from disk, or build it if the file is missing.

    When ``FAISS_INDEX_FILE`` exists it is read and returned. Otherwise the
    classification CSV at ``EML_CSV_FILE`` is loaded and passed to
    ``create_faiss_index``.

    Returns:
        The FAISS index, or None if loading or creation failed (the reason
        is printed).
    """
    if os.path.exists(FAISS_INDEX_FILE):
        print("\n✅ Loading FAISS index...")
        try:
            faiss_index = faiss.read_index(FAISS_INDEX_FILE)
        except Exception as e:
            print(f"❌ Error loading FAISS: {e}")
            return None
        print("✅ FAISS index loaded successfully!")
        return faiss_index

    print("\n⚠️ FAISS index not found. Creating new index...")
    try:
        # Read the CSV directly, the same way the main cell does.
        eml_df = pd.read_csv(EML_CSV_FILE)
    except (OSError, ValueError) as e:
        # OSError covers a missing/unreadable file; pandas' parser and
        # empty-data errors are ValueError subclasses.
        print(f"❌ Error loading EML CSV: {e}")
        print("❌ Failed to create FAISS index (CSV not loaded).")
        return None

    return create_faiss_index(eml_df)



# ✅ FAISS Search with Mapping
def search_faiss(embedding, faiss_index, mapping_df, top_k=5):
    """Look up the nearest indexed emails and attach their classifications.

    Args:
        embedding: Query vector; reshaped to a single float32 row.
        faiss_index: Object exposing ``search(query, k)`` that returns
            ``(distances, indices)``.
        mapping_df: DataFrame with 'File', 'request_type' and
            'sub_request_type' columns, addressed by position.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of ``(file_name, distance, request_type, sub_request_type)``
        tuples, one per hit whose index falls inside ``mapping_df``.
    """
    # FAISS expects a 2-D float32 query matrix
    query = embedding.reshape(1, -1).astype('float32')
    distances, indices = faiss_index.search(query, top_k)

    row_count = len(mapping_df)
    matches = []
    for idx, dist in zip(indices[0], distances[0]):
        # Hits outside the mapping (including negative ids) are dropped
        if not (0 <= idx < row_count):
            continue

        # ✅ Pull the classification for this hit from the mapping CSV
        row = mapping_df.iloc[idx]
        matches.append(
            (row['File'], dist, row['request_type'], row['sub_request_type'])
        )

    return matches


# ✅ Function to include CSV data as context
def get_csv_context(eml_df, max_rows=20):
    """Render a random sample of past classifications as prompt context.

    Args:
        eml_df: DataFrame with 'File', 'request_type' and
            'sub_request_type' columns.
        max_rows: Upper bound on the number of rows sampled.

    Returns:
        A text block starting with a "Past Classifications" header followed
        by one entry per sampled row.
    """
    # ✅ Random sample (capped at the frame size) for variety
    sample_size = min(max_rows, len(eml_df))
    sampled = eml_df.sample(n=sample_size)

    divider = "---------------------------------\n"
    entries = [
        f"Email: {record['File']}\n"
        f"Request Type: {record['request_type']}\n"
        f"Sub Request Type: {record['sub_request_type']}\n"
        + divider
        for _, record in sampled.iterrows()
    ]

    return "\n--- Past Classifications ---\n" + "".join(entries)


# ✅ Main Query Function
def query_eml(file_path, eml_df, index=None):
    """Classify one EML file, find similar indexed emails, and print a report.

    The email is classified by the LLM (with a sample of the CSV as context),
    embedded, and searched against the FAISS index. The report shows the LLM
    classification, the similar emails returned by FAISS, and the ground
    truth row from the CSV when the file name appears in it.

    Args:
        file_path: Path to the EML file to classify.
        eml_df: DataFrame with 'File', 'request_type' and
            'sub_request_type' columns.
        index: FAISS index to search. When omitted, the module-level
            ``faiss_index`` variable is used, as before.

    Returns:
        None. Results are printed.
    """
    if index is None:
        # Backward-compatible fallback to the global set up by the main cell.
        index = faiss_index

    print("\n✅ Extracting EML content...")
    eml_content = extract_eml_content(file_path)

    # ✅ Get CSV context
    csv_context = get_csv_context(eml_df)

    # ✅ Combine email content with CSV context
    full_prompt = f"{csv_context}\n--- New Email ---\n{eml_content}"

    print("\n✅ Classifying with Gemini Pro...")
    req_type, sub_req_type, reason = classify_with_gemini(full_prompt)

    print("\n✅ Generating embedding...")
    embedding = EMBED_MODEL.encode(eml_content)

    print("\n✅ Searching FAISS for similar emails...")
    results = search_faiss(embedding, index, eml_df)

    # ✅ Extract ground truth classification from CSV
    eml_file_name = os.path.basename(file_path)
    csv_match = eml_df[eml_df['File'] == eml_file_name]

    if csv_match.empty:
        csv_req_type = csv_sub_req_type = "Unknown"
    else:
        csv_req_type = csv_match['request_type'].values[0]
        csv_sub_req_type = csv_match['sub_request_type'].values[0]

    # === Display results
    print("\n🔍 **Query Results:**")
    print(f"🔹 **LLM Classification:**")
    print(f"   - Request Type: {req_type}")
    print(f"   - Sub Request Type: {sub_req_type}")
    print(f"   - Reason: {reason}\n")

    # The FAISS hits were previously computed and then thrown away.
    print(f"🔹 **Similar Emails (FAISS):**")
    if not results:
        print("   - None found")
    for file_name, dist, hit_req_type, hit_sub_req_type in results:
        print(f"   - {file_name} (distance: {dist:.4f}): {hit_req_type} / {hit_sub_req_type}")
    print()

    print(f"🔹 **Ground Truth from CSV:**")
    print(f"   - Request Type: {csv_req_type}")
    print(f"   - Sub Request Type: {csv_sub_req_type}")


classifier.py          # Gemini Pro classification functions


In [None]:
!pip install fuzzywuzzy




In [None]:
import os
import re
from fuzzywuzzy import fuzz
import google.generativeai as genai

# === Configuration ===
# Only fall back to the placeholder when no key is already exported, so a real
# GEMINI_API_KEY set in the environment is never overwritten by this cell.
os.environ.setdefault("GEMINI_API_KEY", "use gemini api key")  # Replace with your key
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# ✅ Flexible schema
# Maps each request type to the sub request types allowed under it.
# An empty list means the request type has no sub request types.
SCHEMA = {
    "Adjustment": [],
    "AU Transfer": [],
    "Closing Notice": ["Reallocation fees", "Amendment fees", "Reallocation principal"],
    "Commitment Change": ["Cashless roll", "Decrease", "Increase"],
    "Fee Payment": ["Ongoing fee", "Letter of credit fee"],
    "Money Movement-Inbound": ["Principal", "Interest", "Principal and Interest", "Principal, Interest, and Fee"],
    "Money Movement-Outbound": ["Timebound", "Foreign currency"]
}

def normalize_text(text):
    """Return *text* trimmed, lower-cased, with '-' and '_' turned into spaces."""
    trimmed_lower = text.strip().lower()
    separators_to_space = str.maketrans("-_", "  ")
    return trimmed_lower.translate(separators_to_space)

def fuzzy_match(target, candidates, threshold=85):
    """Return the candidate most similar to *target*, or "Unknown".

    Args:
        target: String to match.
        candidates: Iterable of candidate strings; may be empty.
        threshold: Minimum ``fuzz.ratio`` score for a match to be accepted.

    Returns:
        The best-scoring candidate (the first one on ties) when its score
        reaches ``threshold``; otherwise "Unknown". An empty ``candidates``
        also yields "Unknown" instead of raising.
    """
    candidates = list(candidates)
    if not candidates:
        # max() on an empty sequence would raise ValueError.
        return "Unknown"

    # Score each candidate once, then pick the highest.
    scored = [(fuzz.ratio(target, candidate), candidate) for candidate in candidates]
    best_score, best_match = max(scored, key=lambda pair: pair[0])
    if best_score >= threshold:
        return best_match
    return "Unknown"

def _extract_labeled_value(text, label_pattern, flags=0):
    """Return the value after ``<label>:`` in *text*, or None if absent.

    Markdown emphasis around the label or value (e.g. ``**Label:** value``)
    is tolerated and stripped from the result.
    """
    match = re.search(label_pattern + r"\**\s*:\s*\**\s*(.*)", text, flags)
    if match is None:
        return None
    return match.group(1).strip().strip("*`").strip()


def classify_with_gemini(eml_content):
    """Classify EML content with Gemini Pro and map the answer onto SCHEMA.

    The model's free-text reply is parsed for "Request Type", "Sub Request
    Type" and "Reason" lines. The parsed types are then fuzzy-matched
    against ``SCHEMA`` so the returned values use the schema's own spelling.

    Args:
        eml_content: Text to classify (email content, optionally preceded
            by context).

    Returns:
        A ``(request_type, sub_request_type, reason)`` tuple. Either type is
        "Unknown" when it cannot be matched to the schema; on any error the
        result is ``("Unknown", "Unknown", "No reason provided")``.
    """

    prompt = f"""
Classify the email into:
- **Request Type** and **Sub Request Type** using the following schema:
{SCHEMA}

- **Important:**
    - You **must classify** the email strictly according to the schema.
    - The `Request Type` should always be one of the main categories.
    - The `Sub Request Type` must be a valid subcategory under the corresponding `Request Type`.
    - If no match is found, respond with `Unknown` for both types.

- **Reason:**
    - Provide a clear reason for the classification.

Email content:
{eml_content}
"""

    try:
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)

        if not response or not response.text:
            return "Unknown", "Unknown", "No reason provided"

        # ✅ Extract values using regex
        # The look-behind stops "Request Type" from matching inside
        # "Sub Request Type" when the sub type happens to appear first.
        req_type = _extract_labeled_value(response.text, r"(?<![Ss]ub[ _-])Request Type")
        sub_req_type = _extract_labeled_value(response.text, r"Sub[ _-]Request Type")
        reason = _extract_labeled_value(response.text, r"Reason", re.DOTALL)

        if req_type is None:
            req_type = "Unknown"
        if sub_req_type is None:
            sub_req_type = "Unknown"
        if reason is None:
            reason = "No reason provided"

        # ✅ Normalize
        normalized_req = normalize_text(req_type)
        normalized_sub_req = normalize_text(sub_req_type)

        # ✅ Fuzzy match request type: keep the BEST-scoring category.
        # Several categories can clear the threshold at once (e.g. the
        # Inbound / Outbound money-movement pair), so taking whichever
        # matched last would pick the wrong one.
        matched_req_type = "Unknown"
        best_score = 0
        for req in SCHEMA:
            score = fuzz.ratio(normalized_req, normalize_text(req))
            if score >= 85 and score > best_score:
                matched_req_type, best_score = req, score

        # ✅ Fuzzy match sub-request type within the chosen category only,
        # then map the normalized hit back to the schema's spelling.
        matched_sub_req = "Unknown"
        subs = SCHEMA.get(matched_req_type, [])
        if subs:
            canonical_by_normalized = {normalize_text(sub): sub for sub in subs}
            hit = fuzzy_match(normalized_sub_req, list(canonical_by_normalized))
            matched_sub_req = canonical_by_normalized.get(hit, "Unknown")

        return matched_req_type, matched_sub_req, reason

    except Exception as e:
        # Best-effort by design: any API or parsing failure degrades to Unknown.
        print(f"❌ Error in Gemini Pro classification: {e}")
        return "Unknown", "Unknown", "No reason provided"


main.py                # Main script for querying and displaying results

In [None]:
# ✅ Load FAISS and EML CSV
EML_FILE = "/content/synthetic_eml_files/Fee_Payment_Loc_1.eml"
faiss_index = load_faiss()
try:
    eml_df = pd.read_csv(EML_CSV_FILE)
except (OSError, ValueError) as e:
    # pd.read_csv raises on failure rather than returning None, so without
    # this handler the failure branch below could never be reached.
    # OSError covers a missing/unreadable file; pandas' parser and empty-data
    # errors are ValueError subclasses.
    print(f"❌ Error loading EML CSV: {e}")
    eml_df = None

# ✅ Run Query
if faiss_index is not None and eml_df is not None:
    query_eml(EML_FILE, eml_df)
else:
    print("❌ Failed to load FAISS or EML CSV.")


✅ Loading FAISS index...
✅ FAISS index loaded successfully!

✅ Extracting EML content...

✅ Classifying with Gemini Pro...

✅ Generating embedding...

✅ Searching FAISS for similar emails...

🔍 **Query Results:**
🔹 **LLM Classification:**
   - Request Type: Fee Payment
   - Sub Request Type: ongoing fee
   - Reason: The email explicitly questions an ongoing "Line of Credit Fee" and seeks clarification on its nature and potential waiver. This aligns with the "Fee Payment" request type and the "Ongoing fee" sub-type.

🔹 **Ground Truth from CSV:**
   - Request Type: Unknown
   - Sub Request Type: Unknown
