<a href="https://colab.research.google.com/github/Sathvik816/Gundasaisathvik.github.io/blob/main/hackathon2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas eml-parser pdfplumber pymupdf pillow pytesseract python-docx faiss-cpu

In [6]:
import os
import json
import pandas as pd
import eml_parser
import base64
from io import BytesIO
from PIL import Image
import pytesseract
import fitz  # PyMuPDF for PDFs
from docx import Document

def load_labels(csv_path):
    """Load classification labels from CSV into a dictionary with cleaned paths"""
    df = pd.read_csv(csv_path, encoding='latin-1')

    # Clean the text and paths
    def clean_text(text):
        if pd.isna(text):
            return ''
        return (text
                .replace('â', '"')
                .replace('â', '"')
                .replace('â', '-')
                .replace('â¢', '*')
                .replace('â¢', '™')
                .replace('â¦', '...')
                .replace('â', "'")
                .strip('"').strip())

    df = df.applymap(clean_text)

    # Normalize file paths and names to lowercase
    df['File'] = df['File'].str.lower().str.strip()

    # Map full paths to labels
    label_map = {
        row['File']: {
            'request_type': row['request_type'],
            'sub_request_type': row['sub_request_type'],
            'reason': row['reason']
        } for _, row in df.iterrows()
    }

    return label_map


def extract_eml_content(eml_file_path):
    """Extract text and attachments from EML file"""
    with open(eml_file_path, 'rb') as f:
        raw_email = f.read()

    parser = eml_parser.EmlParser(include_raw_body=True)
    parsed_eml = parser.decode_email_bytes(raw_email)

    # Extract EML text content
    eml_text = ''
    if 'body' in parsed_eml:
        for body in parsed_eml['body']:
            eml_text += body.get('content', '') + '\n'

    # Extract attachments
    attachments = []
    if 'attachment' in parsed_eml:
        for att in parsed_eml['attachment']:
            att_name = att.get('filename')
            att_data = att.get('raw') or att.get('data')

            if att_data is None:
                print(f"Warning: Could not find attachment data for {att_name}")
                continue

            att_data = base64.b64decode(att_data)
            att_text = extract_attachment_text(att_name, att_data)

            attachments.append({
                'filename': att_name,
                'text': att_text
            })

    return {
        'file': eml_file_path.lower(),  # Use full lowercase path for matching
        'text': eml_text,
        'attachments': attachments
    }


def extract_attachment_text(filename, data):
    """Extract text from different attachment types"""
    ext = filename.split('.')[-1].lower()

    if ext in ['jpg', 'jpeg', 'png']:
        return extract_image_text(data)
    elif ext == 'pdf':
        return extract_pdf_text(data)
    elif ext in ['doc', 'docx']:
        return extract_doc_text(data)
    else:
        return f"Unsupported attachment type: {ext}"


def extract_image_text(data):
    """Extract text from image attachments using OCR"""
    try:
        image = Image.open(BytesIO(data))
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Failed to extract image text: {str(e)}"


def extract_pdf_text(data):
    """Extract text from PDF attachments"""
    try:
        with fitz.open(stream=BytesIO(data), filetype='pdf') as pdf:
            text = ''
            for page_num in range(len(pdf)):
                text += pdf[page_num].get_text()
            return text
    except Exception as e:
        return f"Failed to extract PDF text: {str(e)}"


def extract_doc_text(data):
    """Extract text from DOC/DOCX attachments"""
    try:
        with open("temp.docx", "wb") as temp_file:
            temp_file.write(data)

        doc = Document("temp.docx")
        text = '\n'.join([para.text for para in doc.paragraphs])

        os.remove("temp.docx")
        return text
    except Exception as e:
        return f"Failed to extract DOC text: {str(e)}"


def process_eml_folder_with_labels(folder_path, labels_csv, output_file):
    """Process EML files and include classification labels with reasons"""

    # Load labels with cleaned paths
    label_map = load_labels(labels_csv)

    eml_data = []

    for root, _, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.eml'):
                eml_file = os.path.join(root, filename)
                print(f"Processing {eml_file}...")

                # Extract content
                eml_content = extract_eml_content(eml_file)

                # Match by full path
                labels = label_map.get(eml_file, {
                    'request_type': 'Unknown',
                    'sub_request_type': 'Unknown',
                    'reason': 'No reason provided'
                })

                # Add labels to content
                eml_content.update(labels)
                eml_data.append(eml_content)

    # Save everything to JSON
    with open(output_file, 'w') as f:
        json.dump(eml_data, f, indent=4)

    print(f"\n✅ Extraction completed. Data saved to {output_file}")


# Example usage
process_eml_folder_with_labels(
    '/content/synthetic_eml_files',  # Folder containing EML files
    '/content/CSV files/eml_classification_mapping.csv',  # CSV path
    'classified_eml_data.json'  # Output JSON
)


  df = df.applymap(clean_text)


Processing /content/synthetic_eml_files/variant_2_Fee_Payment_Ongoing_Fee_1.eml...
Processing /content/synthetic_eml_files/sample1_variant_3.eml...
Processing /content/synthetic_eml_files/variant_2_Fee_Payment_Ongoing_Fee_2.eml...
Processing /content/synthetic_eml_files/Fee_Payment_Ongoing_Fee_3.eml...
Processing /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile1.eml...
Processing /content/synthetic_eml_files/Fee_Payment_Loc_1.eml...
Processing /content/synthetic_eml_files/Money_Movement_Outbound_Timebound_2.eml...
Processing /content/synthetic_eml_files/Money_Movement_Outbound_Foreign_Currency_1.eml...
Processing /content/synthetic_eml_files/CommitmentChangeCashlessSamplefile4.eml...
Processing /content/synthetic_eml_files/Money_Movement_Outbound_Timebound_1.eml...
Processing /content/synthetic_eml_files/sample1_variant_8.eml...
Processing /content/synthetic_eml_files/sample1_variant_6.eml...
Processing /content/synthetic_eml_files/sample2_variant_10.eml...
Processing /cont

In [7]:
import json

with open('classified_eml_data.json', 'r') as f:
    data = json.load(f)

print(data)

[{'file': '/content/synthetic_eml_files/variant_2_fee_payment_ongoing_fee_1.eml', 'text': 'Description: Clarification on Ongoing Fee Charges – Account #123456\n\n\nDear Himalaya Bank Customer Support,\n\nI noticed an ongoing fee of $37 deducted from my account this month. Could you please provide a detailed breakdown of this fee?\n\nAdditionally, I would like to confirm:\n\nIf there are any changes to my ongoing fee structure.\n\nIf I qualify for any fee waivers or discounts.\n\nPlease let me know if you need any further information.\n\nWith Regards,\nOlivia Wilson\nTelephone #:\n\n\n', 'attachments': [], 'request_type': 'Unknown', 'sub_request_type': 'Unknown', 'reason': 'No reason provided'}, {'file': '/content/synthetic_eml_files/sample1_variant_3.eml', 'text': 'Deal CUSIP: 30303CCD2\nDeal ISIN: UK40404DDE38\nFacility CUSIP: 50505EEF4\nLender MEI: EU8K907234\n\nEffective 15-Mar-2025, Sophia White LP has elected to repay a total of 8130\n\nCurrent Principal Balance: USD 47031\nNew Pr

In [8]:
!pip install faiss-cpu sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [12]:
import json
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import pickle

# === Load EML and CSV Data ===

def load_eml_data(json_file):
    """Load EML data from a JSON file"""
    with open(json_file, 'r') as f:
        eml_data = json.load(f)
    return eml_data


def load_csv_mapping(csv_file):
    """Load CSV mapping into a list of dictionaries"""
    df = pd.read_csv(csv_file)
    mapping = df.to_dict(orient='records')  # Convert to list of dictionaries
    return mapping


# === FAISS Index Functions ===

def create_faiss_index(embedding_dim):
    """Initialize a FAISS index for storing vectors"""
    index = faiss.IndexFlatL2(embedding_dim)
    return index


# === Vectorizing EML + CSV Data ===

def vectorize_eml_with_csv(eml_data, csv_mapping, model):
    """Convert EML + CSV content into embeddings"""
    index = create_faiss_index(embedding_dim=384)
    metadata = []

    for idx, eml in enumerate(eml_data):
        # Combine EML text and attachment text
        full_text = eml['text']
        for attachment in eml['attachments']:
            full_text += attachment['text'] + '\n'

        # Add CSV reason based on request_type & sub_request_type
        csv_reason = ''
        for mapping in csv_mapping:
            if (mapping['request_type'] == eml['request_type'] and
                mapping['sub_request_type'] == eml['sub_request_type']):
                csv_reason = mapping.get('reason', '')
                break

        # Combine EML text with CSV reason
        combined_text = f"{full_text}\nReason: {csv_reason}"

        # Generate embeddings
        vector = model.encode(combined_text).reshape(1, -1)
        index.add(vector)

        # Store metadata
        metadata.append({
            'file': eml['file'],
            'request_type': eml['request_type'],
            'sub_request_type': eml['sub_request_type'],
            'reason': csv_reason,
            'text': combined_text
        })

        print(f"✅ Vectorized {eml['file']} with CSV reason")

    return index, metadata


# === Saving the FAISS Index and Metadata ===

def save_faiss_index(index, metadata, index_file='faiss_index.bin', metadata_file='metadata.pkl'):
    """Save FAISS index and metadata"""
    faiss.write_index(index, index_file)

    # Save metadata separately
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)

    print(f"\n✅ FAISS index saved to {index_file}")
    print(f"✅ Metadata saved to {metadata_file}")


# === RUN THE VECTORIZATION PROCESS ===

# Load EML and CSV data
eml_data = load_eml_data('classified_eml_data.json')
csv_mapping = load_csv_mapping('/content/CSV files/eml_classification_mapping.csv')  # New CSV mapping file

# Load Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Vectorize combined EML + CSV content
faiss_index, metadata = vectorize_eml_with_csv(eml_data, csv_mapping, model)

# Save the FAISS index and metadata
save_faiss_index(faiss_index, metadata)


✅ Vectorized /content/synthetic_eml_files/variant_2_fee_payment_ongoing_fee_1.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/sample1_variant_3.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/variant_2_fee_payment_ongoing_fee_2.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/fee_payment_ongoing_fee_3.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/adjustmentrequesttypesamplefile1.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/fee_payment_loc_1.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/money_movement_outbound_timebound_2.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/money_movement_outbound_foreign_currency_1.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/commitmentchangecashlesssamplefile4.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/money_movement_outbound_timebound_1.eml with CSV reason
✅ Vectorized /content/synthetic_eml_files/sample1_variant_8.em

STEP 3 INTEGRATION WITH GEMINI PRO

In [13]:
!pip install google-generativeai pandas



In [14]:
import os
import pandas as pd
import google.generativeai as genai
import json
import re
import time
from google.api_core.exceptions import ResourceExhausted

# === Configure Gemini Pro ===
GEMINI_API_KEY = "AIzaSyDl4zitAJPnmRXLwgpeVzSDAvqIxqKg75g"  # Replace with your Gemini Pro API key
genai.configure(api_key=GEMINI_API_KEY)

# === Load the EML-to-classification mapping CSV ===
input_csv = "/content/CSV files/eml_classification_mapping.csv"
df = pd.read_csv(input_csv)

# === Clean the 'File' column in the CSV ===
df['File'] = df['File'].str.replace('“', '').str.replace('”', '')
df.to_csv(input_csv, index=False)

# === Read EML content function ===
def read_eml_content(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print(f"⚠️ File not found: {file_path}")
        return ""
    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")
        return ""

# === Generate LLM prompt ===
def generate_prompt(eml_content):
    return f"""
You are an expert email classifier with domain knowledge of financial operations.

Here is the content of an email:

--- BEGIN EMAIL CONTENT ---
{eml_content}
--- END EMAIL CONTENT ---

Classify the email into:
- **Request Type** and **Sub Request Type** with specific mappings:
    - **Adjustment**
        - Fee correction
        - Interest correction
    - **AU Transfer**
        - Transfer between accounts
        - Transfer to external bank
    - **Closing Notice**
        - Reallocation fees
        - Amendment fees
        - Reallocation principal
    - **Commitment Change**
        - Cashless roll
        - Decrease
        - Increase
    - **Fee Payment**
        - Ongoing fee
        - Letter of credit fee
    - **Money Movement-Inbound**
        - Principal
        - Interest
        - Principal and Interest
        - Principal, Interest, and Fee
    - **Money Movement-Outbound**
        - Timebound
        - Foreign currency

- **Reason:** Provide a clear reason why the email is classified into the selected request type and sub-request type, including any relevant details, terms, or references from the email content.

📌 **Return the output in the following format:**

json {{ "Request Type": "", "Sub Request Type": "", "Reason": "" }}
"""

# === Rate limiting and retries ===
def classify_eml_with_llm(eml_file, max_retries=5, base_delay=2):
    """Classify the EML content using Gemini Pro with rate limiting and retries."""
    eml_content = read_eml_content(eml_file)

    if not eml_content:
        return "Error", "Error", "Failed to read EML"

    prompt = generate_prompt(eml_content)

    retry_count = 0

    while retry_count < max_retries:
        try:
            model = genai.GenerativeModel("gemini-1.5-pro-latest")
            response = model.generate_content(prompt)

            json_match = re.search(r'\{.*\}', response.text, re.DOTALL)

            if json_match:
                json_str = json_match.group(0)
                try:
                    classification_result = json.loads(json_str)
                    return (
                        classification_result.get("Request Type", "Unknown"),
                        classification_result.get("Sub Request Type", "Unknown"),
                        classification_result.get("Reason", "No reason provided")
                    )
                except json.JSONDecodeError:
                    print(f"⚠️ Failed to parse JSON for {eml_file}: {json_str}")
                    return "Error", "Error", "Invalid JSON format"
            else:
                print(f"⚠️ No JSON response for {eml_file}. Response: {response.text}")
                return "Error", "Error", "No JSON returned"

        except ResourceExhausted:
            print(f"⚠️ Quota exceeded. Retrying in {base_delay} seconds...")
            time.sleep(base_delay)
            retry_count += 1
            base_delay *= 2  # Exponential backoff

        except Exception as e:
            print(f"❌ Error processing {eml_file}: {e}")
            return "Error", "Error", "Error"

    print(f"❌ Max retries reached for {eml_file}. Skipping.")
    return "Error", "Error", "Quota exceeded"

# === Process all EML files ===
output_data = []

for index, row in df.iterrows():
    eml_file = row["File"].strip()

    if not os.path.isfile(eml_file):
        print(f"⚠️ Skipping missing file: {eml_file}")
        output_data.append({
            "file": eml_file,
            "llm_request_type": "Error",
            "llm_sub_request_type": "Error",
            "llm_reason": "File not found"
        })
        continue

    request_type, sub_request_type, reason = classify_eml_with_llm(eml_file)

    output_data.append({
        "file": eml_file,
        "llm_request_type": request_type,
        "llm_sub_request_type": sub_request_type,
        "llm_reason": reason
    })

    print(f"✅ Classified: {eml_file} → {request_type}, {sub_request_type}, Reason: {reason}")

# === Save LLM classifications to CSV ===
output_df = pd.DataFrame(output_data)
output_csv = "eml_classification_with_llm.csv"
output_df.to_csv(output_csv, index=False)
print(f"\n✅ LLM classifications saved to: {output_csv}")


✅ Classified: /content/synthetic_eml_files/sample1_variant_2.eml → Money Movement-Inbound, Principal, Reason: The email clearly states that James Lee LP has "elected to repay a total of 11794" and provides details of the current and new principal balances.  This indicates a principal repayment towards a loan. While the exact amount repaid isn't explicitly stated (the difference between old and new principal is larger than 11794), the core action is a principal payment being received by the bank.
✅ Classified: /content/synthetic_eml_files/sample1_variant_3.eml → Money Movement-Inbound, Principal, Reason: The email clearly states "Sophia White LP has elected to repay a total of 8130" and provides details of the current and new principal balances.  The attachment titled "Payment_Receipt.pdf" further supports this being a principal repayment. While the email mentions a "Loan", there is no mention of interest or fees being paid, focusing solely on the change in principal balance.  This alig



❌ Error processing /content/synthetic_eml_files/sample1_variant_5.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample1_variant_5.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample1_variant_6.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample1_variant_6.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample1_variant_7.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample1_variant_7.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample1_variant_8.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample1_variant_8.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample1_variant_9.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample1_variant_9.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample1_variant_10.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample1_variant_10.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_2.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_2.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_3.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_3.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_4.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_4.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_5.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_5.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_6.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_6.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_7.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_7.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_8.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_8.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_9.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_9.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/sample2_variant_10.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/sample2_variant_10.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile1.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile1.eml → Error, Error, Reason: Error




❌ Error processing /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile2.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile2.eml → Error, Error, Reason: Error
❌ Error processing /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile3.eml: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
✅ Classified: /content/synthetic_eml_files/AdjustmentRequestTypeSampleFile3.eml → Error, Error, 



 VECTORISE THE EML CONTENT WITH THE CLASSIFICATION POST LLM OUTPUT

In [None]:
!pip install faiss-cpu sentence-transformers pandas


In [16]:
import os
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# === Configuration ===
eml_dir = "synthetic_eml_files"
csv_file = "/content/CSV files/eml_classification_mapping.csv"
faiss_index_file = os.path.join("/content/FAISS", "faiss_index_with_llm_classifications.idx")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and fast model

# === Load the CSV with LLM classifications ===
df = pd.read_csv(csv_file)

# === Function to read EML content ===
def read_eml_content(file_path):
    """Reads the content of an EML file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

# === Create embeddings with LLM classifications ===
texts = []
file_paths = []

for _, row in df.iterrows():
    eml_file = row["File"]
    eml_path = os.path.join(eml_dir, os.path.basename(eml_file))

    # Read EML content
    eml_content = read_eml_content(eml_path)

    # Combine with LLM classifications
    llm_request_type = row["request_type"]
    llm_sub_request_type = row["sub_request_type"]
    llm_reason = row["reason"]

    # Create a combined text representation
    combined_text = f"""
    Email Content:
    {eml_content}

    LLM Classification:
    - Request Type: {request_type}
    - Sub Request Type: {sub_request_type}
    - Reason: {reason}
    """

    texts.append(combined_text)
    file_paths.append(eml_file)

# === Generate embeddings ===
print("\n✅ Generating embeddings with LLM classifications...")
embeddings = model.encode(texts, show_progress_bar=True)

# === Create FAISS index ===
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Convert embeddings to FAISS format and add to index
faiss_index = faiss.IndexIDMap(index)
faiss_index.add_with_ids(np.array(embeddings), np.arange(len(texts)))

# === Save the FAISS index ===
faiss.write_index(faiss_index, faiss_index_file)
print(f"\n✅ FAISS index saved to: {faiss_index_file}")

# === Save the mapping of EML files and IDs
mapping_file = os.path.join("/content/FAISS", "faiss_mapping_with_llm.csv")
mapping_df = pd.DataFrame({"file": file_paths, "id": np.arange(len(file_paths))})
mapping_df.to_csv(mapping_file, index=False)
print(f"\n✅ Mapping saved to: {mapping_file}")



✅ Generating embeddings with LLM classifications...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


✅ FAISS index saved to: /content/FAISS/faiss_index_with_llm_classifications.idx

✅ Mapping saved to: /content/FAISS/faiss_mapping_with_llm.csv



FINAL STEP




In [17]:
!pip install google-generativeai faiss-cpu sentence-transformers pandas
!pip install -U google-generativeai





In [20]:
import os
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from google.generativeai import GenerativeModel
from email import message_from_file

# === Configurations ===
eml_dir = "synthetic_eml_files"
faiss_index_file = "/content/FAISS/faiss_index_with_llm_classifications.idx"
mapping_file = "/content/FAISS/faiss_mapping_with_llm.csv"

# === Google Gemini Pro Configuration ===
import google.generativeai as genai

# ✅ Environment variable for security
os.environ["GEMINI_API_KEY"] = "AIzaSyDl4zitAJPnmRXLwgpeVzSDAvqIxqKg75g" # Replace with your key
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# === Load FAISS index and mapping ===
print("\n✅ Loading FAISS index...")
faiss_index = faiss.read_index(faiss_index_file)
mapping_df = pd.read_csv(mapping_file)

# === Load Sentence Transformer model ===
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# === Function to extract text from EML ===
def extract_eml_content(file_path):
    """Extracts text from an EML file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        msg = message_from_file(f)

    subject = msg.get('Subject', '')
    body = ""

    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
    else:
        body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')

    return f"Subject: {subject}\n\n{body}"

# === Gemini Classification ===
def classify_with_gemini(eml_content):
    """Classifies the EML content using Gemini Pro."""

    prompt = f"""
    Classify the following email into:
    - Request Type
    - Sub Request Type
    - Reason for classification

    Email content:
    {eml_content}
    """

    try:
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)

        if response and response.text:
            lines = response.text.split("\n")

            # Extract request type, sub-request type, and reason
            request_type = "Unknown"
            sub_request_type = "Unknown"
            reason = "No reason provided"

            for line in lines:
                if "Request Type:" in line:
                    request_type = line.split(":")[1].strip()
                elif "Sub Request Type:" in line:
                    sub_request_type = line.split(":")[1].strip()
                else:
                    reason = "\n".join(lines[2:]).strip()

            return request_type, sub_request_type, reason

        else:
            return "Unknown", "Unknown", "No reason provided"

    except Exception as e:
        print(f"❌ Error in Gemini Pro classification: {e}")
        return "Unknown", "Unknown", "No reason provided"

# === Function to perform FAISS search ===
def search_faiss(embedding, top_k=3):
    """Search FAISS for similar emails."""
    distances, ids = faiss_index.search(np.array([embedding]), top_k)

    results = []
    for dist, idx in zip(distances[0], ids[0]):
        if idx == -1:
            continue

        file_name = mapping_df.loc[mapping_df['id'] == idx, 'file'].values[0]

        # Check if the required columns exist, and if not, skip the row
        if 'request_type' not in mapping_df.columns or 'sub_request_type' not in mapping_df.columns:
            print(f"Warning: Skipping row with id {idx} as it doesn't have the required columns.")
            continue

        # Retrieve labels from CSV, corrected column name
        stored_label = mapping_df.loc[mapping_df['id'] == idx, ['request_type', 'sub_request_type']].values[0]
        req_type, sub_req_type = stored_label

        results.append((file_name, dist, req_type, sub_req_type))

    return results

# === Query with EML file ===
def query_eml(file_path):
    """Classifies a new EML file and compares LLM classification with FAISS/CSV."""

    print("\n✅ Extracting content from EML...")
    eml_content = extract_eml_content(file_path)

    print("\n✅ Classifying with Gemini Pro...")
    req_type, sub_req_type, reason = classify_with_gemini(eml_content)

    print("\n✅ Generating embedding for EML content...")
    embedding = embed_model.encode(eml_content)

    print("\n✅ Searching FAISS for similar emails...")
    results = search_faiss(embedding)

    # === Display the results ===
    print("\n🔍 **Query Results:**\n")

    print(f"🔹 **LLM Classification:**")
    print(f"   - Request Type: {req_type}")
    print(f"   - Sub Request Type: {sub_req_type}")
    print(f"   - Reason: {reason}")

    print("\n🔹 **Closest Matching Emails with CSV Labels:**")
    if results:
        for file_name, dist, csv_req_type, csv_sub_req_type in results:
            print(f"   - {file_name} (Distance: {dist:.4f})")
            print(f"     - CSV Request Type: {csv_req_type}")
            print(f"     - CSV Sub Request Type: {csv_sub_req_type}")

            # ✅ Check for mismatches
            if (req_type != csv_req_type) or (sub_req_type != csv_sub_req_type):
                print("⚠️ **Mismatch Detected!**")
                print(f"   - LLM: {llm_req_type} / {llm_sub_req_type}")
                print(f"   - CSV: {csv_req_type} / {csv_sub_req_type}")
            print("-----------------------------------------------------")

    else:
        print("⚠️ No matching emails found in FAISS index.")

# === Example Usage ===
# Specify the path to your new EML file for testing
new_eml_file = "/content/synthetic_eml_files/sample1_variant_1.eml" # Replace with your file
query_eml(new_eml_file)


✅ Loading FAISS index...

✅ Extracting content from EML...

✅ Classifying with Gemini Pro...

✅ Generating embedding for EML content...

✅ Searching FAISS for similar emails...

🔍 **Query Results:**

🔹 **LLM Classification:**
   - Request Type: ** Partial Prepayment
   - Sub Request Type: Unknown
   - Reason: * **Reason for classification:** The email explicitly states that David Martinez LP has "elected to repay a total of 7689".  While it doesn't specify the currency, the context (USD amounts elsewhere) strongly implies USD.  Since the current principal is higher than the new principal, this signifies a partial prepayment rather than a full repayment. The email provides all the necessary identifying information related to the loan, including lender, deal, facility identifiers, and effective dates.

🔹 **Closest Matching Emails with CSV Labels:**
⚠️ No matching emails found in FAISS index.
