In [1]:
!pip install raganything

Collecting raganything
  Downloading raganything-1.2.8-py3-none-any.whl.metadata (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.3/54.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightrag-hku (from raganything)
  Downloading lightrag_hku-1.4.8.2-py3-none-any.whl.metadata (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mineru[core] (from raganything)
  Downloading mineru-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting configparser (from lightrag-hku->raganything)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting dotenv (from lightrag-hku->raganything)
  Downloading dotenv-0.9.9-py2.py3-none

# Email RAG Pipeline

This notebook demonstrates a simple RAG (Retrieval-Augmented Generation) pipeline for processing downloaded Outlook emails.


In [2]:
import os
import email
import json
from pathlib import Path
from typing import Dict, List, Any
import pandas as pd

# For vector operations (you'll need to install these)
import numpy as np
#from sentence_transformers import SentenceTransformer
#import faiss

import asyncio
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc


## 1. Load Email Files from Local Directory

Load your downloaded Outlook email files (.eml, .msg, or .pst files) from a local directory


In [3]:
import os
from pathlib import Path
from google.colab import drive

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Point to the directory in Drive where your emails are stored
# Make sure you created this folder inside your Drive and uploaded .eml/.msg/.pst files there
email_directory = "/content/drive/My Drive/MMV_email_rag/raw_emails"
rag_storage_directory = "/content/drive/My Drive/MMV_email_rag/rag_storage"
email_texts_directory = "/content/drive/My Drive/MMV_email_rag/email_texts"

# Step 3: Collect all email paths
email_paths = {}
email_files = []

if os.path.exists(email_directory):
    for ext in ['*.eml', '*.msg', '*.pst']:
        email_files.extend(Path(email_directory).glob(ext))

    for i, file_path in enumerate(email_files):
        email_paths[f"Email-{i+1}"] = str(file_path)

    print(f"Found {len(email_paths)} email files in {email_directory}:")
    for name, path in email_paths.items():
        print(f"  {name}: {os.path.basename(path)}")
else:
    print(f"Directory {email_directory} not found!")
    print("Please create the directory in Google Drive and upload your email files there.")
    print("Example: Place your files in 'My Drive/raw_emails'")

Mounted at /content/drive
Found 9 email files in /content/drive/My Drive/MMV_email_rag/raw_emails:
  Email-1: First Merchants Bank Royal Oak_ Invitation to bid on First Merchants Bank.eml
  Email-2: Invitation to Bid - Burlington - Louisville, KY _ Bids Due 10_15_25.eml
  Email-3: Fw_ Follow Up_ Invitation To Bid - Heartland Dental - Shelby Township, MI TI.eml
  Email-4: RE_ Invitation to Bid -  Bath & Body Works #5027 - Burton, MI.eml
  Email-5: INVITATION TO BID_ AutoZone - Weston, WV.eml
  Email-6: Fw_ Grosse Ile Country Club Pool House Renovation Invitation To Bid.eml
  Email-7: Invitation to Bid - Tractor Supply - Johnstown, OH.eml
  Email-8: Fw_ Kresge Eye Institute - bathrooms renovation; 29201 Telegraph Road, Southfield MI 48034.eml
  Email-9: Fw_ INVITATION TO BID - 115_230 E. Hudson Avenue, Royal Oak MI 48067.eml


## 2. Email Parsing Functions


In [4]:
import os
import re
from pathlib import Path
from email import policy
from email.parser import BytesParser


def eml_to_text(eml_path):
    """Convert an .eml file into a flattened string."""
    with open(eml_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)

    subject = (msg.get('Subject') or '').strip()
    sender = (msg.get('From') or '').strip()
    recipients = ", ".join(msg.get_all('To', []) or [])
    date = (msg.get('Date') or '').strip()

    # Extract body (prefer plain text, fallback to stripped HTML)
    body_text = ""
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_disposition() == 'attachment':
                continue
            if part.get_content_type() == 'text/plain':
                body_text += part.get_content() or ''
            elif part.get_content_type() == 'text/html' and not body_text:
                html = part.get_content() or ''
                body_text = re.sub(r'<[^>]+>', ' ', html)
    else:
        body_text = msg.get_content() or ''

    body_text = re.sub(r'\s+', ' ', body_text).strip()

    email_text = (
        f"subject: {subject}\n"
        f"from: {sender}\n"
        f"to: {recipients}\n"
        f"date: {date}\n\n"
        f"body:\n{body_text}\n"
    )
    return email_text, subject


def save_emails_as_txt(email_dir, output_dir):
    """Convert .eml files into plain text .txt files for RAG ingestion."""
    os.makedirs(output_dir, exist_ok=True)
    email_files = list(Path(email_dir).glob("*.eml"))

    saved_txts = []

    for i, eml_file in enumerate(email_files, 1):
        email_text, subject = eml_to_text(eml_file)

        safe_subject = re.sub(r'[\\/*?:"<>|]', "_", subject).strip() or f"Email_{i}"
        txt_filename = os.path.join(output_dir, f"{safe_subject}_{i}.txt")

        with open(txt_filename, "w", encoding="utf-8") as f:
            f.write(email_text)

        saved_txts.append(txt_filename)
        print(f"Saved {txt_filename}")

    return saved_txts

saved_text_files = save_emails_as_txt(email_directory, email_texts_directory)

print("\nAll saved text files:")
for txt in saved_text_files:
    print(txt)

Saved /content/drive/My Drive/MMV_email_rag/email_texts/First Merchants Bank Royal Oak_ Invitation to bid on First Merchants Bank_1.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/Invitation to Bid - Burlington - Louisville, KY _ Bids Due 10_15_25_2.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/Fw_ Follow Up_ Invitation To Bid - Heartland Dental - Shelby Township, MI TI_3.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/RE_ Invitation to Bid -  Bath & Body Works #5027 - Burton, MI_4.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/INVITATION TO BID_ AutoZone - Weston, WV_5.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/Fw_ Grosse Ile Country Club Pool House Renovation Invitation To Bid_6.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/Invitation to Bid - Tractor Supply - Johnstown, OH_7.txt
Saved /content/drive/My Drive/MMV_email_rag/email_texts/Fw_ Kresge Eye Institute - bathrooms renovation; 29201 Telegraph Road

info i need to get out of email (including pdf's later, for now just text) for lead input in sage:

1. name of project
2. bid due date and time
3. address/location of project
4. general contractor (origin of email)
5. contact of general contractor (person who sent the email's first and last name)
6. anything to take note of (site walkthrough time, estimated start date of construction, etc)

In [5]:
from pathlib import Path
from google.colab import userdata

async def example_raganything():
    api_key = userdata.get('OPENAI_API_KEY')

    # Config
    config = RAGAnythingConfig(
        working_dir=rag_storage_directory,
        parser="mineru",
        parse_method="txt",   # Force text parsing
        enable_image_processing=False,
        enable_table_processing=False,
        enable_equation_processing=False,
    )

    # LLM function
    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
        return openai_complete_if_cache(
            "gpt-4o-mini",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            **kwargs,
        )

    # Embedding function
    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        max_token_size=8192,
        func=lambda texts: openai_embed(
            texts,
            model="text-embedding-3-large",
            api_key=api_key,
        ),
    )

    # Init RAG
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=None,
        embedding_func=embedding_func,
    )

    # Grab first .txt email
    txt_files = list(Path(email_texts_directory).glob("*.txt"))
    if not txt_files:
        print("No TXT email files found in", email_texts_directory)
        return
    txt_path = str(txt_files[0])
    print(f"Processing TXT email: {txt_path}")

    # Process as plain text
    await rag.process_document_complete(
        file_path=txt_path,
        output_dir=rag_storage_directory,
        parse_method="txt"
    )

    # Query
    query = "What project is this email about?"
    result = await rag.aquery(query, mode="hybrid")
    print("Query:", query)
    print("Result:", result)

await example_raganything()

INFO: RAGAnything initialized with config:
INFO:   Working directory: /content/drive/My Drive/MMV_email_rag/rag_storage
INFO:   Parser: mineru
INFO:   Parse method: txt
INFO:   Multimodal processing - Image: False, Table: False, Equation: False
INFO:   Max concurrent files: 1


Processing TXT email: /content/drive/My Drive/MMV_email_rag/email_texts/First Merchants Bank Royal Oak_ Invitation to bid on First Merchants Bank_1.txt


INFO: Parser 'mineru' installation verified
INFO: Initializing LightRAG with parameters: {'working_dir': '/content/drive/My Drive/MMV_email_rag/rag_storage'}
INFO: [_] Created new empty graph fiel: /content/drive/My Drive/MMV_email_rag/rag_storage/graph_chunk_entity_relation.graphml
INFO: [_] Process 213 KV load full_docs with 0 records
INFO: [_] Process 213 KV load text_chunks with 0 records
INFO: [_] Process 213 KV load full_entities with 0 records
INFO: [_] Process 213 KV load full_relations with 0 records
INFO: [_] Process 213 KV load llm_response_cache with 0 records
INFO: [_] Process 213 doc status load doc_status with 0 records
INFO: [_] Process 213 KV load parse_cache with 2 records
INFO: Multimodal processors initialized with context support
INFO: Available processors: ['generic']
INFO: Context configuration: ContextConfig(context_window=1, context_mode='page', max_context_tokens=2000, include_headers=True, include_captions=True, filter_content_types=['text'])
INFO: LightRAG, 

Query: What project is this email about?
Result: The email is about the **First Merchants Bank Royal Oak** project. It is an invitation to bid on this project, which is being managed by PCI Industries, Inc. The email provides details about the bid due date and offers methods for bid submission. 

**References:**
- [KG] First Merchants Bank Royal Oak
- [KG] PCI Industries, Inc ~ First Merchants Bank Royal Oak
- [DC] First Merchants Bank Royal Oak_ Invitation to bid on First Merchants Bank_1.txt
