Extracting Email Content

In [None]:
import imapclient
import email
from email import policy
from email.header import decode_header
import os
import pdfplumber
from PIL import Image
import pandas as pd

In [25]:
from imapclient import IMAPClient
def connect_to_email(username, password, imap_server):
    """Open an SSL IMAP connection and log in.

    Args:
        username: Account name passed to ``login``.
        password: Password (or app password) passed to ``login``.
        imap_server: Host name of the IMAP server to connect to.

    Returns:
        The logged-in ``IMAPClient`` instance. The caller is responsible for
        logging out when finished.
    """
    # Honour the caller-supplied host instead of a hard-coded Gmail address,
    # so the function works for whichever server the caller asks for.
    client = imapclient.IMAPClient(imap_server, ssl=True)
    client.login(username, password)
    return client

In [26]:
from email import message_from_bytes
from email.policy import default as email_policy

def fetch_emails(client, folder="INBOX", max_emails=10):
    """Lazily yield parsed unread messages, highest UID first.

    Args:
        client: Object exposing ``select_folder``, ``search`` and ``fetch``
            (an IMAPClient-style connection).
        folder: Mailbox folder to select before searching.
        max_emails: Upper bound on the number of messages yielded.

    Yields:
        Messages parsed with ``message_from_bytes`` under ``email_policy``.

    NOTE(review): the ``X-GM-RAW`` criterion looks Gmail-specific, and fetching
    ``RFC822`` may mark messages as read on the server. Confirm both against
    the target mail provider.
    """
    client.select_folder(folder)

    # Unread mail in the "primary" category only.
    unread_uids = client.search(['X-GM-RAW', 'category:primary is:unread'])

    # Highest UID first, so the most recently delivered mail is handled first.
    newest_first = sorted(unread_uids, reverse=True)

    for position, uid in enumerate(newest_first):
        if position >= max_emails:
            # Limit reached: stop before issuing another fetch.
            return
        fetched = client.fetch(uid, "RFC822")
        raw_bytes = fetched[uid][b"RFC822"]
        yield message_from_bytes(raw_bytes, policy=email_policy)


In [27]:
import re
def clean_email_text(email_body):
    """Strip ``[Attachment: ...]`` markers and collapse all whitespace.

    Args:
        email_body: Text to normalise.

    Returns:
        The text with every ``[Attachment: ...]`` marker removed and every run
        of whitespace (including newlines and tabs) reduced to a single space,
        with no leading or trailing whitespace.
    """
    marker_pattern = re.compile(r'\[Attachment: .*?\]')
    without_markers = marker_pattern.sub('', email_body)
    # split() with no argument drops empty tokens, so re-joining collapses runs.
    tokens = without_markers.split()
    return ' '.join(tokens)

In [28]:
def _decode_text_part(part):
    """Return the decoded text of a single non-multipart message part."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        # errors="replace" keeps a single bad byte from aborting the email.
        return payload.decode(charset, errors="replace")
    except LookupError:
        # The declared charset is unknown to Python; fall back to UTF-8.
        return payload.decode("utf-8", errors="replace")


def process_email(message):
    """Extract the subject, text body and attachments from a parsed email.

    Args:
        message: A parsed message from the ``email`` package.

    Returns:
        A tuple ``(subject, body, attachments, has_attachments)`` where
        ``subject`` and ``body`` have been passed through ``clean_email_text``,
        ``attachments`` is a list of ``(filename, raw_bytes)`` tuples and
        ``has_attachments`` is True when at least one attachment was found.
        A missing attachment filename is reported as ``""``.
    """
    # Local import: the notebook's import cell only brings in decode_header.
    from email.header import make_header

    raw_subject = message["subject"]
    if raw_subject is None:
        # Emails without a Subject header are legal; treat as empty.
        subject = ""
    else:
        try:
            # make_header joins every encoded fragment, not just the first one.
            subject = str(make_header(decode_header(str(raw_subject))))
        except (LookupError, UnicodeError):
            subject = str(raw_subject)

    body_chunks = []
    attachments = []

    # walk() visits nested parts too (e.g. multipart/alternative inside
    # multipart/mixed) and yields the message itself when it is not multipart.
    for part in message.walk():
        if part.is_multipart():
            continue  # containers carry no content of their own

        content_disposition = str(part.get("Content-Disposition"))
        if "attachment" in content_disposition:
            filename = part.get_filename() or ""
            attachments.append((filename, part.get_payload(decode=True)))
            continue  # attachment content must not leak into the body text

        if part.get_content_maintype() == "text":
            body_chunks.append(_decode_text_part(part))

    has_attachments = bool(attachments)

    # Newline-separate parts so adjacent words are not glued together; the
    # cleaner collapses the newline into a single space.
    body = clean_email_text("\n".join(body_chunks))
    subject = clean_email_text(subject)
    return subject, body, attachments, has_attachments


In [29]:
def prepare_input(subject, body, has_attachment):
    """Build the single text string that is fed to the classifier.

    Args:
        subject: Email subject line.
        body: Email body text.
        has_attachment: Truthy when the email carries an attachment.

    Returns:
        A string containing a subject-prefix tag (``PO`` when the subject
        contains "purchase order", case-insensitively, otherwise ``GENERIC``),
        the subject, the body and a trailing attachment-signal tag.
    """
    if has_attachment:
        signal = 'PRESENT'
    else:
        signal = 'ABSENT'

    prefix = 'PO' if 'purchase order' in subject.lower() else 'GENERIC'

    return (
        f"Subject Prefix: {prefix} Subject: {subject} Body: {body}"
        f" [ATTACHMENT_SIGNAL: {signal}]"
    )

def tokenize_input(text, tokenizer):
    """Call ``tokenizer`` on ``text`` with the fixed encoding options used at inference.

    Args:
        text: The string to encode.
        tokenizer: Callable accepting the text plus ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors`` keyword arguments.

    Returns:
        Whatever ``tokenizer`` returns.
    """
    encode_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

Preparing Data and Model Fine-tuning

In [30]:
from transformers import BertForSequenceClassification
import torch
from transformers import BertTokenizer

In [31]:
# Load the "bert-base-uncased" checkpoint with a 2-label sequence-classification
# head, plus the tokenizer for the same checkpoint. The classification head is
# newly initialised (see the warning printed below), so this `model` needs
# fine-tuning before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
def load_data_from_csv(csv_file, non_po_attachment_frac=0.2, random_state=42):
    """Load labelled email texts from a CSV and synthesise attachment flags.

    The CSV must provide ``text`` and ``label`` columns. Every row labelled
    ``'PO'`` is flagged as having an attachment; a reproducible random sample
    of the ``'Non-PO'`` rows is flagged as well.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.
        non_po_attachment_frac: Fraction of ``'Non-PO'`` rows to flag as
            having an attachment. Defaults to 0.2.
        random_state: Seed for the ``'Non-PO'`` sample. Defaults to 42.

    Returns:
        A tuple ``(texts, labels, flags)`` of equal-length lists.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    df = pd.read_csv(csv_file)

    # Drop fully empty rows, then rows that lack either field: a missing text
    # or label cannot be tokenised or mapped to a class id downstream.
    df = df.dropna(how='all')
    df = df.dropna(subset=['text', 'label'])

    # Every PO row is flagged; everything else starts unflagged.
    df['has_attachment'] = df['label'] == 'PO'

    # Flag a seeded sample of Non-PO rows too, so the attachment flag alone
    # does not perfectly predict the label.
    non_po_rows = df[df['label'] == 'Non-PO']
    sampled_index = non_po_rows.sample(
        frac=non_po_attachment_frac, random_state=random_state
    ).index
    df.loc[sampled_index, 'has_attachment'] = True

    texts = df['text'].tolist()
    labels = df['label'].tolist()
    flags = df['has_attachment'].tolist()

    return texts, labels, flags


In [33]:
from torch.utils.data import Dataset
import torch

class EmailDataset(Dataset):
    """Map-style dataset that tokenises one email text per item.

    Each item is a dict holding the tokenizer's outputs (with the leading
    batch dimension squeezed away) plus ``labels`` (long tensor) and
    ``flags`` (bool tensor).
    """

    def __init__(self, texts, labels, flags, tokenizer, max_length=512):
        """Store the parallel sequences and the tokenisation settings."""
        self.texts, self.labels, self.flags = texts, labels, flags
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise example ``idx`` and return it with its label and flag."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch axis of size 1; drop it so
        # a DataLoader can stack items into a batch itself.
        sample = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        sample['flags'] = torch.tensor(self.flags[idx], dtype=torch.bool)
        return sample


In [34]:
# Raw strings keep the Windows backslashes literal. Without the r-prefix,
# sequences such as "\P" or "\T" are invalid escapes that trigger a warning on
# recent Python versions (and would silently corrupt the path if a folder name
# ever started with a letter like "n" or "t").
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory.
train_csv = r"E:\Projects\GenAI\AI-Powered Automated PO Tool\AI-Powered-Purchase-Order-Parser\Train Data.csv"
test_csv = r"E:\Projects\GenAI\AI-Powered Automated PO Tool\AI-Powered-Purchase-Order-Parser\Test Data.csv"

In [35]:
# Read both splits. Each call returns the texts, their string labels, and the
# attachment flags that load_data_from_csv derives from the labels.
train_texts, train_labels, train_flags = load_data_from_csv(train_csv)
test_texts, test_labels, test_flags = load_data_from_csv(test_csv)

In [36]:
# Convert string labels to integer class ids (1 = PO, 0 = Non-PO).
# NOTE: this cell overwrites train_labels/test_labels in place, so it is not
# idempotent: running it a second time looks up integers in label_mapping and
# raises KeyError. Re-run the data-loading cell first if you need to repeat it.
label_mapping = {"PO": 1, "Non-PO": 0}  # Example mapping
train_labels = [label_mapping[label] for label in train_labels]
test_labels = [label_mapping[label] for label in test_labels]



In [37]:
# Wrap each split in an EmailDataset; max_length is left at the class default (512).
train_dataset = EmailDataset(train_texts, train_labels, train_flags, tokenizer)
test_dataset = EmailDataset(test_texts, test_labels, test_flags, tokenizer)

In [38]:
from transformers import Trainer, TrainingArguments
# Fine-tuning configuration consumed by the (currently commented-out) Trainer cell below.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before re-running.
training_args = TrainingArguments(
    output_dir='./results',  # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",  # evaluate once per epoch
    logging_dir='./logs',
    save_total_limit=3,  # cap on the number of checkpoints kept
)



In [None]:
#trainer = Trainer(
#    model=model,
#    args=training_args,
#   train_dataset=train_dataset,
#    eval_dataset=test_dataset
#)

#trainer.train()

#model.save_pretrained('./po_email_model')

In [40]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the fine-tuned classifier from disk. The directory matches the one the
# commented-out training cell above saves to via model.save_pretrained, so it
# must already exist for this cell to succeed on a fresh kernel.
model_path = "./po_email_model" 
usable_model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [41]:
'''def classify_text(text):
    # Tokenize the input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Get input ids and attention mask
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Predict using the model
    with torch.no_grad():
        outputs = usable_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()  # Get the predicted class

    return prediction

# Get custom input text
input_text = input("Enter the text to classify: ")

# Classify the input text
predicted_class = classify_text(input_text)
print(f"Predicted Class: {predicted_class}")
'''

'def classify_text(text):\n    # Tokenize the input text\n    encoding = tokenizer.encode_plus(\n        text,\n        add_special_tokens=True,\n        max_length=128,\n        padding=\'max_length\',\n        truncation=True,\n        return_tensors=\'pt\'\n    )\n\n    # Get input ids and attention mask\n    input_ids = encoding[\'input_ids\']\n    attention_mask = encoding[\'attention_mask\']\n\n    # Predict using the model\n    with torch.no_grad():\n        outputs = usable_model(input_ids=input_ids, attention_mask=attention_mask)\n        logits = outputs.logits\n        prediction = torch.argmax(logits, dim=1).item()  # Get the predicted class\n\n    return prediction\n\n# Get custom input text\ninput_text = input("Enter the text to classify: ")\n\n# Classify the input text\npredicted_class = classify_text(input_text)\nprint(f"Predicted Class: {predicted_class}")\n'

Email Classifier

In [42]:
def classify_email(text, usable_model, tokenizer):
    """Classify a prepared email text as a PO or non-PO email.

    Args:
        text: The combined input string (subject, body and attachment tag).
        usable_model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer passed through to ``tokenize_input``.

    Returns:
        ``"PO Email"`` when the arg-max class id is 1, otherwise ``"Non-PO Email"``.
    """
    inputs = tokenize_input(text, tokenizer)

    # Inference only: disable autograd so no computation graph is built or
    # kept alive for every classified email.
    with torch.no_grad():
        logits = usable_model(**inputs).logits

    prediction = torch.argmax(logits, dim=1).item()
    return "PO Email" if prediction == 1 else "Non-PO Email"

In [43]:
import pdfplumber
import os

def extract_text_from_pdf(pdf_content):
    """Extract the text of a PDF supplied as raw bytes.

    Args:
        pdf_content: The PDF file contents as ``bytes``.

    Returns:
        The result of ``_format_as_table`` when ``_is_table_like`` judges the
        extracted lines to be tabular, otherwise the text as a newline-joined
        string. Returns ``None`` (after printing the error) if anything goes
        wrong while reading the PDF.
    """
    # Local import so this cell does not depend on a later cell importing io.
    import io

    try:
        # Parse from memory rather than a fixed "temp.pdf" in the working
        # directory: no collision with concurrent runs, no risk of deleting a
        # real file of that name, nothing to clean up.
        with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        # Separate pages with a newline so the last line of one page is not
        # fused with the first line of the next.
        lines = "\n".join(page_texts).split('\n')

        if _is_table_like(lines):
            return _format_as_table(lines)
        return '\n'.join(lines)

    except Exception as e:
        # Deliberate best-effort: report and carry on with other attachments.
        print(f"Error processing PDF: {e}")
        return None


def _is_table_like(lines):
    
    return all('\t' in line for line in lines)

def _format_as_table(lines):

    return [line.split('\t') for line in lines]


In [44]:
import easyocr
from PIL import Image
import io
import numpy as np

def _group_ocr_lines(detections, line_tolerance=20):
    """Group OCR detections into text lines ordered top-to-bottom, left-to-right.

    Each detection is indexed as ``detection[0][0]`` (top-left ``(x, y)`` corner
    of its bounding box) and ``detection[1]`` (recognised text). A detection
    joins the current line while its top edge is within ``line_tolerance``
    pixels of the first detection of that line.
    """
    if not detections:
        return []

    ordered = sorted(detections, key=lambda det: (det[0][0][1], det[0][0][0]))

    lines = []
    current_line = []
    current_y = ordered[0][0][0][1]

    def _flush():
        # Emit the words of the current line in left-to-right order.
        words = sorted(current_line, key=lambda entry: entry[0])
        lines.append(' '.join(word for _, word in words))

    for detection in ordered:
        left, top = detection[0][0][0], detection[0][0][1]
        if abs(top - current_y) >= line_tolerance:
            _flush()
            current_line = []
            current_y = top
        current_line.append((left, detection[1]))

    if current_line:
        _flush()
    return lines


def extract_text_from_image(image_content, reader=None):
    """Run OCR on an image supplied as raw bytes and return its text.

    Args:
        image_content: Encoded image file contents (e.g. PNG/JPEG) as ``bytes``.
        reader: Optional pre-built ``easyocr.Reader`` to reuse. When omitted,
            an English reader is created for this call; building a reader
            loads the OCR models, so callers processing many images should
            pass one in.

    Returns:
        The result of ``_format_as_table`` when ``_is_table_like`` judges the
        recognised lines to be tabular, otherwise the lines joined with
        newlines. Returns ``""`` when no text is detected.
    """
    if reader is None:
        reader = easyocr.Reader(['en'])  # 'en' = English

    # Hand the encoded bytes straight to EasyOCR: its documented inputs are a
    # file path/URL, bytes or a numpy array, so an arbitrary PIL image object
    # (for example one opened from a PNG) is not a reliable input.
    result = reader.readtext(image_content)

    # Nothing recognised: avoid indexing into an empty result list.
    if not result:
        return ""

    lines = _group_ocr_lines(result)

    # Detect if the extracted text resembles a table
    if _is_table_like(lines):
        return _format_as_table(lines)
    return '\n'.join(lines)

def _is_table_like(lines):
    # Heuristics to detect if the text looks like a table
    delimiter_count = sum(1 for line in lines if '|' in line or '\t' in line)
    return delimiter_count > len(lines) * 0.5

def _format_as_table(lines):
    # Try to create a more structured table representation
    table_lines = []
    for line in lines:
        # Split by multiple spaces or tabs
        parts = [p.strip() for p in line.replace('|', '\t').split('\t') if p.strip()]
        table_lines.append('\t'.join(parts))
    
    return '\n'.join(table_lines)


In [None]:
def email_classification_pipeline(username, password, imap_server, usable_model, tokenizer):
    """Fetch unread emails, classify each one, and extract text from PO attachments.

    For every message yielded by ``fetch_emails`` the subject, body and
    attachments are extracted, the combined text is classified, and, for
    emails classified as ``'PO Email'``, PDF and image attachments are passed
    to the matching text extractor. Results are printed, not returned.

    Args:
        username: Mailbox account name.
        password: Mailbox password or app password.
        imap_server: IMAP host forwarded to ``connect_to_email``.
        usable_model: Fine-tuned classification model.
        tokenizer: Tokenizer matching ``usable_model``.
    """
    client = connect_to_email(username, password, imap_server)

    try:
        for message in fetch_emails(client):
            subject, body, email_attachments, attachment_flag = process_email(message)

            combined_text = prepare_input(subject, body, attachment_flag)
            result = classify_email(combined_text, usable_model, tokenizer)

            print(f"Email classified as: {result}")

            if result == 'PO Email':
                for filename, attachment in email_attachments:
                    # Attachments may arrive without a filename; treat that as
                    # "unknown type" instead of crashing on None.lower().
                    lowered_name = (filename or "").lower()

                    if lowered_name.endswith(".pdf"):
                        print(f"Processing PDF attachment: {filename}")
                        order_details = extract_text_from_pdf(attachment)
                        print(f"Extracted order details: {order_details}")

                    elif lowered_name.endswith((".jpg", ".jpeg", ".png")):
                        print(f"Processing image attachment: {filename}")
                        order_details = extract_text_from_image(attachment)
                        print(f"Extracted order details: {order_details}")
            else:
                print(f"Skipping attachment processing for non-PO email: {subject}")
    finally:
        # Always release the server connection, even if processing failed.
        try:
            client.logout()
        except Exception as exc:
            # Do not let a failed logout mask the original error.
            print(f"Warning: IMAP logout failed: {exc}")


In [None]:
# Run the end-to-end pipeline against the mailbox.
# NOTE(review): the username/password literals look like placeholders; supply
# real credentials via environment variables or getpass rather than committing
# them to the notebook.
email_classification_pipeline("you_email_id@gmail.com", "app_password", "imap.gmail.com", usable_model, tokenizer)

Email classified as: Non-PO Email
Processing PDF attachment: pdf PO.pdf
Extracted order details: PURCHASE ORDER
[Company Name]
[Street Address] DATE 9/29/2015
[City, ST ZIP] PO # [123456]
Phone: (000) 000-0000
Fax: (000) 000-0000
Website:
VENDOR SHIP TO
[Company Name] [Name]
[Contact or Department] [Company Name]
[Street Address] [Street Address]
[City, ST ZIP] [City, ST ZIP]
Phone: (000) 000-0000 [Phone]
Fax: (000) 000-0000
REQUISITIONER SHIP VIA F.O.B. SHIPPING TERMS
ITEM # DESCRIPTION QTY UNIT PRICE TOTAL
[23423423] Product XYZ 15 150.00 2,250.00
[45645645] Product ABC 1 75.00 75.00
SUBTOTAL 2,325.00
Comments or Special Instructions TAX
Thank you for your business. SHIPPING
OTHER
TOTAL $2,325.00
If you have any questions about this purchase order,please contact
https://www.vertex42.com/ExcelTemplates/excel-purchase-order.html Purchase Order Template Â© 2015 Vertex42.com[Name, Phone #, E-mail]
https://www.vertex42.com/ExcelTemplates/excel-purchase-order.html Purchase Order Template Â