In [1]:
import os
import email
from email import policy
from pathlib import Path


def extract_email_details(mail_path: Path, attachments_folder: Path) -> dict:
    """Parse an ``.eml`` file and save its attachments to disk.

    Args:
        mail_path: Path of the ``.eml`` file to read.
        attachments_folder: Directory that receives the attachment files. It
            is created (including parents) the first time an attachment is
            actually written.

    Returns:
        A dict with the keys ``"Sender"``, ``"Reciver"`` (spelling kept so
        existing callers keep working), ``"Subject"``, ``"Date"``, ``"Body"``
        and ``"Attachments"``. ``"Body"`` is an empty string when the message
        has no plain-text or HTML body. ``"Attachments"`` lists the names of
        the files written inside ``attachments_folder``.
    """
    # Read the .eml file
    with open(mail_path, "rb") as mail_file:
        msg = email.message_from_binary_file(mail_file, policy=policy.default)

    # get_body() returns None when the mail has no text/plain or text/html
    # part; fall back to an empty body instead of raising AttributeError.
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    body = body_part.get_content() if body_part is not None else ""

    attachments_folder = Path(attachments_folder)
    attachments = []

    # Extract attachments and save them
    for part in msg.iter_attachments():
        filename = part.get_filename()
        if not filename:
            continue

        # The file name is sender-controlled: keep only its last component so
        # values such as "../../x" or "C:\\x" cannot escape the target folder.
        safe_name = Path(filename.replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            continue

        # Multipart attachments (e.g. a forwarded message) have no single
        # byte payload; get_payload(decode=True) returns None for them.
        payload = part.get_payload(decode=True)
        if payload is None:
            continue

        attachments_folder.mkdir(parents=True, exist_ok=True)
        (attachments_folder / safe_name).write_bytes(payload)
        attachments.append(safe_name)

    return {
        "Sender": msg.get('From'),
        "Reciver": msg.get('To'),
        "Subject": msg.get('Subject'),
        "Date": msg.get('Date'),
        "Body": body,
        "Attachments": attachments
    }

In [66]:
# Example usage
from config.settings import DATA_DIR
mails_dir = DATA_DIR / 'mails'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# index below pick the same mail on every run and on every machine.
mail_files = sorted(os.listdir(mails_dir))

# valid indices 1, 2, 5 - currently working on.
# NOTE(review): those indices were recorded against the unsorted listing;
# re-check them against the sorted order.
mail_path = mails_dir / mail_files[5]
mail_name = mail_path.name.split('.')[0]

attachments_folder = Path.cwd() / 'attachments' / mail_name

# Create the folder together with any missing parents (the 'attachments'
# directory may not exist yet)
attachments_folder.mkdir(parents=True, exist_ok=True)

email_details = extract_email_details(mail_path, attachments_folder)
print(email_details['Body'])

        Some people who received this message don't often get email from quality@food-surveys.com. Learn why this is important<https://aka.ms/LearnAboutSenderIdentification>


Dear ETHICAL FOOD COMPANY LTD

On behalf of Ocado Retail Limited, please find below the inbound product quality report for a recent delivery into Erith CFC (Kent, DA8 1DE)


Please use this report and the flagged issues as a continuous improvement tool to ensure products are being delivered to the QAS.


ID       111
Date / Time      27/08/2024 06:57
Inspector        R. Howard
Location         Erith CFC (Kent, DA8 1DE)
Category         TOMATOES
Product SKU      629903011: Ocado Organic Tomatoes on the Vine
Focus Product    No
Supplier         ETHICAL FOOD COMPANY LTD
Traceability     Date Code: 2024-09-03

Delivery Size (trays x units)    10 x 16
Trays Sampled    3
Units OOS (OOS / Sampled)        1/48
Unit Compliance Percent  97.9%
Units Removed / Destructively Tested     1
Details  1x Rot- Progressive defect no

In [54]:
def get_prompt(text):
    """Build the extraction prompt for one quality-notification text.

    Args:
        text: The notification text (e.g. a mail body) to append after the
            instructions. Must be a ``str``.

    Returns:
        The instruction template followed by ``text``.
    """
    # The date rule names each field explicitly: the schema asks for a
    # timestamp in `DateTime` but a plain date in `DateCode`, so a single
    # "all dates" rule would contradict it.
    prompt = """
    You are a **product quality notification data extractor**. Your task is to extract structured rejection details from the provided text and return **only the JSON output** in the following format:

    {
        "ID": "integer",
        "NotificationType": "string",
        "DateTime": "YYYY-MM-DD HH:MM:SS",
        "Location": "string",
        "Category": "string",
        "Product": "string",
        "FocusProduct": "string",
        "Supplier": "string",
        "CountryOfOrigin": "string",
        "Variety": "string",
        "Grower": "string",
        "PackagingCodes": "string",
        "Traceability": "string",
        "DateCode": "YYYY-MM-DD",
        "DeliverySize": "string",
        "CasesSampled": "integer",
        "CasesRejected": "integer",
        "UnitsRejected": "integer",
        "RejectionLevel": "string",
        "Details": "string",
        "ActionTaken": "string",
        "ContactsInformed": "string"
    }

    **Metadata for Fields:**
    - **ID** *(integer)* – A unique identifier for each quality notification.
    - **NotificationType** *(string)* – The type of notification (e.g., Product Escalation, High / Rejection, Warning).
    - **DateTime** *(string - YYYY-MM-DD HH:MM:SS)* – The timestamp when the notification was created.
    - **Location** *(string)* – The facility or depot where the issue was identified.
    - **Category** *(string)* – The product category (e.g., Fruit - Stone Fruit, Produce - Citrus).
    - **Product** *(string)* – The product name or SKU associated with the issue.
    - **FocusProduct** *(string)* – Whether the product is a focus product (Yes/No).
    - **Supplier** *(string)* – Supplier information, including supplier ID if available.
    - **CountryOfOrigin** *(string)* – The country where the product was grown or manufactured.
    - **Variety** *(string)* – The specific variety of the product (e.g., Gala Apples, Valencia Oranges).
    - **Grower** *(string)* – The grower or farm responsible for the product.
    - **PackagingCodes** *(string)* – Packaging-related codes used for traceability.
    - **Traceability** *(string)* – Additional traceability details, such as batch numbers or internal tracking codes.
    - **DateCode** *(string - YYYY-MM-DD)* – The date code printed on the product.
    - **DeliverySize** *(string)* – The number of cases/trays received and their respective units (e.g., 69 x 10).
    - **CasesSampled** *(integer)* – The number of cases that were sampled during inspection.
    - **CasesRejected** *(integer)* – The number of cases that were fully rejected.
    - **UnitsRejected** *(integer)* – The total number of rejected units.
    - **RejectionLevel** *(string)* – The level of rejection (e.g., Full, Partial, None).
    - **Details** *(string)* – A description of the issue found during the inspection.
    - **ActionTaken** *(string)* – The action taken as a result of the rejection (e.g., product rejected on site).
    - **ContactsInformed** *(string)* – A list of people or emails who were informed about the rejection.

    **Strict Output Rules:**
    - **Only return the data in JSON format.**
    - **Do not include any explanation, additional text, or non-JSON output.**
    - Format `DateTime` as `YYYY-MM-DD HH:MM:SS` and `DateCode` as `YYYY-MM-DD`.
    - If any value is missing, set it to `null`. **Do not infer missing details or hallucinate.**
    - **Strictly follow the data types and format specified.**
    - If there are any links, such as images or external references, exclude them from the output.

    ### Rejection Data Text:
    """ + text

    return prompt

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Build the 4-bit quantized causal LM together with its tokenizer
def load_model(model_name="Qwen/Qwen2.5-14B-Instruct-1M"):
    """Load ``model_name`` with 4-bit quantization, plus its tokenizer.

    Args:
        model_name: Model identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # 4-bit weights, bfloat16 as the compute dtype
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # device_map is fixed to "cuda" here (not "auto")
    load_options = {
        "device_map": "cuda",
        "quantization_config": quantization,
    }
    causal_lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    fast_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    return causal_lm, fast_tokenizer

# Load the default model and its tokenizer with 4-bit quantization; the
# generation cell below reuses both names.
model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [67]:
# Build the chat-formatted input for the extracted mail body
prompt = get_prompt(email_details['Body'])
messages = [
    {"role": "system", "content": "You are email checker."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=2000
)
# Each generated sequence starts with the prompt tokens; keep only the newly
# generated tail.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# skip_special_tokens=True drops chat-control tokens such as <|im_end|>, which
# otherwise end up in the decoded response text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [68]:
# Show the raw decoded model output before it is parsed
print(response)

```json
{
    "ID": 111,
    "DateTime": "2024-08-27 06:57:00",
    "Location": "Erith CFC (Kent, DA8 1DE)",
    "Category": "TOMATOES",
    "Product": "629903011: Ocado Organic Tomatoes on the Vine",
    "FocusProduct": "No",
    "Supplier": "ETHICAL FOOD COMPANY LTD",
    "CountryOfOrigin": null,
    "Variety": null,
    "Grower": null,
    "PackagingCodes": null,
    "Traceability": "Date Code: 2024-09-03",
    "DateCode": "2024-09-03",
    "DeliverySize": "10 x 16",
    "CasesSampled": 3,
    "CasesRejected": null,
    "UnitsRejected": null,
    "RejectionLevel": "None",
    "Details": "1x Rot- Progressive defect not acceptable to customers. No QAS available.",
    "ActionTaken": "N/A",
    "ContactsInformed": null
}
```<|im_end|>


In [69]:
import json
import re

# Extract JSON using regex: take everything from the first "{" to the last
# "}" so that Markdown code fences or other text around the object are ignored.
match = re.search(r"\{.*\}", response, re.DOTALL)
if match is None:
    # Fail loudly instead of only printing: otherwise `data` stays undefined
    # (or keeps the value of an earlier run) and the following cells would
    # render and save the wrong content for this mail.
    raise ValueError("No JSON found in the response")

json_str = match.group(0).strip()
try:
    data = json.loads(json_str)
except json.JSONDecodeError as e:
    raise ValueError(f"JSON decoding error: {e}") from e

print("Parsed JSON:")
print(data)

Parsed JSON:


In [70]:
from json2html import json2html
from IPython.display import display, HTML

# Convert the parsed JSON (`data`) to an HTML table
html_table = json2html.convert(json=data)

# Render the table as rich HTML output in the notebook
display(HTML(html_table))

0,1
ID,111
NotificationType,Warning
DateTime,2024-08-27 06:57:00
Location,"Erith CFC (Kent, DA8 1DE)"
Category,TOMATOES
Product,629903011: Ocado Organic Tomatoes on the Vine
FocusProduct,No
Supplier,ETHICAL FOOD COMPANY LTD
CountryOfOrigin,
Variety,


In [71]:
# Persist the parsed notification as a JSON file named after the mail
import json

# Output directory under DATA_DIR
output_mail_dir = DATA_DIR / 'output_mail_dir'
output_mail_dir.mkdir(exist_ok=True)

# Full path of the target file, kept as a plain string
json_file_path = str(output_mail_dir / f"{mail_name}.json")

# Write the data as indented JSON, encoded as UTF-8
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4)

print(f"JSON saved at: {json_file_path}")

