In [1]:
import os
import email
from email import policy
from pathlib import Path


def extract_email_details(mail_path: Path, attachments_folder: Path) -> dict:
    """Parse an ``.eml`` file and save its attachments to disk.

    Args:
        mail_path: Path of the ``.eml`` file to read.
        attachments_folder: Directory that receives the attachment files. It is
            created (including missing parents) when an attachment is written.

    Returns:
        A dict with the keys ``"Sender"``, ``"Reciver"``, ``"Subject"`` and
        ``"Date"`` (header values, ``None`` when a header is absent),
        ``"Body"`` (plain-text body, falling back to HTML; ``""`` when the
        message has no body part) and ``"Attachments"`` (the file names as
        they were saved inside ``attachments_folder``).
    """
    # Read the .eml file
    with open(mail_path, "rb") as mail_file:
        msg = email.message_from_binary_file(mail_file, policy=policy.default)

    sender = msg.get('From')
    receiver = msg.get('To')
    subject = msg.get('Subject')
    date = msg.get('Date')

    # get_body() returns None when there is no text/plain or text/html part
    # (e.g. a mail that consists only of an attachment).
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    body = body_part.get_content() if body_part is not None else ""

    attachments_folder = Path(attachments_folder)
    attachments = []

    # Extract attachments and save them
    for part in msg.iter_attachments():
        filename = part.get_filename()
        if not filename:
            continue

        # The file name comes from the (untrusted) mail: keep only its last
        # path component so "../x" or "/etc/x" cannot escape the target folder.
        safe_name = Path(filename.replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            continue

        payload = part.get_payload(decode=True)
        if payload is None:
            # Multipart attachments (e.g. a forwarded message/rfc822) have no
            # single decodable payload; store the serialized sub-part(s)
            # instead of crashing on write(None).
            payload = b"".join(sub.as_bytes() for sub in part.get_payload())

        attachments_folder.mkdir(parents=True, exist_ok=True)
        with open(attachments_folder / safe_name, "wb") as attachment_file:
            attachment_file.write(payload)
        attachments.append(safe_name)

    # NOTE: the "Reciver" key keeps its original spelling because callers
    # look it up by that name.
    return {
        "Sender": sender,
        "Reciver": receiver,
        "Subject": subject,
        "Date": date,
        "Body": body,
        "Attachments": attachments
    }

In [39]:
# Example usage
from config.settings import DATA_DIR

# Position of the sample mail within the *sorted* directory listing.
MAIL_INDEX = 4

mails_dir = DATA_DIR / 'mails'
# os.listdir() returns entries in arbitrary order; sort so the same index
# selects the same mail on every run and every machine.
mail_path = mails_dir / sorted(os.listdir(mails_dir))[MAIL_INDEX]
mail_name = mail_path.name.split('.')[0]

attachments_folder = Path.cwd() / 'attachments' / mail_name

# Ensure the folder and its parents exist: without parents=True, mkdir()
# raises FileNotFoundError while ./attachments has not been created yet.
attachments_folder.mkdir(parents=True, exist_ok=True)

email_details = extract_email_details(mail_path, attachments_folder)
print(email_details['Body'])

<https://gbr01.safelinks.protection.outlook.com/?url=https%3A%2F%2Furl.uk.m.mimecastprotect.com%2Fs%2FHtj4CBP78t70q7Liz1XTh%3Fdomain%3Dlidl.co.uk&data=05%7C02%7CKegan.Stubbs%40fpp-ltd.com%7C3f5853b583c449db64c308dc920a5fb4%7C0a6ac99f713640dd9ec3ddedf46edbc9%7C0%7C0%7C638545820449983012%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=W13OnPiBNQpPxm1G3%2BAQ3XBQVSuqBjaOkhN7rqvCt7I%3D&reserved=0>
21 June 2024

Our Ref: 103927724
Dear Sir or Madam



Please be aware we have received a customer complaint regarding a product you supply - see the details below.

Please note, this is for your information only and no action is required at this time. However, it is still your responsibility to trend your own complaints.


If you believe this to be a wider issue and not an isolated incident, we request you immediately inform us by contacting the qualityassurance@lidl.co.uk<https://gbr01.safelinks.protection.outlook.com/?url=https%3A%2F%

In [13]:
def get_prompt(text):
    """Build the extraction prompt for one notification text.

    The fixed instruction block (target JSON schema plus output rules) is
    followed directly by ``text``.

    Args:
        text: Raw notification text to extract the structured data from.

    Returns:
        The instruction block with ``text`` appended.
    """
    instructions = """
    You are a **product quality notification data extractor**. Your task is to extract structured rejection details from the provided text and return **only the JSON output** in the following format:

    {
        "ID": "integer",
        "NotificationType": "string",
        "DateTime": "YYYY-MM-DD HH:MM:SS",
        "Location": "string",
        "Category": "string",
        "Product": "string",
        "FocusProduct": "string",
        "Supplier": "string",
        "Traceability": "string",
        "DateCode": "string",
        "DeliverySize": "string",
        "CasesSampled": "integer",
        "CasesRejected": "integer",
        "UnitsRejected": "integer",
        "RejectionLevel": "string",
        "Details": "string",
        "ActionTaken": "string",
        "ContactsInformed": "string"
    }

    **Strict Output Rules:**
    - **Only return the data in JSON format.**
    - **Do not include any explanation, additional text, or non-JSON output.**
    - Ensure all dates follow the format `YYYY-MM-DD HH:MM:SS`.
    - If any value is missing, set it to `null`. Do not infer missing details or hallucinate.
    - **Strictly follow the data types and format specified.**
    - If there are any links, such as images or external references, exclude them from the output.

    ### Rejection Data Text:
    """
    return instructions + text

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the model with 4-bit quantization
def load_model(model_name="Qwen/Qwen2.5-14B-Instruct-1M", device_map="cuda"):
    """Load a causal language model in 4-bit precision together with its tokenizer.

    Args:
        model_name: Hugging Face model id (or local path) passed to
            ``from_pretrained``.
        device_map: Forwarded to ``AutoModelForCausalLM.from_pretrained``.
            The default ``"cuda"`` places the whole model on the default CUDA
            device; pass ``"auto"`` to have the weights distributed
            automatically over the available devices.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization
        bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,  # "cuda" = single GPU; "auto" = automatic placement
        quantization_config=bnb_config
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    return model, tokenizer

# Load model and tokenizer with 4-bit quantization (default model name).
# `model` and `tokenizer` are notebook-level names used by the generation cell.
model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [31]:
# Wrap the extraction prompt in the model's chat format
prompt = get_prompt(email_details['Body'])
messages = [
    {"role": "system", "content": "You are email checker."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=2000
)
# Strip the prompt tokens from the front of each output sequence so that only
# the newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# skip_special_tokens=True keeps control tokens such as <|im_end|> out of the
# decoded text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [32]:
print(response)

```json
{
    "ID": 111,
    "DateTime": "2024-08-27 06:57:00",
    "Location": "Erith CFC (Kent, DA8 1DE)",
    "Category": "TOMATOES",
    "Product": "629903011: Ocado Organic Tomatoes on the Vine",
    "FocusProduct": null,
    "Supplier": "ETHICAL FOOD COMPANY LTD",
    "Traceability": "Date Code: 2024-09-03",
    "DateCode": "2024-09-03",
    "DeliverySize": "10 x 16",
    "CasesSampled": 3,
    "CasesRejected": null,
    "UnitsRejected": null,
    "RejectionLevel": "None",
    "Details": "1x Rot- Progressive defect not acceptable to customers. No QAS available.",
    "ActionTaken": null,
    "ContactsInformed": null
}
```<|im_end|>


In [33]:
import re, json
# Extract JSON using regex
# Reset first: if parsing fails, `data` must not keep a value from an earlier
# run, otherwise the following cells would display/save the wrong record.
data = None

# Greedy match from the first "{" to the last "}" (DOTALL spans newlines), which
# also drops a surrounding ```json fence.
match = re.search(r"\{.*\}", response, re.DOTALL)
if match:
    json_str = match.group(0).strip()
    try:
        data = json.loads(json_str)
        print("Parsed JSON:")
        print(data)
    except json.JSONDecodeError as e:
        print("JSON decoding error:", e)
else:
    print("No JSON found in the response")

Parsed JSON:


In [34]:
from json2html import json2html
from IPython.display import display, HTML

# Convert the parsed notification (`data`) to an HTML table with json2html
html_table = json2html.convert(json=data)

# Render the HTML in the notebook output instead of printing the raw markup
display(HTML(html_table))

0,1
ID,111
NotificationType,Warning
DateTime,2024-08-27 06:57:00
Location,"Erith CFC (Kent, DA8 1DE)"
Category,TOMATOES
Product,629903011: Ocado Organic Tomatoes on the Vine
FocusProduct,
Supplier,ETHICAL FOOD COMPANY LTD
Traceability,Date Code: 2024-09-03
DateCode,2024-09-03


In [35]:
import json

# Persist the parsed notification as "<mail name>.json" under DATA_DIR
output_mail_dir = DATA_DIR / 'output_mail_dir'
output_mail_dir.mkdir(exist_ok=True)

# Full target path, kept as a plain string
json_file_path = str(output_mail_dir / f"{mail_name}.json")

# Write the record as indented JSON
with open(json_file_path, mode="w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4)

print(f"JSON saved at: {json_file_path}")

