In [1]:
import os
import email
from email import policy
from pathlib import Path


def extract_email_details(mail_path: Path, attachments_folder: Path) -> dict:
    """Parse an ``.eml`` file and save its attachments to disk.

    Args:
        mail_path: Path of the ``.eml`` file to read.
        attachments_folder: Directory that receives the attachment files. It
            is created (including parents) the first time an attachment is
            actually written.

    Returns:
        A dict with the keys ``"Sender"``, ``"Reciver"`` (spelling kept so
        existing callers keep working), ``"Subject"``, ``"Date"``, ``"Body"``
        and ``"Attachments"``. ``"Body"`` is ``None`` when the message has no
        plain-text or HTML body part. ``"Attachments"`` lists the file names
        as they were written inside ``attachments_folder``.
    """
    # Read the .eml file
    with open(mail_path, "rb") as mail_file:
        msg = email.message_from_binary_file(mail_file, policy=policy.default)

    # get_body() returns None when no part matches the preference list, so
    # guard before calling get_content() on it.
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    body = body_part.get_content() if body_part is not None else None

    attachments = []

    # Extract attachments and save them
    for part in msg.iter_attachments():
        filename = part.get_filename()
        if not filename:
            continue

        # The file name comes from the message headers and is untrusted: keep
        # only its last path component so it cannot escape attachments_folder
        # (handles both "/" and "\\" separators).
        safe_name = os.path.basename(filename.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            continue

        # Multipart parts (e.g. an attached message) have no single decodable
        # payload; get_payload(decode=True) returns None for them.
        payload = part.get_payload(decode=True)
        if payload is None:
            continue

        attachments_folder.mkdir(parents=True, exist_ok=True)
        with open(attachments_folder / safe_name, "wb") as out_file:
            out_file.write(payload)
        attachments.append(safe_name)

    return {
        "Sender": msg.get('From'),
        "Reciver": msg.get('To'),
        "Subject": msg.get('Subject'),
        "Date": msg.get('Date'),
        "Body": body,
        "Attachments": attachments
    }

In [22]:
# Example usage
from config.settings import DATA_DIR
mails_dir = DATA_DIR / 'mails'

# os.listdir() returns entries in arbitrary order, so sort the listing to make
# the index below select the same mail on every run.
# valid indices 1, 2, 5 - currently working on.
# NOTE(review): those indices were recorded before the listing was sorted;
# confirm they still point at the intended mails.
mail_path = mails_dir / sorted(os.listdir(mails_dir))[1]
# .stem drops only the final extension, so names containing dots stay intact
mail_name = mail_path.stem

attachments_folder = Path.cwd() / 'attachments' / mail_name

# Ensure the folder and its parents exist ('attachments' may not exist yet)
attachments_folder.mkdir(parents=True, exist_ok=True)

email_details = extract_email_details(mail_path, attachments_folder)
print(email_details['Body'])

M&amp;S Product Quality Notification - Product Escalation

ID      7735
Notification Type       Product Escalation
Date / Time     14/08/2024 13:59
Source  FSL
Submitted By    R. Pittala
Location        RDC: Crewe
Category        Fruit - Stone Fruit
Product 00082723: YELLOW NECTARINES X4
Focus Product   No
Supplier        4793: DIRECT PRODUCE SUPPLIES
Traceability    Variety: Honey Glo
Grower: N/A
COO: Spain
Label Date Code: 2024-08-19
Product Temp: 2.8
Storage Temp: 4

Delivery Size (cases x units)   69 x 10
Cases Sampled   20
Units OOS (OOS / Sampled)       32/40
Units Removed / Destructively Tested    0
Cases Rejected  69/69
Units Rejected  690/690
Rejection Level Full
Details During the inspection of 40 packs of PR yellow nectarines (82723) from a total of 20 cases (100 packs), several issues were discovered. - 20 fruits exhibited internal breakdown and were soft. - 10 packs contained internal rots, glassy, and mushy texture. - 2 packs displayed external storage mold. The major def

In [3]:
def get_prompt(text):
    """Build the extraction prompt for one notification.

    The fixed instruction block (target JSON layout, per-field metadata and
    output rules) is followed directly by ``text``.

    Args:
        text: The notification text to append after the instructions.

    Returns:
        The instruction block concatenated with ``text``.
    """
    instructions = """
    You are a **product quality notification data extractor**. Your task is to extract structured rejection details from the provided text and return **only the JSON output** in the following format:

    {
        "ID": "integer",
        "NotificationType": "string",
        "DateTime": "YYYY-MM-DD HH:MM:SS",
        "Location": "string",
        "Category": "string",
        "Product": "string",
        "FocusProduct": "string",
        "Supplier": "string",
        "CountryOfOrigin": "string",
        "Variety": "string",
        "Grower": "string",
        "PackagingCodes": "string",
        "DateCode": "YYYY-MM-DD",
        "DeliverySize": "string",
        "CasesSampled": "integer",
        "CasesRejected": "integer",
        "UnitsRejected": "integer",
        "RejectionLevel": "string",
        "Details": "string",
        "ActionTaken": "string",
        "ContactsInformed": "string"
    }

    **Metadata for Fields:**
    - **ID** *(integer)* – A unique identifier for each quality notification.
    - **NotificationType** *(string)* – The type of notification (e.g., Product Escalation, High / Rejection, Warning).
    - **DateTime** *(string - YYYY-MM-DD HH:MM:SS)* – The timestamp when the notification was created.
    - **Location** *(string)* – The facility or depot where the issue was identified.
    - **Category** *(string)* – The product category (e.g., Fruit - Stone Fruit, Produce - Citrus).
    - **Product** *(string)* – The product name or SKU associated with the issue.
    - **FocusProduct** *(string)* – Whether the product is a focus product (Yes/No).
    - **Supplier** *(string)* – Supplier information, including supplier ID if available.
    - **CountryOfOrigin** *(string)* – The country where the product was grown or manufactured.
    - **Variety** *(string)* – The specific variety of the product (e.g., Gala Apples, Valencia Oranges).
    - **Grower** *(string)* – The grower or farm responsible for the product.
    - **PackagingCodes** *(string)* – Packaging-related codes used for traceability.
    - **DateCode** *(string - YYYY-MM-DD)* – The date code printed on the product.
    - **DeliverySize** *(string)* – The number of cases/trays received and their respective units (e.g., 69 x 10).
    - **CasesSampled** *(integer)* – The number of cases that were sampled during inspection.
    - **CasesRejected** *(integer)* – The number of cases that were fully rejected.
    - **UnitsRejected** *(integer)* – The total number of rejected units.
    - **RejectionLevel** *(string)* – The level of rejection (e.g., Full, Partial, None).
    - **Details** *(string)* – A description of the issue found during the inspection.
    - **ActionTaken** *(string)* – The action taken as a result of the rejection (e.g., product rejected on site).
    - **ContactsInformed** *(string)* – A list of people or emails who were informed about the rejection.

    **Strict Output Rules:**
    - **Only return the data in JSON format.**
    - **Do not include any explanation, additional text, or non-JSON output.**
    - Ensure all dates follow the format `YYYY-MM-DD HH:MM:SS`.
    - If any value is missing, set it to `null`. **Do not infer missing details or hallucinate.**
    - **Strictly follow the data types and format specified.**
    - If there are any links, such as images or external references, exclude them from the output.

    ### Rejection Data Text:
    """

    # The notification text goes straight after the heading line above.
    return instructions + text

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the model with 4-bit quantization
def load_model(model_name="Qwen/Qwen2.5-14B-Instruct-1M", device_map="cuda"):
    """Load a causal language model in 4-bit together with its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the model and the tokenizer.
        device_map: Placement passed through to
            ``AutoModelForCausalLM.from_pretrained``. The default ``"cuda"``
            keeps the previous hard-coded behaviour; pass ``"auto"`` to let
            the library decide the placement instead.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization
        bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,  # "cuda" by default; not an automatic mapping
        quantization_config=bnb_config
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    return model, tokenizer

# Load the default checkpoint (4-bit quantized) once for the notebook; the
# resulting `model` and `tokenizer` are used by the generation cell below.
model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [23]:
# Build the chat: a fixed system message plus the extraction prompt that
# embeds the e-mail body.
prompt = get_prompt(email_details['Body'])
messages = [
    {"role": "system", "content": "You are email checker."},
    {"role": "user", "content": prompt}
]

# Render the chat as a single string and end it with the assistant header so
# the model starts writing its answer.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=2000
)

# generate() returns prompt + completion; slice the prompt tokens off each row
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# skip_special_tokens=True keeps chat-template control tokens (such as the
# end-of-turn marker) out of the decoded text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [24]:
print(response)

```json
{
    "ID": 7735,
    "NotificationType": "Product Escalation",
    "DateTime": "2024-08-14 13:59:00",
    "Location": "RDC: Crewe",
    "Category": "Fruit - Stone Fruit",
    "Product": "00082723: YELLOW NECTARINES X4",
    "FocusProduct": "No",
    "Supplier": "4793: DIRECT PRODUCE SUPPLIES",
    "CountryOfOrigin": "Spain",
    "Variety": "Honey Glo",
    "Grower": "N/A",
    "PackagingCodes": null,
    "DateCode": "2024-08-19",
    "DeliverySize": "69 x 10",
    "CasesSampled": 20,
    "CasesRejected": 69,
    "UnitsRejected": 690,
    "RejectionLevel": "Full",
    "Details": "During the inspection of 40 packs of PR yellow nectarines (82723) from a total of 20 cases (100 packs), several issues were discovered. - 20 fruits exhibited internal breakdown and were soft. - 10 packs contained internal rots, glassy, and mushy texture. - 2 packs displayed external storage mold. The major defect percentage is calculated at 9.7%. Combined defect percentage is at 30%. The specification 

In [25]:
import re, json

# Reset first: if parsing fails, later cells must not silently reuse a `data`
# value left over from an earlier run of this cell.
data = None

# Extract JSON using regex: greedy span from the first "{" to the last "}",
# which also strips a surrounding ```json fence.
match = re.search(r"\{.*\}", response, re.DOTALL)
if match:
    try:
        # raw_decode() parses the first complete JSON value and ignores
        # anything after it, so extra text containing "}" after the object
        # does not break parsing. [0] is the decoded value.
        data = json.JSONDecoder().raw_decode(match.group(0))[0]
        print("Parsed JSON:")
        print(data)
    except json.JSONDecodeError as e:
        print("JSON decoding error:", e)
else:
    print("No JSON found in the response")

Parsed JSON:
{'ID': 7735, 'NotificationType': 'Product Escalation', 'DateTime': '2024-08-14 13:59:00', 'Location': 'RDC: Crewe', 'Category': 'Fruit - Stone Fruit', 'Product': '00082723: YELLOW NECTARINES X4', 'FocusProduct': 'No', 'Supplier': '4793: DIRECT PRODUCE SUPPLIES', 'CountryOfOrigin': 'Spain', 'Variety': 'Honey Glo', 'Grower': 'N/A', 'PackagingCodes': None, 'DateCode': '2024-08-19', 'DeliverySize': '69 x 10', 'CasesSampled': 20, 'CasesRejected': 69, 'UnitsRejected': 690, 'RejectionLevel': 'Full', 'Details': 'During the inspection of 40 packs of PR yellow nectarines (82723) from a total of 20 cases (100 packs), several issues were discovered. - 20 fruits exhibited internal breakdown and were soft. - 10 packs contained internal rots, glassy, and mushy texture. - 2 packs displayed external storage mold. The major defect percentage is calculated at 9.7%. Combined defect percentage is at 30%. The specification tolerance for major defects is 2%. Total cases received: 69.', 'ActionTa

In [26]:
from json2html import json2html
from IPython.display import display, HTML

# Convert the parsed notification (`data`, set by the JSON-parsing cell) into
# an HTML table string.
html_table = json2html.convert(json=data)

# Render the table in the notebook output instead of printing raw markup
display(HTML(html_table))

0,1
ID,7735
NotificationType,Product Escalation
DateTime,2024-08-14 13:59:00
Location,RDC: Crewe
Category,Fruit - Stone Fruit
Product,00082723: YELLOW NECTARINES X4
FocusProduct,No
Supplier,4793: DIRECT PRODUCE SUPPLIES
CountryOfOrigin,Spain
Variety,Honey Glo


In [27]:
# save json
output_mail_dir = DATA_DIR / 'output_mail_dir'
# parents=True so the cell also works when DATA_DIR itself is not there yet
output_mail_dir.mkdir(parents=True, exist_ok=True)

import json

# Nothing to save if the parsing step did not produce a result
if data is None:
    raise ValueError("No parsed JSON available to save; re-run the parsing cell.")

# Define full file path (one JSON file per mail, named after the mail)
json_file_path = output_mail_dir / f"{mail_name}.json"

# Save JSON file
with open(json_file_path, "w", encoding="utf-8") as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is (the file is UTF-8)
    # instead of \uXXXX escapes, keeping the saved file readable.
    json.dump(data, json_file, indent=4, ensure_ascii=False)

print(f"JSON saved at: {json_file_path}")

JSON saved at: C:\Users\ai.users\imscanCS\data\output_mail_dir\FSL - M&S Quality Notification - Product Escalation - RDC_ Crewe.json
