In [1]:
import os
import email
from email import policy
from pathlib import Path


def extract_email_details(mail_path: Path, attachments_folder: Path) -> dict:
    # Read the .eml file
    with open(mail_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)
    
    sender = msg.get('From')
    reciver = msg.get('To')
    subject = msg.get('Subject')
    date = msg.get('Date')
    body = msg.get_body(preferencelist=('plain', 'html')).get_content()
    attachments = []
    
    # Extract attachments and save them
    for part in msg.iter_attachments():
        filename = part.get_filename()
        if filename:
            attachments.append(filename)
            with open(attachments_folder / filename, "wb") as f:
                f.write(part.get_payload(decode=True))
    
    return {
        "Sender": sender,
        "Reciver": reciver,
        "Subject": subject,
        "Date": date,
        "Body": body,
        "Attachments": attachments
    }

In [2]:
# Example usage
from config.settings import DATA_DIR
mails_dir = DATA_DIR / 'mails'
mail_path = mails_dir / os.listdir(mails_dir)[1]
mail_name = mail_path.name.split('.')[0]

attachments_folder = Path.cwd() / 'attachments' / mail_name

# Ensure the parent exist
attachments_folder.mkdir(exist_ok=True)

email_details = extract_email_details(mail_path, attachments_folder)
print(email_details['Body'])

M&amp;S Product Quality Notification - Product Escalation

ID      7735
Notification Type       Product Escalation
Date / Time     14/08/2024 13:59
Source  FSL
Submitted By    R. Pittala
Location        RDC: Crewe
Category        Fruit - Stone Fruit
Product 00082723: YELLOW NECTARINES X4
Focus Product   No
Supplier        4793: DIRECT PRODUCE SUPPLIES
Traceability    Variety: Honey Glo
Grower: N/A
COO: Spain
Label Date Code: 2024-08-19
Product Temp: 2.8
Storage Temp: 4

Delivery Size (cases x units)   69 x 10
Cases Sampled   20
Units OOS (OOS / Sampled)       32/40
Units Removed / Destructively Tested    0
Cases Rejected  69/69
Units Rejected  690/690
Rejection Level Full
Details During the inspection of 40 packs of PR yellow nectarines (82723) from a total of 20 cases (100 packs), several issues were discovered. - 20 fruits exhibited internal breakdown and were soft. - 10 packs contained internal rots, glassy, and mushy texture. - 2 packs displayed external storage mold. The major def

In [7]:
def get_prompt(text):
    prompt = """
    You are a **quality rejection data extractor**. Your task is to extract structured rejection details from the provided text and return **only the JSON output** in the following format:

    {
        "UniqueID": "integer",
        "RejectionDate": "YYYY-MM-DD",
        "Week": "integer",
        "Customer": "string",
        "Product": "string",
        "Brand": "string",
        "QuantityRejectedCases": "integer",
        "CountryOfOrigin": "string",
        "Variety": "string",
        "Producer": "string",
        "UKPackhouse": "string",
        "Packsize": "string",
        "ProductCode": "string",
        "Reason": "string",
        "Comment": "string",
        "Year": "integer"
    }

    **Strict Output Rules:**
    - **Only return the data in JSON format.**
    - **Do not include any explanation, additional text, or non-JSON output.**
    - Ensure all dates follow the format `YYYY-MM-DD`.
    - If any value is missing, set it to `null`. Do not infer missing details or hallucinate.
    - **Strictly follow the data types and format specified.**
    - The output must be a valid JSON object without any additional markdown or text.

    ### Rejection Data Text:

    """ + text

    return prompt

In [4]:
import torch
import tensorflow as tf
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the model with 4-bit quantization
def load_model(model_name="Qwen/Qwen2.5-3B-Instruct"):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization
        bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cuda",  # Automatically map to available devices
        quantization_config=bnb_config
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    return model, tokenizer

# Example: Use 4-bit quantized model to process an invoice
# Load model and tokenizer with 4-bit quantization
model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(email_details['Body'][:1280])

M&amp;S Product Quality Notification - Product Escalation

ID      7735
Notification Type       Product Escalation
Date / Time     14/08/2024 13:59
Source  FSL
Submitted By    R. Pittala
Location        RDC: Crewe
Category        Fruit - Stone Fruit
Product 00082723: YELLOW NECTARINES X4
Focus Product   No
Supplier        4793: DIRECT PRODUCE SUPPLIES
Traceability    Variety: Honey Glo
Grower: N/A
COO: Spain
Label Date Code: 2024-08-19
Product Temp: 2.8
Storage Temp: 4

Delivery Size (cases x units)   69 x 10
Cases Sampled   20
Units OOS (OOS / Sampled)       32/40
Units Removed / Destructively Tested    0
Cases Rejected  69/69
Units Rejected  690/690
Rejection Level Full
Details During the inspection of 40 packs of PR yellow nectarines (82723) from a total of 20 cases (100 packs), several issues were discovered. - 20 fruits exhibited internal breakdown and were soft. - 10 packs contained internal rots, glassy, and mushy texture. - 2 packs displayed external storage mold. The major def

In [8]:
prompt = get_prompt(email_details['Body'][:1280])
messages = [
    {"role": "system", "content": "You are email checker."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=2000
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids)[0]

In [9]:
print(response)

 finaljson={
         "UniqueID":7735,
         "RejectionDate":"2024-08-14",
         "Week":77,
         "Customer":"M&S",
         "Product":"00082723:YELLOW NECTARINES X4",
         "Brand":null,
         "QuantityRejectedCases":69,
         "CountryOfOrigin":"Spain",
         "Variety":"Honey Glo",
         "Producer":null,
         "UKPackhouse":null,
         "Packsize":"x10",
         "ProductCode":"00082723:YELLOW NECTARINES X4",
         "Reason":"internal breakdown and were soft, internal rots, glassy, and mushy texture, external storage mold",
        alchemy",
         "Comment":"The product has been rejected on site.",
         "Year":2024
     }<|im_end|>


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = get_prompt(email_details['Body'][:1280])
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


RuntimeError: Failed to import transformers.models.qwen2.modeling_qwen2 because of the following error (look up to see its traceback):
module 'tensorflow.python.data.ops.dataset_ops' has no attribute 'DatasetV2'

In [None]:
print(response)

A large language model (LLM) is a type of artificial intelligence designed to understand and generate human-like text based on vast amounts of textual data it has been trained on. These models are trained using advanced machine learning techniques, such as deep learning, where they learn patterns and relationships within the training data to predict the next word or sequence of words in a text.

Key features of LLMs include:

1. **Training Data**: They are typically trained on massive datasets, often containing billions of words from various sources like books, articles, web pages, and more.

2. **Contextual Understanding**: LLMs can process and understand the context in which words are used, allowing them to provide relevant and coherent responses to questions or prompts.

3. **Adaptability**: They can adapt to different topics and domains, making them versatile tools for various applications, including but not limited to natural language processing tasks, writing assistance, customer