In [None]:
import os
import re
import json
import openpyxl
from openpyxl import Workbook
import random
import numpy as np
import pandas as pd
import torch
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from typing import List, Dict, Any

In [None]:
# Fix the seeds of Python's `random`, NumPy and PyTorch so repeated runs use the same random state.
seed = 66
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)  
if torch.cuda.is_available():
    # Seed the CUDA generators as well when a GPU is available.
    torch.cuda.manual_seed_all(seed)

# Set the TRANSFORMERS_OFFLINE environment flag to "1".
# NOTE(review): this is the Hugging Face offline-mode switch, not an access token;
# presumably it keeps transformers from contacting the hub — confirm against the docs.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Prompt templates and run configuration.
# `zero_shot`: system prompt that defines the extraction task and the JSON output schema
# (six keys, each with a "unit" and a "value" list; missing entries are null).
zero_shot = """You are a polymer materials scientist. Your task is to extract specific information from the supplied context based on the question. The extracted information must be returned as a JSON object, including units and values. The outputs should only include the JSON object without any explanation or note. If the supplied context contains the asked values and units, the value corresponding to the JSON key must be replaced with the values and units that appear in the supplied text. If there is no value or unit, fill in the corresponding blank with null. The extracted information must be in the following format: {"melt temperature": {"unit": extracted unit, "value": [extracted value]}, "mold temperature": {"unit": extracted unit, "value": [extracted value]}, "injection speed": {"unit": extracted unit, "value": [extracted value]}, "injection pressure": {"unit": extracted unit, "value": [extracted value]}, "holding pressure": {"unit": extracted unit, "value": [extracted value]}, "holding time": {"unit": extracted unit, "value": [extracted value]}}"""
# `few_shots`: the same system prompt followed by three worked User/Assistant examples.
# Author's note: 1273 tokens (the tokenizer used for this count is not stated).
# NOTE(review): the format specification uses the key "injection speed" while all three
# examples ask for and answer with "injection rate" — confirm which key is intended.
# NOTE(review): example 2's context states an injection pressure of 120 MPa, yet its answer
# reports null for "injection pressure"; example 3's context gives a holding pressure range
# of 50 to 70 MPa, yet its answer lists only [50] — confirm whether these labels are deliberate.
# The strings are left untouched here because the wording is part of the model input.
few_shots = """You are a polymer materials scientist. Your task is to extract specific information from the supplied context based on the question. The extracted information must be returned as a JSON object, including units and values. The outputs should only include the JSON object without any explanation or note. If the supplied context contains the asked values and units, the value corresponding to the JSON key must be replaced with the values and units that appear in the supplied text. If there is no value or unit, fill in the corresponding blank with null. The extracted information must be in the following format: {"melt temperature": {"unit": extracted unit, "value": [extracted value]}, "mold temperature": {"unit": extracted unit, "value": [extracted value]}, "injection speed": {"unit": extracted unit, "value": [extracted value]}, "injection pressure": {"unit": extracted unit, "value": [extracted value]}, "holding pressure": {"unit": extracted unit, "value": [extracted value]}, "holding time": {"unit": extracted unit, "value": [extracted value]}}. Here are three examples:
User: question: In the following context, what are the values and units of melt temperature, mold temperature, injection rate, injection pressure, holding pressure, and holding time? Return as a JSON object. context: The Pellets was dried at 100\u2103 for 10 h by NER-S10. During injection molding, temperature of melt was set 230 \u2103, and mold temperature was from 40 to 60 \u2103. The holding temperature was 100 MPa and duration for 3-5 s.
Assistant: {"melt temperature": {"unit": "\u2103", "value": [230]}, "mold temperature": {"unit": "\u2103", "value": [40, 60]}, "injection rate": {"unit": null, "value": null}, "injection pressure": {"unit": null, "value": null}, "holding pressure": {"unit": "MPa", "value": [100]}, "holding time": {"unit": "s", "value": [3, 5]}}
User: question: In the following context, what are the values and units of melt temperature, mold temperature, injection rate, injection pressure, holding pressure, and holding time? Return as a JSON object. context: The holding pressure was controlled at 40 MPa for 5s based on the preliminary results. A machine-set injection pressure of 120 MPa was used for molding, and the injection rate was from 15 to 35 mm/s. Cooling time was 20, 22.5, and 25s. There were four key operating parameters that can affect the formation of polymeric parts.
Assistant: {"melt temperature": {"unit": null, "value": null}, "mold temperature": {"unit": null, "value": null}, "injection rate": {"unit": "mm/s", "value": [15, 35]}, "injection pressure": {"unit": null, "value": null}, "holding pressure": {"unit": "MPa", "value": [40]}, "holding time": {"unit": "s", "value": [5]}}
User: question: In the following context, what are the values and units of melt temperature, mold temperature, injection rate, injection pressure, holding pressure, and holding time? Return as a JSON object. context: The materials is characterized by a melt flow index of 3g/10min (2.16kg, 230\u2103, ISO 1133), a weight-average molecular weight approx. 320000 (GPC) and an isotacticity index of 98% (ISO 9113). The morphology of both the \u03B1 and \u03B2-iPP specimens is insensitive to holding pressure changes from 50 to 70 MPa.
Assistant: {"melt temperature": {"unit": null, "value": null}, "mold temperature": {"unit": null, "value": null}, "injection rate": {"unit": null, "value": null}, "injection pressure": {"unit": null, "value": null}, "holding pressure": {"unit": "MPa", "value": [50]}, "holding time": {"unit": null, "value": null}}"""

# Question placed in the user turn in front of every context passage.
query = "In the following context, what are the values and units of melt temperature, mold temperature, injection speed, injection pressure, holding pressure, and holding time? Return as a JSON object."

# Path of the Excel workbook in which the model outputs are stored.
output = "YOUR/OUTPUT/REPO/outputs.xlsx"

In [None]:
def load_llama2(pretrained_model: str, peft_model: str = None):
    """Load a causal-LM tokenizer and its 4-bit quantized model from local files.

    Despite its name the function is not tied to Llama-2: the checkpoint is
    resolved through the ``Auto*`` classes, so any causal LM directory works.

    Args:
        pretrained_model: Path (or locally cached id) of the base checkpoint.
        peft_model: Optional path of a PEFT adapter to wrap around the base
            model. ``None`` returns the plain base model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Decoder-only LLMs are not trained to continue from pad tokens, so inputs
    # must be left-padded.
    # No `token` argument is passed: files are read locally only, and the value
    # previously forwarded here was the TRANSFORMERS_OFFLINE flag ("1"), which
    # is not an access token.
    llama2_tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=pretrained_model,
        padding_side="left",
        local_files_only=True,
    )
    # Reuse EOS as the padding token.
    llama2_tokenizer.pad_token = llama2_tokenizer.eos_token
    # NOTE(review): BOS insertion is disabled, presumably because the chat
    # template already emits it — confirm for the checkpoint in use.
    llama2_tokenizer.add_bos_token = False

    # 4-bit NF4 quantization with bfloat16 compute and no double quantization.
    model_bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )

    # Load the weights from local files only (no hub download).
    llama2_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=pretrained_model,
        quantization_config=model_bnb_config,
        local_files_only=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    # Optionally attach a PEFT adapter on top of the base model.
    if peft_model is not None:
        llama2_model = PeftModel.from_pretrained(llama2_model, peft_model)

    return llama2_tokenizer, llama2_model

In [None]:
# Location of the chat-model checkpoint handed to the loader.
MODEL_ID = "YOUR/MODEL/REPO/llama-3.1-8B-Instruct-hf"
tokenizer, model = load_llama2(pretrained_model=MODEL_ID)
# Switch the model to evaluation mode; as the last expression of the cell its
# return value is also displayed.
model.eval()

# (Skip when running inference) Quick single-example test of the loaded chat model (llama-3.1-8B-Instruct with the default MODEL_ID)

In [None]:
# Hand-written sample passage used to try the prompt on a single example.
context = "PA6 was processed by NSR-2S and cycle of whole process is total 25s, the pellets were dried at 80\u2103 for 6h. During injection molding, a barrel temperature from 180 to 210\u2103, mould temperature was steady at 60\u2103, but holding pressure was set from 70 to 100 MPa for 5s."

# Two-message conversation: the zero-shot system prompt, then the question
# together with the passage it should be answered from.
chat = [
    dict(role="system", content=f"{zero_shot}"),
    dict(role="user", content=f"question: {query}\ncontext: {context}"),
]

In [None]:
# Render the conversation with the tokenizer's chat template and tokenize it in
# one step. `add_generation_prompt=True` appends the assistant header so that
# generation starts a fresh assistant turn instead of continuing the user turn.
input_tokens = tokenizer.apply_chat_template(
    chat,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

In [None]:
# Deterministic (greedy) decoding. The previous combination of `do_sample=True`
# with `temperature=0.0` is rejected by transformers, which requires a strictly
# positive temperature when sampling. `temperature`/`top_p` are reset to None so
# sampling defaults from the checkpoint's generation config do not trigger
# "unused flag" warnings.
with torch.no_grad():
    outputs = model.generate(
        input_tokens,
        max_new_tokens=512,
        do_sample=False,
        temperature=None,
        top_p=None,
        pad_token_id=tokenizer.pad_token_id,
    )

In [None]:
# Render the conversation to text (including the assistant generation prompt),
# then tokenize that text without adding further special tokens.
formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)
# Move every input tensor to the device the model lives on.
inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}

# NOTE(review): `temperature` only has an effect when sampling is enabled; whether
# it is depends on the checkpoint's generation config — confirm.
outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.01)
# Drop the prompt tokens so that only the newly generated tokens are decoded.
decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
# complete_output = formatted_chat + decoded_output
print(decoded_output)

# Branch I: Loading pdf as inputs

In [None]:
def load_embed_model(model_path: str) -> Any:
    """Load the sentence-embedding tokenizer and encoder from local files.

    Args:
        model_path: Directory (or locally cached id) of the embedding model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # No `token` argument: loading is local-only, and the value previously
    # forwarded here was the TRANSFORMERS_OFFLINE flag ("1"), not a credential.
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModel.from_pretrained(model_path, local_files_only=True)
    return tokenizer, model


# A heading-like line such as "References", "REFERENCES" or "7. References".
_REFERENCES_HEADING_RE = re.compile(
    r'^[ \t]*(?:\d+\.?[ \t]*)?references[ \t]*\r?$',
    re.IGNORECASE | re.MULTILINE,
)
# Fallback for extractions that lost their line breaks: the whole word only.
_REFERENCES_WORD_RE = re.compile(r'\b(?:References|REFERENCES)\b')


def _strip_references(text):
    """Cut `text` at the start of its bibliography, if one can be located."""
    for regex in (_REFERENCES_HEADING_RE, _REFERENCES_WORD_RE):
        hits = list(regex.finditer(text))
        if hits:
            # Use the last hit: earlier ones are more likely in-text mentions.
            return text[:hits[-1].start()]
    return text


def pdf_parser(pdf_path):
    """Extract the body text of a PDF and normalise degree-Celsius artefacts.

    The text of all pages is concatenated, the reference list is removed, and
    the extraction artefacts ``8C`` and ``/C14C`` are replaced with the
    degree-Celsius sign (U+2103).

    The reference list is cut at the last heading-like "References" line, or,
    failing that, at the last whole word "References"/"REFERENCES". The former
    pattern cut at the first case-insensitive occurrence of the substring, so a
    word such as "preferences" in the body discarded everything after it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text as a single string.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Treat a page that yields no text (None) as empty.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    cleaned_text = _strip_references(text)
    # NOTE(review): "8C" is replaced wherever it occurs, so a value such as
    # "18C" is rewritten too — confirm this is acceptable for the corpus.
    replaced_text = re.sub(r'(8C|/C14C)', "\u2103", cleaned_text)

    return replaced_text

def check_illegal_chars(context: str) -> str:
    """Remove characters outside the XML 1.0 character range from `context`.

    Allowed characters are tab, line feed, carriage return, U+0020-U+D7FF,
    U+E000-U+FFFD and U+10000-U+10FFFF. When anything is removed, a highlighted
    copy of the input and the cleaned result are printed for inspection.

    Args:
        context: Text to check.

    Returns:
        `context` unchanged when it is clean, otherwise a copy without the
        illegal characters.
    """
    # Supplementary-plane bounds need the 8-digit \U escape: `\u` consumes
    # exactly four hex digits, so `\u10000-\u10FFFF` did not describe that
    # range and every character above U+FFFF was treated as illegal.
    pattern = re.compile(r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')
    illegal_chars_list = [(m.start(), m.group()) for m in pattern.finditer(context)]

    if not illegal_chars_list:
        return context

    # Build a copy in which each offending character is replaced by a marker.
    list_context = list(context)
    for pos, char in illegal_chars_list:
        list_context[pos] = f"[illegal character: {char}]"
    highlighted_context = ''.join(list_context)

    clean_context = pattern.sub("", context)
    print(f"Highlight illegal characters:\n{highlighted_context}")
    print(f"after treatment:\n{clean_context}")

    return clean_context

def text_splitter(extracted_text: str, size: int, overlap: int) -> List[str]:
    """Split text into overlapping chunks of sentences that contain a digit.

    The text is split into sentences, and a window of `size` sentences is
    moved forward by ``size - overlap`` sentences at a time. Chunks without
    any digit are dropped.

    Args:
        extracted_text: Text to split.
        size: Number of sentences per chunk (at least 1).
        overlap: Number of sentences shared by consecutive chunks; must be
            smaller than `size`.

    Returns:
        The list of stripped chunk strings, in document order.

    Raises:
        ValueError: If `size` is below 1 or `overlap` is not smaller than
            `size` (the window would never advance).
    """
    if size < 1:
        raise ValueError(f"size must be at least 1, got {size}")
    step = size - overlap
    if step < 1:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", extracted_text)
    total = len(sentences)
    chunks = []

    for start in range(0, total, step):
        end = min(start + size, total)
        chunk = " ".join(sentences[start:end])

        if re.search(r"\d", chunk):
            chunks.append(chunk.strip())

        # Once a window reaches the last sentence, any further window would
        # contain only sentences already covered by this one.
        if end == total:
            break

    return chunks


def relevance_rerank(chunks: List[str], query: str, tokenizer: Any, model: Any) -> Any:
    """Rank text chunks by the similarity of their embedding to the query's.

    Args:
        chunks: Candidate passages.
        query: Question the passages are scored against.
        tokenizer: Tokenizer of the embedding model.
        model: Embedding model.

    Returns:
        A DataFrame with columns ``similarity_score``, ``context`` and
        ``embedding``, sorted by descending score with a fresh 0..n-1 index.
    """
    query_vec = __embedding_input([query], tokenizer, model)
    chunk_vecs = __embedding_input(chunks, tokenizer, model)

    # Dot product of the single query row with every chunk row.
    scores = np.dot(query_vec, chunk_vecs.T)

    ranked = pd.DataFrame({
        "similarity_score": scores[0],
        "context": chunks,
        "embedding": chunk_vecs.tolist()
    })

    return (
        ranked
        .sort_values(by="similarity_score", ascending=False)
        .reset_index(drop=True)
    )


def __embedding_input(text: List[str], tokenizer, model) -> Any:
    """Embed a batch of strings as L2-normalised, mean-pooled vectors.

    Args:
        text: Strings to embed.
        tokenizer: Tokenizer of the embedding model.
        model: Embedding model, called with the tokenizer output.

    Returns:
        A tensor with one unit-length row per input string.
    """
    batch = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden = model(**batch)

    pooled = __mean_pooling(hidden, batch['attention_mask'])
    return torch.nn.functional.normalize(pooled, p=2, dim=1)


def __mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the token embeddings.
        attention_mask: Mask with non-zero entries at the positions to average.

    Returns:
        One pooled vector per sequence.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = (hidden_states * mask).sum(1)
    # Clamp the divisor so a fully masked row does not divide by zero.
    counts = mask.sum(1).clamp(min=1e-9)
    return summed / counts

In [None]:
# Load the sentence-embedding tokenizer and model used to rank PDF chunks.
embed_tokenizer, embed_model = load_embed_model(model_path="YOUR/MODEL/REPO/all_MiniLM_L6_v2")  # dim 384

In [None]:
# PDF file that serves as the input document for Branch I.
pdf_path = r"YOUR/PDF/REPO/2015-Elsevier.pdf"

In [None]:
# Branch I pipeline: extract the PDF text, cut it into 7-sentence chunks that
# overlap by one sentence, then rank the chunks against the query.
extracted_text = pdf_parser(pdf_path)
chunks = text_splitter(extracted_text, size=7, overlap=1)
db = relevance_rerank(
    chunks=chunks,
    query=query,
    tokenizer=embed_tokenizer,
    model=embed_model,
)

In [None]:
# Clean every ranked chunk with check_illegal_chars; the column is overwritten
# in place, so re-running this cell operates on the already cleaned text.
db['context'] = db['context'].apply(check_illegal_chars)

# Branch II: Loading dataset as inputs

In [None]:
# Excel dataset used as input for Branch II.
# NOTE(review): this rebinds `db`, replacing the frame built in Branch I when
# both branches are run; the inference loop reads its "context" column.
datasets_id = "YOUR/DATASET/REPO/ds_test.xlsx"
db = pd.read_excel(datasets_id)

# Inference

In [None]:
# Post-process outputs
def outputparser(text: str) -> Dict[str, Any]:
    """Split a rendered chat transcript into system, question, context and answer.

    Two chat layouts are recognised:

    * Llama-2 style: ``<<SYS>> ... <</SYS>> ... question: ... context: ... [/INST] answer``
    * Llama-3 style: ``<|start_header_id|>system<|end_header_id|> ... <|eot_id|>``
      followed by a ``user`` turn holding ``question: ... context: ...`` and an
      ``assistant`` turn holding the answer.

    The Llama-3 layout is needed because transcripts rendered by a Llama-3
    chat template contain no ``<<SYS>>``/``[/INST]`` markers; with the Llama-2
    pattern alone every field came back empty.

    Args:
        text: Prompt text followed by the generated answer.

    Returns:
        ``{"system": str, "user": {"question": str, "context": str},
        "assistant": str}``; all strings are empty when neither layout matches.
    """
    # Both patterns expose the same group layout:
    # 1 = system, 2 = filler, 3 = question, 4 = context, 5 = answer.
    llama2_layout = (
        r'<<SYS>>(.*?)<</SYS>>(.*?)question:(.*?)context:(.*?)\[/INST\](.*)'
    )
    llama3_layout = (
        r'<\|start_header_id\|>system<\|end_header_id\|>(.*?)<\|eot_id\|>\s*'
        r'<\|start_header_id\|>user<\|end_header_id\|>(.*?)question:(.*?)context:(.*?)<\|eot_id\|>\s*'
        r'<\|start_header_id\|>assistant<\|end_header_id\|>(.*)'
    )

    sys_info = question = context = answer = ""

    result = re.search(llama2_layout, text, flags=re.DOTALL)
    if result is None:
        result = re.search(llama3_layout, text, flags=re.DOTALL)
        if result is not None:
            # Drop an end-of-turn marker left at the end of the answer.
            answer = re.sub(r'<\|eot_id\|>\s*$', '', result.group(5)).strip()
    else:
        answer = result.group(5).strip()

    if result is not None:
        sys_info = result.group(1).strip()
        question = result.group(3).strip()
        context = result.group(4).strip()

    return {"system": sys_info, "user": {"question": question, "context": context}, "assistant": answer}


# Storage outputs into Excel
def storage(excel_path: str, response: Dict[str, Any]) -> None:
    """Append one parsed response as a row of an Excel workbook.

    The workbook is opened when it already exists and created otherwise. A
    header row is written first when the sheet is still empty. The parent
    directory is created when missing, so the first save does not fail after
    generation has already been paid for.

    Args:
        excel_path: Path of the ``.xlsx`` file to append to.
        response: Mapping with keys ``"system"``, ``"user"`` (itself holding
            ``"question"`` and ``"context"``) and ``"assistant"``.
    """
    # Make sure the target directory exists before anything is saved.
    parent_dir = os.path.dirname(excel_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Load the workbook if it exists, otherwise create a new one
    if os.path.exists(excel_path):
        workbook = openpyxl.load_workbook(excel_path)
        worksheet = workbook.active
    else:
        workbook = Workbook()
        worksheet = workbook.active
        worksheet.title = "Responses"

    # If the worksheet is empty, add headers
    sheet_is_empty = (
        worksheet.max_row == 1
        and worksheet.max_column == 1
        and worksheet["A1"].value is None
    )
    if sheet_is_empty:
        worksheet.append(["system", "question", "context", "assistant"])

    # Append the new row with response data
    worksheet.append([
        response["system"],
        response["user"]["question"],
        response["user"]["context"],
        response["assistant"],
    ])

    # Save the workbook
    workbook.save(excel_path)


# metric for evaluation
def compute_metrics(prediction: str, reference: str) -> float:
    """Score a predicted JSON answer against the reference JSON.

    The first ``{`` to the last ``}`` of `prediction` is parsed as JSON. A
    reference key counts as correct only when the prediction has that key and
    both its ``unit`` and ``value`` equal the reference's.

    Args:
        prediction: Raw model output, possibly with text around the JSON.
        reference: Reference answer as a JSON object string.

    Returns:
        Fraction of reference keys predicted correctly, in ``[0.0, 1.0]``.
        ``0.0`` is returned when the reference is unusable (invalid JSON, not
        an object, or empty) or when no JSON object can be parsed from the
        prediction.
    """
    try:
        ref_dict = json.loads(reference)
    except (json.JSONDecodeError, TypeError) as e:
        # Previously execution continued and failed on the unbound `ref_dict`.
        print("Reference JSONDecodeError:", e)
        return 0.0

    # Nothing to score against (also avoids dividing by zero below).
    if not isinstance(ref_dict, dict) or not ref_dict:
        return 0.0

    if not isinstance(prediction, str):
        return 0.0

    match = re.search(r"\{.*\}", prediction, re.DOTALL)
    if match is None:
        return 0.0

    try:
        pre_dict = json.loads(match.group(0))
    except json.JSONDecodeError as e:
        print("Prediction JSONDecodeError:", e)
        return 0.0

    correct = 0
    for key, ref_entry in ref_dict.items():
        if key not in pre_dict:
            continue

        pre_entry = pre_dict[key]
        if not isinstance(ref_entry, dict):
            # Reference entry without unit/value structure: compare as-is.
            if ref_entry == pre_entry:
                correct += 1
            continue

        try:
            pre_unit = pre_entry.get('unit')
            pre_value = pre_entry.get('value')
        except AttributeError as e:
            # The predicted entry is not an object, so it cannot match.
            print(f"{key} AttributeError:", e)
            continue

        if ref_entry.get('unit') == pre_unit and ref_entry.get('value') == pre_value:
            correct += 1

    return correct / len(ref_dict)

In [None]:
print("=======================Task start!=======================")
# `count` tracks processed contexts; `correct` and `evaluation` are only used by
# the commented-out evaluation code below.
count = 0
correct = 0
evaluation = list()

# Run the zero-shot prompt over the first 10 rows of the "context" column.
for i, context in enumerate(db["context"].head(10)):
    chat = [
        {"role": "system", "content": f"{zero_shot}"},
        {"role": "user", "content": f"question: {query}\ncontext: {context}"}
    ]

    # Render the chat to text (with the assistant generation prompt), tokenize it
    # without adding further special tokens, and move the tensors to the model's device.
    formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)
    inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
    
    # NOTE(review): `temperature` only matters when sampling is enabled, which
    # depends on the checkpoint's generation config — confirm.
    outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.01)
    # Decode only the tokens generated after the prompt.
    decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
    # Re-attach the prompt text so the parser can recover system/question/context.
    complete_output = formatted_chat + decoded_output
    # print(complete_output)
    structured_output = outputparser(complete_output)
    # Append the parsed response to the Excel workbook at `output`.
    storage(excel_path=output, response=structured_output)
    
    # Optional evaluation against the dataset's "output" column (disabled):
    # # Evaluation
    # score_i = compute_metrics(prediction=structured_output["assistant"], reference=db["output"][i])
    # evaluation.append(score_i)

    # if score_i == 1:
    #     correct += 1
    #     print(True)

    count += 1
    print(f"------------------task {count} complete!------------------")

# Accuracy summary for the optional evaluation (disabled):
# accuracy = correct/count
# print(f"Accuracy: {accuracy:.3f}")