In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
!pip install wandb

In [None]:
import pandas as pd
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import wandb
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login


from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Restrict CUDA-aware libraries in this process to GPUs 0 and 1.
# NOTE(review): presumably this must run before anything initialises CUDA — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1")

In [None]:
# Hub identifier of the base checkpoint.  `model` holds the same string for now
# and is rebound to the loaded network when the model is created.
MODEL_NAME = model = "NousResearch/Llama-2-7b-chat-hf"


In [None]:
# 4-bit NF4 weights with double quantisation; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer for the same checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantised base model (placement is delegated to device_map="auto")
# and pass it straight through the k-bit training preparation step.
# NOTE(review): trust_remote_code=True permits code shipped with the checkpoint
# repository to run — confirm it is actually required for this model.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
)


In [None]:
import re
def get_num_layers(model):
    """Return the largest integer found in any parameter name of ``model``.

    Every run of digits in every name yielded by ``model.named_parameters()``
    is considered.  Raises ``ValueError`` when no name contains a digit.
    """
    return max(
        int(digits)
        for param_name, _ in model.named_parameters()
        for digits in re.findall(r'\d+', param_name)
    )


def get_last_layer_linears(model):
    """Collect the names of ``torch.nn.Linear`` modules tagged with the highest layer number.

    A module is selected when the decimal string of ``get_num_layers(model)``
    occurs anywhere in its name, the name does not contain "encoder", and the
    module is an instance of ``torch.nn.Linear``.  Names are returned in the
    order produced by ``model.named_modules()``.
    """
    layer_tag = str(get_num_layers(model))
    return [
        module_name
        for module_name, module in model.named_modules()
        if layer_tag in module_name
        and "encoder" not in module_name
        and isinstance(module, torch.nn.Linear)
    ]


In [None]:
# LoRA adapters are attached only to the linear modules reported for the last layer.
last_layer_targets = get_last_layer_linears(model)

config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=last_layer_targets,
    r=2,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared base model with the adapters described by `config`.
model = get_peft_model(model, config)


In [None]:
# Load the input table and trim surrounding whitespace from every column label.
df = pd.read_csv("/content/tester2.csv").rename(columns=lambda label: str(label).strip())





In [None]:

import re

# Entity unit map provided in the input: entity name -> set of accepted unit spellings.
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}


def extract_entity_value(ocr_text, entity_name):
    """Return the first "<number> <unit>" found in the OCR text for an entity.

    Args:
        ocr_text: OCR output, either a single string or an iterable of text
            fragments (fragments are joined with single spaces).
        entity_name: Key into ``entity_unit_map`` selecting the accepted units.

    Returns:
        ``"<number> <unit>"`` for the first match, or ``""`` when the entity
        is unknown, the text is missing, or nothing matches.
    """
    allowed_units = entity_unit_map.get(entity_name)
    if not allowed_units:
        # An empty alternation would match any bare number, so stop here.
        return ""

    if isinstance(ocr_text, str):
        # Joining a str would insert a space between every character and
        # make it impossible for any unit to match.
        ocr_text_str = ocr_text
    else:
        try:
            ocr_text_str = ' '.join(str(fragment) for fragment in ocr_text)
        except TypeError:
            # Not iterable, e.g. None or the float NaN pandas uses for empty cells.
            return ""

    # Escape each unit and order them longest-first so the pattern is stable
    # from run to run (set iteration order is not).
    unit_alternation = "|".join(
        re.escape(unit) for unit in sorted(allowed_units, key=lambda u: (-len(u), u))
    )
    pattern = r"(\d+\.?\d*)\s?(" + unit_alternation + r")\b"

    match = re.search(pattern, ocr_text_str)
    if match is None:
        return ""
    return f"{match.group(1)} {match.group(2)}"




def _ocr_to_text(value):
    """Return OCR content as a single string; missing values become ''."""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return ' '.join(str(fragment) for fragment in value)
    return ''


# Create a new column for cleaned OCR text.  Calling str.join on a plain string
# would insert a space between every character, so strings are kept unchanged.
df['clean_ocr_text'] = df['ocr_text'].apply(_ocr_to_text)

# Apply the extraction function to get the entity value.  The text is wrapped in
# a one-element list because extract_entity_value accepts a sequence of fragments.
df['entity_value_extracted'] = df.apply(
    lambda row: extract_entity_value([row['clean_ocr_text']], row['entity_name']),
    axis=1,
)

# Generate the output prompt format
df['question'] = df.apply(lambda row: f"For {row['group_id']} please find the {row['entity_name']} in this text {row['clean_ocr_text'].strip()}.", axis=1)

# Show the final DataFrame


In [None]:
# First generated question, kept as a sample prompt.
prompt = df["question"].iloc[0]

In [None]:
# Reuse the model's own generation settings object and adjust it in place.
generation_config = model.generation_config
generation_config.max_new_tokens = 30
# temperature and top_p only take effect when sampling is enabled.  Set
# do_sample explicitly instead of relying on whatever the checkpoint's
# generation config happens to default to.
generation_config.do_sample = True
generation_config.temperature = 0.3
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# The tokenizer's EOS id is used both to stop generation and to pad.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id


In [None]:
def generate_answer(prompt):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Uses the notebook-level ``model``, ``tokenizer`` and ``generation_config``.

    Args:
        prompt: Text fed to the model.

    Returns:
        The decoded continuation, with special tokens removed and surrounding
        whitespace stripped.
    """
    # Place the inputs on the device reported by the model rather than assuming "cuda".
    encoding = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )
    # For a causal LM the returned sequence is the prompt tokens followed by the
    # continuation.  Dropping the prompt by token count is reliable; splitting the
    # decoded string on the prompt is not, because decoding the encoded prompt
    # does not always reproduce it exactly (and an empty prompt cannot be split on).
    prompt_length = encoding.input_ids.shape[-1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()




In [None]:
# Build one prompt per input row.  Cells without OCR text are not strings (pandas
# reads empty CSV cells as NaN), so they contribute an empty text instead of
# raising on .strip().
questions = [
    f"For group_id {group_id}, find {entity_name} in {ocr_text.strip() if isinstance(ocr_text, str) else ''}. Give the entity name in this format :  the entity value is = <entity_value>"
    for group_id, entity_name, ocr_text in zip(df['group_id'], df['entity_name'], df['ocr_text'])
]

In [None]:
# Query the model once per prompt, in order, and collect the answers.
testing_answers = []
for question in questions:
    testing_answers.append(generate_answer(question.strip()))




In [None]:
import re
import pandas as pd

# Model answers to parse, one per generated prompt.
text_data = testing_answers

# Regular expressions to capture group_id, entity_value, and corresponding value.
# NOTE(review): the prompt does not ask the model to echo "group_id = <n>", so
# this pattern may rarely match — consider carrying group_id over from the input rows.
group_id_pattern = r"group_id\s*=\s*(\d+)"
# Accepts "entity_value = X" as well as the wording the prompt asks for,
# "the entity value is = X" (case-insensitive).
entity_value_pattern = r"(?i)entity[ _]value\s*(?:is\s*)?=\s*([^\n]+)"
# Covers every entity family in the unit map (weight/volume match as substrings
# of item_weight/item_volume).
value_pattern = r"(width|depth|height|weight|voltage|wattage|volume)\s*is\s*([\d\.]+[a-zA-Z]+)"


def _first_group(pattern, text, group=1, default="N/A"):
    """Return the requested capture group of the first match, or ``default``."""
    match = re.search(pattern, text)
    return match.group(group).strip() if match else default


# One entry per answer; fields that cannot be found are recorded as "N/A".
group_ids = [_first_group(group_id_pattern, text) for text in text_data]
entity_values = [_first_group(entity_value_pattern, text) for text in text_data]
values = [_first_group(value_pattern, text, group=2) for text in text_data]

# Create a pandas DataFrame.  NOTE: this rebinds `df`, replacing the input table.
df = pd.DataFrame({
    'group_id': group_ids,
    'entity_value': entity_values,
    'value': values
})

# Save the DataFrame to a CSV file.
# NOTE(review): "N/A" is one of pandas' default missing-value markers, so it is
# read back as NaN by a plain read_csv — confirm that is intended.
output_csv_path = 'extracted_entity_values.csv'
df.to_csv(output_csv_path, index=False)

print(f"Data successfully saved to {output_csv_path}")


In [None]:
# Reload the file written by the export cell through the same path variable, so
# the read does not depend on the working directory being /content.
df = pd.read_csv(output_csv_path)