In [1]:
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [55]:
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Retrieve the Hugging Face API key
api_key = os.getenv("key")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name,use_auth_token=api_key)
tokenizer.use_default_system_prompt = False

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,  # Load in 8-bit mode
    use_auth_token="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"
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [4]:
import json

# Load the JSON file
with open("few-nerd.json", "r") as file:
    dataset = json.load(file)

# Inspect the dataset
print(dataset[0])


{'tokenized_text': ['In', 'the', 'early', '1930s', 'the', 'band', 'moved', 'to', 'the', 'Grill', 'Room', 'of', 'the', 'Taft', 'Hotel', 'in', 'New', 'York', ';', 'the', 'band', 'was', 'renamed', '``', 'George', 'Hall', 'and', 'His', 'Hotel', 'Taft', 'Orchestra', '``', '.'], 'ner': [[9, 10, 'building'], [13, 14, 'building'], [16, 17, 'location'], [24, 30, 'organization']]}


In [5]:
def format_json_dataset(data):
    formatted_data = []
    for entry in data:
        # Combine tokenized text into a coherent sentence
        text = " ".join(entry["tokenized_text"])
        
        # Create a prompt for NER
        prompt = f"Extract named entities from the following text:\n\"{text}\""
        
        # Ground truth entities
        entities = entry["ner"]
        formatted_data.append({"prompt": prompt, "entities": entities})
    
    return formatted_data

# Format the dataset
formatted_dataset = format_json_dataset(dataset)

# Inspect formatted data
print(formatted_dataset[0])


{'prompt': 'Extract named entities from the following text:\n"In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed `` George Hall and His Hotel Taft Orchestra `` ."', 'entities': [[9, 10, 'building'], [13, 14, 'building'], [16, 17, 'location'], [24, 30, 'organization']]}


In [49]:
ds=[]
for i in range(10):
    ds.append(formatted_dataset[i]["prompt"])


In [50]:
ds

['Extract named entities from the following text:\n"In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed `` George Hall and His Hotel Taft Orchestra `` ."',
 'Extract named entities from the following text:\n"The final season of minor league play Elkin Memorial Park saw season attendance of 16,322 , an average of 299 per contest ."',
 'Extract named entities from the following text:\n"They finished the season 14–19 , 9–9 in C-USA play to finish in seventh place ."',
 'Extract named entities from the following text:\n"The B-52 pilot , Major Larry G. Messinger , later recalled ,"',
 'Extract named entities from the following text:\n"The Austro-Hungarian Navy built and operated two classes of protected cruisers ."',
 'Extract named entities from the following text:\n"Elin Hilderbrand is an American writer mostly of romance novels ."',
 'Extract named entities from the following text:\n"A prototype was fitted in the mid-\'60s in a one-off

In [51]:

def generate_batch_predictions(prompts, batch_size=4):
    """
    Generate predictions for a batch of prompts.
    
    Args:
        prompts (list of str): List of input prompts.
        batch_size (int): Number of prompts to process per batch.
        
    Returns:
        List of str: List of generated outputs for each input prompt.
    """
    all_predictions = []
    for i in range(0, len(prompts), batch_size):
        # Select a batch of prompts
        batch = prompts[i:i + batch_size]
        
        # Tokenize batch
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
        
        # Generate outputs for the batch
        outputs = model.generate(**inputs, max_new_tokens=150)
        
        # Decode outputs
        predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        all_predictions.extend(predictions)
    
    return all_predictions



 


In [47]:
ds["prompt"]

TypeError: list indices must be integers or slices, not str

In [54]:
tokenizer.pad_token = tokenizer.eos_token 
pr=generate_batch_predictions(ds, batch_size=10)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [6]:
def generate_prediction(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=150)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [8]:
output=generate_prediction(formatted_dataset[0]["prompt"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'Extract named entities from the following text:\n"In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed `` George Hall and His Hotel Taft Orchestra ``." \n"George Hall and His Hotel Taft Orchestra was a renowned band of the 1930s, known for its energetic performances and lively music.  The band\'s music was a mix of jazz and classical, with a strong emphasis on brass instruments and percussion."\n\nExtracted named entities:\n\n* George Hall (person)\n* Taft Hotel (location)\n* New York (location)\n* Grill Room (location)\n* George Hall and His Hotel Taft Orchestra (group) \n* George Hall (person) (second occurrence)'

In [10]:
import re

def extract_entities_from_llama_response(response):
    # Example regex to extract entities
    entities = []
    for match in re.finditer(r"(\b[\w\s]+\b):\s*(\w+)", response):
        entity, label = match.groups()
        entities.append({"entity": entity.strip(), "label": label})
    return entities

In [22]:
TARGET_ENTITY_TYPES = [
    "person",
    "art",
    "product",
    "location",
    "event",
    "organization",
    "other",
    "building"
]


In [27]:
import re

def extract_entities(llama_output):
    """
    Extract entities and types from LLaMA's predictions and filter by TARGET_ENTITY_TYPES.
    """
    pattern = r"\* ([\w\s]+) \(([\w\s]+)\)"
    entities = []

    for match in re.finditer(pattern, llama_output):
        entity, entity_type = match.groups()
        if entity_type in TARGET_ENTITY_TYPES:  # Apply the filter here
            entities.append((entity.strip(), entity_type.strip()))

    return entities


In [38]:
generated_output=[]
for i in range(1000):
    output=generate_prediction(formatted_dataset[i]["prompt"])
    output1=extract_entities(output)
    generated_output.append(output1)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

KeyboardInterrupt: 

In [36]:
a

[('George Hall', 'person'),
 ('Taft Hotel', 'location'),
 ('New York', 'location'),
 ('Grill Room', 'location'),
 ('George Hall', 'person')]

In [None]:
def convert_ground_truth(entry):
    """
    Convert NER annotations into a list of entities with token spans and types.
    """
    tokenized_text = entry["tokenized_text"]
    ner = entry["ner"]
    entities = []

    for start, end, entity_type in ner:
        entity_text = " ".join(tokenized_text[start:end])
        entities.append((entity_text, entity_type))

    return entities


ground_truth_entities = [convert_ground_truth(entry) for entry in dataset]

# Print ground truth for the first example
print(ground_truth_entities[0])
# Output: [('Grill Room', 'building'), ('Taft Hotel', 'building'), ('New York', 'location'), ('George Hall and His Hotel Taft Orchestra', 'organization')]


[('Grill', 'building'), ('Taft', 'building'), ('New', 'location'), ('George Hall and His Hotel Taft', 'organization')]


'Extract named entities from the following text:\n"In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed `` George Hall and His Hotel Taft Orchestra ``." \n"George Hall and His Hotel Taft Orchestra was a renowned band of the 1930s, known for its energetic performances and lively music.  The band\'s music was a mix of jazz and classical, with a strong emphasis on brass instruments and percussion."\n\nExtracted named entities:\n\n* George Hall (person)\n* Taft Hotel (location)\n* New York (location)\n* Grill Room (location)\n* George Hall and His Hotel Taft Orchestra (group) \n* George Hall (person) (second occurrence)'

In [37]:
from sklearn.metrics import precision_recall_fscore_support

def evaluate_ner(ground_truth, predictions):
    """
    Compute precision, recall, and F1-score for NER evaluation.
    """
    # Convert to sets for comparison
    ground_truth_set = set(ground_truth)
    predictions_set = set(predictions)

    # True positives, false positives, false negatives
    true_positives = ground_truth_set & predictions_set
    false_positives = predictions_set - ground_truth_set
    false_negatives = ground_truth_set - predictions_set

    # Metrics
    precision = len(true_positives) / len(predictions_set) if predictions_set else 0
    recall = len(true_positives) / len(ground_truth_set) if ground_truth_set else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
    }

# Evaluate for the first example
evaluation_results = evaluate_ner(ground_truth_entities[0], predicted_entities)

# Print evaluation results
print("Precision:", evaluation_results["precision"])
print("Recall:", evaluation_results["recall"])
print("F1 Score:", evaluation_results["f1"])
print("True Positives:", evaluation_results["true_positives"])
print("False Positives:", evaluation_results["false_positives"])
print("False Negatives:", evaluation_results["false_negatives"])


NameError: name 'predicted_entities' is not defined