In [1]:
import re
from difflib import SequenceMatcher
from typing import List, Tuple
import os

def read_document_in_chunks(file_path: str, chunk_size: int = 4096) -> List[str]:
    """
    Reads a document in chunks and returns a list of chunks.

    Args:
        file_path (str): The path to the document file.
        chunk_size (int, optional): The size of each chunk. Defaults to 4096.

    Returns:
        List[str]: A list of strings, where each string is a chunk of the document.
                     Returns an empty list if the file cannot be read.
    """
    chunks = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    break
                chunks.append(chunk)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return []
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return []
    return chunks

def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculates the similarity between two texts using the SequenceMatcher.

    Args:
        text1 (str): The first text.
        text2 (str): The second text.

    Returns:
        float: The similarity score, a value between 0 and 1 (inclusive).
               Returns 0.0 if either input is empty.
    """
    if not text1 or not text2:
        return 0.0
    return SequenceMatcher(None, text1, text2).ratio()

def compare_documents_chunk_by_chunk(file_path1: str, file_path2: str) -> List[Tuple[Tuple[int, int], float]]:
    """
    Compares two documents chunk by chunk and returns a list of similarity scores
    for all possible chunk combinations.

    Args:
        file_path1 (str): The path to the first document.
        file_path2 (str): The path to the second document.

    Returns:
        List[Tuple[Tuple[int, int], float]]: A list of tuples. Each tuple contains:
            - A tuple of chunk indices (i, j), where i is the index of the chunk
              from file1 and j is the index of the chunk from file2.
            - The similarity score (float) between the two chunks.
            Returns an empty list if either file is empty or an error occurs.
    """
    chunks1 = read_document_in_chunks(file_path1)
    chunks2 = read_document_in_chunks(file_path2)

    if not chunks1 or not chunks2:
        print("One or both documents are empty.")
        return []

    results = []
    for i, chunk1 in enumerate(chunks1):
        for j, chunk2 in enumerate(chunks2):
            similarity = calculate_similarity(chunk1, chunk2)
            results.append(((i, j), similarity))
    return results

def main():
    """
    Main function to demonstrate the usage of the
    compare_documents_chunk_by_chunk function. Prompts the user for the file paths
    and displays the comparison results.
    """
    file_path1 = input("Enter the path to the first document: ")
    file_path2 = input("Enter the path to the second document: ")

    # Check if files exist before proceeding.
    if not os.path.exists(file_path1):
        print(f"Error: File not found at {file_path1}")
        return
    if not os.path.exists(file_path2):
        print(f"Error: File not found at {file_path2}")
        return

    print("\nComparing documents chunk by chunk...")
    results = compare_documents_chunk_by_chunk(file_path1, file_path2)

    if not results:
        print("No comparisons were made (either files are empty or there was an error).")
        return

    print("\nChunk Comparison Results:")
    for (i, j), similarity in results:
        print(f"Chunk {i + 1} (Doc1) vs. Chunk {j + 1} (Doc2): Similarity = {similarity:.4f}")

In [2]:
import re

def extract_sentences_with_numbers(file_path, chunk_size=4096):
    """
    Reads a long document in chunks, identifies sentences containing numbers,
    and yields them. Handles potential issues with sentence boundary
    detection within chunks.

    Args:
        file_path (str): The path to the document file.
        chunk_size (int, optional): The size of each chunk read from the file.
            Defaults to 4096 (4KB). Adjust based on your system's memory
            and the expected sentence length.

    Yields:
        str: Sentences from the document that contain at least one digit.
             Yields one sentence at a time.
    """
    # Regular expression to find any digit (0-9).
    number_regex = re.compile(r'\d')
    # Regular expression to find sentence boundaries. Improved to handle
    # more cases, including abbreviations, multiple punctuation, and
    # non-ASCII characters.
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')


    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    break # End of file

                # Split the chunk into sentences, handling potential issues
                # where a sentence may span across chunk boundaries.
                sentences = sentence_endings.split(chunk)
                # Process sentences, one by one
                for sentence in sentences:
                    if number_regex.search(sentence):
                        yield sentence.strip() # Remove leading/trailing spaces
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

def main():
    """
    Main function to demonstrate the usage of the
    extract_sentences_with_numbers function. Prompts the user for
    the file path and processes the file.
    """
    file_path = input("Enter the path to the document file: ")
    # Check if file exists *before* passing to the generator.
    import os
    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return

    print("\nSentences containing numbers:")
    for sentence in extract_sentences_with_numbers(file_path):
        print(sentence) # Print each sentence as it's yielded

if __name__ == "__main__":
    main()


Enter the path to the document file:  F:\DSA_Task\Natural_Language_Processing\Don Wisidagama\Language Technology Text Classification\Product_Classification_Paper.docx



Sentences containing numbers:
An error occurred while reading the file: 'utf-8' codec can't decode byte 0xd2 in position 14: invalid continuation byte


In [3]:
import re
from docx import Document # Import the Document class from python-docx

def extract_sentences_with_numbers(file_path):
    """
    Reads a Word document, identifies sentences containing numbers,
    and yields them.

    Args:
        file_path (str): The path to the .docx document file.

    Yields:
        str: Sentences from the document that contain at least one digit.
             Yields one sentence at a time.
    """
    # Regular expression to find any digit (0-9).
    number_regex = re.compile(r'\d')
    # Regular expression to find sentence boundaries.
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')

    try:
        # Load the Word document
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        
        # Join all paragraphs into a single string for sentence splitting
        document_content = "\n".join(full_text)

        # Split the entire document content into sentences
        sentences = sentence_endings.split(document_content)
        
        for sentence in sentences:
            if number_regex.search(sentence):
                yield sentence.strip() # Remove leading/trailing spaces

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    """
    Main function to demonstrate the usage of the
    extract_sentences_with_numbers function. Prompts the user for
    the file path and processes the file.
    """
    file_path = input("Enter the path to the Word document (.docx) file: ")
    
    import os
    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print("\nSentences containing numbers:")
    for sentence in extract_sentences_with_numbers(file_path):
        print(sentence)

if __name__ == "__main__":
    main()

Enter the path to the Word document (.docx) file:  F:\DSA_Task\Natural_Language_Processing\Don Wisidagama\Language Technology Text Classification\Product_Classification_Paper.docx



Sentences containing numbers:
The results have shown that logistic regression model achieved Precision of 96.5% for one category and 90% for three categories.
F1 score for IMDB dataset is 72.90%, 72.89% and 72.64% for 1, 3 and 5-grams model.
For Amazon data, F1-score is 82.13%, 81.80% and 82.15% for 1-, 3- and 5-gram models.
Here, n can be any positive integer and explained in detail as follows:
Unigram or 1-gram is feature with one word in the document.
Bigram or 2-grams is feature with two words in the document occurring together.
Trigram or 3-grams is feature with three words in the document occurring together.
The file has a header row followed by 45,895 rows containing an ID, a product title, text for the product description, and the product category.
There is one row for missing title and 1042    rows for missing descriptions The rows with missing data are removed for further analysis.
90% data included in the train data and remaining 10% in the test data.
Results and Analysis: 

In [4]:
import re
from docx import Document
# You would need a PDF library like 'PyPDF2' or 'pdfminer.six' for PDF conversion
# and then text extraction with coordinates, which is a separate complex task.
# from PyPDF2 import PdfReader # Example for PDF, not directly used here for docx page numbers

def extract_sentences_and_numbers(file_path):
    """
    Reads a Word document, identifies sentences containing numbers,
    and yields them along with the extracted numbers.

    Args:
        file_path (str): The path to the .docx document file.

    Yields:
        tuple: A tuple containing:
            - str: The sentence from the document that contains at least one digit.
            - list: A list of integers or floats found within that sentence.
    """
    # Regular expression to find any digit (0-9).
    number_regex = re.compile(r'\d')
    # Regular expression to find all numbers (integers and floats) in a sentence.
    # This will capture sequences of digits, optionally with a decimal point.
    all_numbers_regex = re.compile(r'\b\d+(?:[\.,]\d+)?\b') # Handles comma or period as decimal
    # Regular expression to find sentence boundaries.
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')

    try:
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        
        document_content = "\n".join(full_text)

        sentences = sentence_endings.split(document_content)
        
        for sentence in sentences:
            if number_regex.search(sentence):
                extracted_numbers = []
                for match in all_numbers_regex.finditer(sentence):
                    # Convert to float for potential decimal numbers, then back to int if whole
                    try:
                        num = float(match.group().replace(',', '.')) # Replace comma with period for conversion
                        if num == int(num):
                            extracted_numbers.append(int(num))
                        else:
                            extracted_numbers.append(num)
                    except ValueError:
                        # Should not happen with the regex, but good for robustness
                        pass
                yield sentence.strip(), extracted_numbers

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    """
    Main function to demonstrate the usage of the
    extract_sentences_and_numbers function. Prompts the user for
    the file path and processes the file.
    """
    file_path = input("Enter the path to the Word document (.docx) file: ")
    
    import os
    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print("\nProcessing document for sentences with numbers...")
    found_any = False
    for sentence, numbers in extract_sentences_and_numbers(file_path):
        found_any = True
        print(f"Sentence: {sentence}")
        print(f"  Extracted Numbers: {numbers}\n")

    if not found_any:
        print("No sentences containing numbers were found in the document.")

    print("\n--- Regarding Page Numbers ---")
    print("Directly extracting page numbers from a .docx file using 'python-docx' is not feasible.")
    print("This library works with the logical structure of the document, not its rendered layout.")
    print("To get page numbers, you would typically need to:")
    print("1. Convert the .docx to PDF, then use a PDF parsing library (e.g., PyPDF2, pdfminer.six) to extract text with positional information.")
    print("2. Use a Word automation tool (e.g., 'pywin32' on Windows) to interact with Microsoft Word itself, which can be complex and platform-specific.")
    print("This code only extracts sentences with numbers and the numbers themselves.")


if __name__ == "__main__":
    main()

Enter the path to the Word document (.docx) file:  F:\DSA_Task\Natural_Language_Processing\Don Wisidagama\Language Technology Text Classification\Product_Classification_Paper.docx



Processing document for sentences with numbers...
Sentence: The results have shown that logistic regression model achieved Precision of 96.5% for one category and 90% for three categories.
  Extracted Numbers: [96.5, 90]

Sentence: F1 score for IMDB dataset is 72.90%, 72.89% and 72.64% for 1, 3 and 5-grams model.
  Extracted Numbers: [72.9, 72.89, 72.64, 1, 3, 5]

Sentence: For Amazon data, F1-score is 82.13%, 81.80% and 82.15% for 1-, 3- and 5-gram models.
  Extracted Numbers: [82.13, 81.8, 82.15, 1, 3, 5]

Sentence: Here, n can be any positive integer and explained in detail as follows:
Unigram or 1-gram is feature with one word in the document.
  Extracted Numbers: [1]

Sentence: Bigram or 2-grams is feature with two words in the document occurring together.
  Extracted Numbers: [2]

Sentence: Trigram or 3-grams is feature with three words in the document occurring together.
  Extracted Numbers: [3]

Sentence: The file has a header row followed by 45,895 rows containing an ID, a pr

In [None]:
#pip install transformers

In [5]:
import re
from docx import Document
from transformers import pipeline

# Load a pre-trained NER pipeline.
# Note: This model is trained on general entities (PERSON, ORG, LOC, MISC),
# not specific metric types like 'Recall' or 'Precision'.
# We'll use this for demonstration, but for your specific need, you'd need a custom-trained model.
try:
    # Attempt to load a general NER model. You might need to install 'dslim/bert-base-NER'
    # if it's not cached, or choose another suitable general NER model.
    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
    #ner_pipeline = pipeline("ner", aggregation_strategy="simple")
except Exception as e:
    print(f"Could not load NER model. Please ensure 'dslim/bert-base-NER' is available or installed: {e}")
    print("Falling back to keyword-based number extraction without HuggingFace tagging.")
    ner_pipeline = None # Set to None if model loading fails

def extract_sentences_and_tag_numbers(file_path):
    """
    Reads a Word document, identifies sentences containing numbers,
    and attempts to tag these numbers based on surrounding keywords,
    using a (conceptual) NER approach for illustration.

    Args:
        file_path (str): The path to the .docx document file.

    Yields:
        tuple: A tuple containing:
            - str: The sentence from the document that contains at least one digit.
            - list: A list of tuples, where each tuple contains (number, inferred_tag).
    """
    number_regex = re.compile(r'\d')
    all_numbers_regex = re.compile(r'\b\d+(?:[\.,]\d+)?\b') # Handles comma or period as decimal
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')

    # Define keywords for tagging. This is a *simplistic heuristic* for demonstration.
    # A real solution requires a fine-tuned NER model.
    keyword_tags = {
        "recall": "Recall Metric",
        "precision": "Precision Metric",
        "accuracy": "Accuracy Metric",
        "f1-score": "F1-Score Metric",
        "temperature": "Temperature Reading",
        "cost": "Cost/Price",
        "price": "Cost/Price",
        "revenue": "Financial Value",
        "profit": "Financial Value",
        "loss": "Financial Value",
        "rate": "Rate/Percentage",
        "percentage": "Rate/Percentage",
        "count": "Count/Quantity",
        "quantity": "Count/Quantity",
        "total": "Total Value",
        "score": "Score",
        "value": "Generic Value" # Catch-all, less specific
    }

    try:
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        
        document_content = "\n".join(full_text)

        sentences = sentence_endings.split(document_content)
        
        for sentence in sentences:
            if number_regex.search(sentence):
                tagged_numbers = []
                numbers_in_sentence = []

                # Extract all numbers first
                for match in all_numbers_regex.finditer(sentence):
                    try:
                        num_str = match.group().replace(',', '.')
                        num = float(num_str)
                        if num == int(num):
                            numbers_in_sentence.append((int(num), match.span()))
                        else:
                            numbers_in_sentence.append((num, match.span()))
                    except ValueError:
                        pass # Skip if conversion fails

                # Attempt to tag each number
                for num_value, num_span in numbers_in_sentence:
                    inferred_tag = "Untagged Number" # Default tag

                    # Heuristic 1: Check nearby keywords (simple window)
                    start_index, end_index = num_span
                    # Check text before the number (e.g., 20 characters before)
                    context_before = sentence[max(0, start_index - 30):start_index].lower()
                    # Check text after the number (e.g., 20 characters after)
                    context_after = sentence[end_index:min(len(sentence), end_index + 30)].lower()

                    found_keyword = False
                    for keyword, tag in keyword_tags.items():
                        if keyword in context_before or keyword in context_after:
                            inferred_tag = tag
                            found_keyword = True
                            break
                    
                    if found_keyword:
                        tagged_numbers.append((num_value, inferred_tag))
                        continue # Move to the next number

                    # Heuristic 2 (Conceptual): Use general NER model if available
                    # This is for illustration. A general NER model won't output
                    # "Recall Metric". It might tag "Recall" as O and "0.85" as O or MISC.
                    # A *custom-trained NER model* would be needed here.
                    if ner_pipeline:
                        ner_results = ner_pipeline(sentence)
                        for entity in ner_results:
                            # Check if the number's span overlaps with a recognized entity span
                            entity_start = entity['start']
                            entity_end = entity['end']

                            # Check if the number is part of an entity or very close
                            # This logic is highly simplified and needs robust tuning for a real model
                            if (entity_start <= start_index < entity_end) or \
                               (entity_start <= end_index <= entity_end) or \
                               (start_index <= entity_start and end_index >= entity_end):
                                
                                # This is where a custom NER model would give specific tags
                                # For a general model, 'entity_group' might be 'MISC' or 'ORG' etc.
                                # We're still relying on heuristics here.
                                if entity['entity_group'] in ["MISC", "LOC", "ORG", "PER"]:
                                    # Fallback if a general entity type is found near the number
                                    inferred_tag = f"General NER Tag: {entity['entity_group']}"
                                    # If the entity is a specific keyword (like "Recall" itself),
                                    # then we could combine it with the number.
                                    # This is where a fine-tuned model truly shines.
                                    # For example, if "recall" was tagged as B-METRIC and 0.85 as I-METRIC
                                    # then we would combine them.
                                    break # Assume the first relevant NER tag is sufficient

                    tagged_numbers.append((num_value, inferred_tag))
                
                if tagged_numbers: # Only yield if we found and potentially tagged numbers
                    yield sentence.strip(), tagged_numbers

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    """
    Main function to demonstrate the usage of the
    extract_sentences_and_tag_numbers function.
    """
    file_path = input("Enter the path to the Word document (.docx) file: ")
    
    import os
    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print("\nProcessing document for sentences with numbers and tagging...")
    found_any = False
    for sentence, tagged_numbers in extract_sentences_and_tag_numbers(file_path):
        found_any = True
        print(f"Sentence: {sentence}")
        for num, tag in tagged_numbers:
            print(f"  Number: {num}, Tag: {tag}")
        print("-" * 50) # Separator for readability

    if not found_any:
        print("No sentences containing numbers were found in the document.")

    print("\n--- Important Note on Tagging Metrics with Hugging Face Transformers ---")
    print("The tagging in this code uses simple keyword matching and a general NER model (if loaded).")
    print("For robustly tagging numbers with specific semantic categories (e.g., 'Recall Metric', 'Precision Metric'),")
    print("you would need to:")
    print("1. **Create a custom dataset:** Manually annotate sentences where numbers are linked to your desired categories.")
    print("2. **Fine-tune a Hugging Face Transformer model:** Train a model (e.g., BERT for token classification) on this custom dataset.")
    print("   This is a significant machine learning project requiring labeled data and training resources.")
    print("The current keyword-based approach is a basic heuristic and will not be as accurate or comprehensive.")

if __name__ == "__main__":
    main()




Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Enter the path to the Word document (.docx) file:  F:\DSA_Task\Natural_Language_Processing\Don Wisidagama\Language Technology Text Classification\Product_Classification_Paper.docx



Processing document for sentences with numbers and tagging...
Sentence: The results have shown that logistic regression model achieved Precision of 96.5% for one category and 90% for three categories.
  Number: 96.5, Tag: Precision Metric
  Number: 90, Tag: Untagged Number
--------------------------------------------------
Sentence: F1 score for IMDB dataset is 72.90%, 72.89% and 72.64% for 1, 3 and 5-grams model.
  Number: 72.9, Tag: Score
  Number: 72.89, Tag: Untagged Number
  Number: 72.64, Tag: Untagged Number
  Number: 1, Tag: Untagged Number
  Number: 3, Tag: Untagged Number
  Number: 5, Tag: Untagged Number
--------------------------------------------------
Sentence: For Amazon data, F1-score is 82.13%, 81.80% and 82.15% for 1-, 3- and 5-gram models.
  Number: 82.13, Tag: F1-Score Metric
  Number: 81.8, Tag: F1-Score Metric
  Number: 82.15, Tag: Score
  Number: 1, Tag: Untagged Number
  Number: 3, Tag: Untagged Number
  Number: 5, Tag: Untagged Number
-------------------------

### Performance Keyword and Performance value extraction
#### Use any LLM model to extract performance keyword and its value from a sentence. Do not use heuristic based approach as doe earlier. The input will be word document

In [8]:
import re
from docx import Document
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import json # Ensure json is imported at the top for clarity

# --- 1. Load the LLM for Extraction ---
# Using HuggingFaceH4/zephyr-7b-beta as it's truly open and instruction-tuned.
# This model will be downloaded to your local cache. No API key needed.
# NOTE: This is a 7B model. It requires significant RAM/VRAM (e.g., 8GB+ VRAM, 16GB+ RAM).
# If you face CUDA out of memory, try removing `torch_dtype=torch.bfloat16`
# If you don't have a GPU, remove `device_map="auto"`
try:
    #model_name = "HuggingFaceH4/zephyr-7b-beta"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    #model_name = "bert-base-uncased"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16, # Use bfloat16 for efficiency if your GPU supports it, or torch.float16
        device_map="auto" # Automatically place model layers on available devices (GPU/CPU)
    )

    # Create a text generation pipeline for the LLM
    llm_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100, # Limit the LLM's output length
        do_sample=False,    # For deterministic output
        temperature=0.0,    # For deterministic output
        top_p=0.9,
    )
    print(f"Successfully loaded LLM: {model_name}")

except Exception as e:
    print(f"Error loading LLM model '{model_name}': {e}")
    print("This error usually indicates insufficient RAM/VRAM or a corrupted download.")
    print("If you are running on CPU, remove `torch_dtype=torch.bfloat16` and `device_map='auto'`.")
    print("Consider using a smaller model if you have limited resources (e.g., a 1B or 3B parameter model).")
    llm_pipeline = None # Indicate that the LLM is not available





def extract_info_with_llm(sentence, llm_pipeline):
    """
    Uses an LLM to extract performance keywords and their values from a sentence.

    Args:
        sentence (str): The input sentence.
        llm_pipeline: The Hugging Face pipeline for text generation (LLM).

    Returns:
        list: A list of dictionaries, each containing {'keyword': str, 'value': float/int}.
              Returns an empty list if no information is extracted.
    """
    if not llm_pipeline:
        return []

    # Zephyr uses a specific chat template for best performance
    messages = [
        {"role": "system", "content": "You are an expert at extracting performance metrics from text. Extract keywords and their numerical values."},
        {"role": "user", "content": f"""Extract performance keywords and their corresponding numerical values from the following sentence.
        Examples of performance keywords include "recall", "precision", "accuracy", "f1-score", "throughput", "latency", "response time", "error rate", "conversion rate", "uptime", etc.
        Output the information as a JSON list, where each item is an object with 'keyword' and 'value' fields. The value should be a number (integer or float).
        If no performance keywords and values are found, output an empty JSON list [].

        Sentence: "{sentence}"
        Output:
        """}
    ]

    # Apply the tokenizer's chat template
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    try:
        generated_text = llm_pipeline(formatted_prompt, stop_sequence=["\n\n", "```"])
        print(generated_text)
        
        # Extract the relevant part of the generated text
        # LLMs often output the prompt again, then their answer.
        # We need to find the part after the prompt and look for the JSON.
        llm_output = generated_text[0]['generated_text'].replace(formatted_prompt, "").strip()
        print(llm_output)

        # Try to parse the JSON output
        json_start = llm_output.find('[')
        json_end = llm_output.rfind(']')

        if json_start != -1 and json_end != -1 and json_end > json_start:
            json_string = llm_output[json_start : json_end + 1]
            json_string = json_string.strip()

            try:
                extracted_data = json.loads(json_string)
                if isinstance(extracted_data, list):
                    valid_extractions = []
                    for item in extracted_data:
                        if isinstance(item, dict) and 'keyword' in item and 'value' in item:
                            try:
                                item['value'] = float(item['value'])
                                if item['value'] == int(item['value']):
                                    item['value'] = int(item['value'])
                                valid_extractions.append(item)
                            except ValueError:
                                pass # Skip if value is not a valid number
                    return valid_extractions
                return []
            except json.JSONDecodeError as jde:
                print(f"Warning: Could not parse JSON from LLM output: {jde}")
                print(f"LLM Raw Output snippet: {llm_output[:200]}...")
                return []
        else:
            print(f"Warning: LLM did not output expected JSON structure. Raw output snippet: {llm_output[:200]}...")
            return []

    except Exception as e:
        print(f"An error occurred during LLM inference: {e}")
        return []

Device set to use cpu


Successfully loaded LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [9]:
# --- Test Cases ---
if llm_pipeline: # Only run tests if the LLM loaded successfully
    print("\n--- Running Test Cases ---")

    test_sentences = [
        "The model achieved a recall of 0.92 and a precision of 0.88.",
        "Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds.",
        "The error rate was 0.01% in the last quarter.",
        "Expected uptime is 99.9% for the server.",
        "The project cost is $1,250,000.", # LLM might not recognize "cost" as performance metric
        "This sentence has no specific performance metrics.",
        "Accuracy improved to 95.5%."
    ]

    for i, sentence in enumerate(test_sentences):
        print(f"\nTest Case {i+1}:")
        print(f"Sentence: \"{sentence}\"")
        
        extracted_metrics = extract_info_with_llm(sentence, llm_pipeline)
        
        if extracted_metrics:
            print("Extracted Metrics:")
            for item in extracted_metrics:
                print(f"  - Keyword: '{item['keyword']}', Value: {item['value']}")
        else:
            print("No performance metrics extracted.")
        print("=" * 70)
else:
    print("\nSkipping test cases because the LLM pipeline failed to load.")



--- Running Test Cases ---

Test Case 1:
Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."




[{'generated_text': '<|system|>\nYou are an expert at extracting performance metrics from text. Extract keywords and their numerical values.</s>\n<|user|>\nExtract performance keywords and their corresponding numerical values from the following sentence.\n        Examples of performance keywords include "recall", "precision", "accuracy", "f1-score", "throughput", "latency", "response time", "error rate", "conversion rate", "uptime", etc.\n        Output the information as a JSON list, where each item is an object with \'keyword\' and \'value\' fields. The value should be a number (integer or float).\n        If no performance keywords and values are found, output an empty JSON list [].\n\n        Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."\n        Output:\n        </s>\n<|assistant|>\nJSON response:\n'}]
JSON response:
No performance metrics extracted.

Test Case 2:
Sentence: "Our system's throughput reached 1200 requests per second, with a latency of only

In [None]:
# TinyLlama/TinyLlama-1.1B-Chat-v1.0

In [10]:
import re
from docx import Document
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os

# --- 1. Load the LLM for Extraction ---
llm_pipeline = None

try:
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 

    # --- IMPORTANT ADJUSTMENTS FOR COMPATIBILITY ---
    # 1. For CPU-only or older/incompatible GPUs:
    #    Remove `torch_dtype` and `device_map="auto"`.
    #    This will load the model in float32 (default) on your CPU.
    #    It will be slower than GPU, but highly compatible.
    
    # 2. If you have a modern NVIDIA GPU that supports bfloat16:
    #    You can keep `torch_dtype=torch.bfloat16` and `device_map="auto"`.
    
    # 3. If you have a modern NVIDIA GPU that supports float16 but NOT bfloat16:
    #    Change `torch_dtype=torch.float16` and keep `device_map="auto"`.

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # COMMENT OUT OR REMOVE THESE LINES IF YOU ARE ON CPU OR GET ERRORS RELATED TO BFLOAT16/DEVICE
        # torch_dtype=torch.bfloat16,
        # device_map="auto" 
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    llm_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        do_sample=False,
        temperature=0.0,
        top_p=0.9,
        # Ensure that the model is loaded with no special quantization config unless specified by user.
        # If you were using 4-bit quantization, you'd add:
        # quantization_config=bnb_config,
    )
    print(f"Successfully loaded LLM: {model_name}")

except Exception as e:
    print(f"Error loading LLM model '{model_name}': {e}")
    print("\n--- Troubleshooting Tips for LLM Loading ---")
    print("1. **Check RAM/VRAM:** Ensure you have enough memory (e.g., 2GB+ for TinyLlama).")
    print("2. **CPU Compatibility:** If on CPU, **remove `torch_dtype` and `device_map='auto'`** from `AutoModelForCausalLM.from_pretrained`.")
    print("3. **GPU `torch_dtype`:** If on GPU, try `torch_dtype=torch.float16` if `bfloat16` fails.")
    print("4. **Corrupted Download:** If it persists, delete the model from your Hugging Face cache and try again.")
    print("   (e.g., `huggingface-cli delete-cache TinyLlama/TinyLlama-1.1B-Chat-v1.0`)")
    llm_pipeline = None


def extract_info_with_llm(sentence, llm_pipeline):
    """
    Uses an LLM to extract performance keywords and their values from a sentence.
    """
    if not llm_pipeline:
        return []

    # Simplified prompt for TinyLlama: Direct user message.
    # Smaller models sometimes struggle with the "system" role.
    messages = [
        {"role": "user", "content": f"""Extract performance keywords and their corresponding numerical values from the following sentence.
        Examples of performance keywords include "recall", "precision", "accuracy", "f1-score", "throughput", "latency", "response time", "error rate", "conversion rate", "uptime", etc.
        Output the information as a JSON list, where each item is an object with 'keyword' and 'value' fields. The value should be a number (integer or float).
        If no performance keywords and values are found, output an empty JSON list [].

        Sentence: "{sentence}"
        Output:
        """}
    ]

    # Apply the tokenizer's chat template
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    try:
        # Crucial for small models: Loosen stop_sequence or remove it.
        # Let's try to let it generate more freely and rely on our parsing.
        # TinyLlama might also generate extra text *before* the JSON.
        generated_text = llm_pipeline(formatted_prompt, stop_sequence=["\n\n", "```", "\nOutput:", "\n[]"]) # Added more specific stop sequences for common LLM output patterns
        
        # --- DEBUGGING PRINTS ---
        print("\n--- LLM Raw Full Output (for debugging) ---")
        # Ensure we're printing the text attribute, not the entire dict
        print(generated_text[0]['generated_text']) 
        print("---------------------------------------------")

        # The replace method might be too strict if the LLM adds extra space/newlines.
        # Instead, let's just look for the part after the prompt.
        raw_llm_response = generated_text[0]['generated_text'].replace(formatted_prompt, "").strip()
        
        # --- DEBUGGING PRINTS ---
        print("\n--- LLM Response After Prompt Removal ---")
        print(raw_llm_response)
        print("------------------------------------------")

        # --- Enhanced JSON Parsing ---
        # 1. Look for markdown code blocks (common for structured LLM output)
        json_code_block_match = re.search(r'```json\s*(.*?)\s*```', raw_llm_response, re.DOTALL)
        if json_code_block_match:
            json_string_candidate = json_code_block_match.group(1)
            print("Found JSON in code block.") # Debugging
        else:
            json_string_candidate = raw_llm_response
            print("No JSON code block found, trying direct parse.") # Debugging

        # 2. Look for the first '[' and last ']' in the *candidate* string
        json_start = json_string_candidate.find('[')
        json_end = json_string_candidate.rfind(']')

        if json_start != -1 and json_end != -1 and json_end > json_start:
            json_string = json_string_candidate[json_start : json_end + 1]
            json_string = json_string.strip()
            print(f"Attempting to parse JSON string: {json_string}") # Debugging

            try:
                extracted_data = json.loads(json_string)
                if isinstance(extracted_data, list):
                    valid_extractions = []
                    for item in extracted_data:
                        if isinstance(item, dict) and 'keyword' in item and 'value' in item:
                            try:
                                item['value'] = float(item['value'])
                                if item['value'] == int(item['value']):
                                    item['value'] = int(item['value'])
                                valid_extractions.append(item)
                            except ValueError:
                                # Skip if value is not a valid number
                                print(f"Warning: Invalid value '{item.get('value')}' for keyword '{item.get('keyword')}'. Skipping.") # Debugging
                                pass 
                    return valid_extractions
                print("Warning: JSON parsed but not a list.") # Debugging
                return []
            except json.JSONDecodeError as jde:
                print(f"Warning: Could not parse JSON from LLM output: {jde}")
                print(f"LLM Raw Output snippet causing error: {json_string_candidate[:200]}...")
                return []
        else:
            print(f"Warning: LLM did not output expected JSON structure ([...]). Raw output snippet: {raw_llm_response[:200]}...")
            return []

    except Exception as e:
        print(f"An error occurred during LLM inference: {e}")
        return []

# --- Rest of your existing document processing code (unchanged) ---

def extract_sentences_from_docx(file_path):
    """
    Reads a Word document and yields sentences.
    """
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')

    try:
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        
        document_content = "\n".join(full_text)
        sentences = sentence_endings.split(document_content)
        
        for sentence in sentences:
            stripped_sentence = sentence.strip()
            if stripped_sentence:
                yield stripped_sentence

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    """
    Main function to demonstrate LLM-based extraction from a Word document.
    """
    if not llm_pipeline:
        print("LLM pipeline not initialized. Cannot perform LLM-based extraction.")
        return

    # Use a dummy .docx file for testing if you don't have one readily available
    # Create a dummy_doc.docx with some text for testing:
    # "The model achieved a recall of 0.92 and a precision of 0.88. Our system's throughput reached 1200 requests per second."
    dummy_doc_path = "dummy_doc.docx"
    import os
    if not os.path.exists(dummy_doc_path):
        print(f"\n--- Creating a dummy '{dummy_doc_path}' for testing ---")
        doc = Document()
        doc.add_paragraph("The model achieved a recall of 0.92 and a precision of 0.88. Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds.")
        doc.add_paragraph("The error rate was 0.01% in the last quarter. Expected uptime is 99.9% for the server.")
        doc.add_paragraph("This sentence has no specific performance metrics. Accuracy improved to 95.5%.")
        doc.add_paragraph("Project budget was $1,250,000. Client satisfaction improved by 20%.") # Testing for non-performance metrics
        doc.save(dummy_doc_path)
        print(f"Dummy document created at: {dummy_doc_path}")

    # file_path = input("Enter the path to the Word document (.docx) file: ") # Use dummy for quick test
    file_path = dummy_doc_path # Use the generated dummy document

    import os
    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print(f"\nProcessing document '{os.path.basename(file_path)}' for performance metrics using LLM...")
    found_any = False
    
    number_regex = re.compile(r'\d') # To filter sentences for LLM calls

    for sentence in extract_sentences_from_docx(file_path):
        if number_regex.search(sentence):
            print(f"\nProcessing Sentence: \"{sentence}\"") # Debugging: show which sentence is being processed
            extracted_metrics = extract_info_with_llm(sentence, llm_pipeline)
            
            if extracted_metrics:
                found_any = True
                print("Extracted Metrics:")
                for item in extracted_metrics:
                    print(f"  - Keyword: '{item['keyword']}', Value: {item['value']}")
                print("=" * 70)
            else:
                print("No performance metrics extracted for this sentence.")
                print("=" * 70)

    if not found_any:
        print("No performance metrics found in the entire document using the LLM.")

    print("\n--- LLM-based Extraction Notes ---")
    print("This approach leverages the LLM's understanding of instructions and context.")
    print("Performance (speed and accuracy) will depend on:")
    print("1. The chosen LLM model (smaller models are faster but less accurate).")
    print("2. Your hardware (GPU is highly recommended for LLMs).")
    print("3. The clarity and specificity of the prompt engineering.")
    print("Errors in LLM output (e.g., malformed JSON) are possible and handled with error checks.")


if __name__ == "__main__":
    main()

Device set to use cpu


Successfully loaded LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0

Processing document 'dummy_doc.docx' for performance metrics using LLM...

Processing Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."
An error occurred during LLM inference: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
No performance metrics extracted for this sentence.

Processing Sentence: "Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds."
An error occurred during LLM inference: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
No performance metrics extracted for this sentence.

Processing Sentence: "The error rate was 0.01% in the last quarter."
An error occurred during LLM inference: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
No performance metrics extracted for this sentence.

Processing Sentence: "Expected uptime is 99.9% for th

In [11]:
import re
from docx import Document
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os 

# --- 1. Load the LLM for Extraction (no changes here as loading is successful) ---
llm_pipeline = None

try:
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 

    # Determine device for optimal loading
    if torch.cuda.is_available():
        device = "cuda"
        dtype = torch.bfloat16
        try:
            _ = torch.randn(1, 1).to(device).to(torch.bfloat16)
        except Exception:
            print("Warning: bfloat16 not supported by GPU, falling back to float16.")
            dtype = torch.float16
    else:
        device = "cpu"
        dtype = torch.float32 # Use float32 on CPU for maximum compatibility

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,       # Use determined dtype
        device_map="auto" if device == "cuda" else None # Only use device_map for CUDA
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    llm_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        do_sample=False,
        temperature=0.0,
        top_p=0.9,
    )
    print(f"Successfully loaded LLM: {model_name} on {device.upper()}.")
    print("-" * 50)

except Exception as e:
    print(f"Error loading LLM model '{model_name}': {e}")
    print("\n--- Troubleshooting Tips for LLM Loading ---")
    print("1. **Check RAM/VRAM:** Ensure you have enough memory (e.g., 2GB+ for TinyLlama).")
    print("2. **Compatibility:** If `torch_dtype` or `device_map` cause issues, review the comments in the `try` block for `AutoModelForCausalLM.from_pretrained`.")
    print("3. **Corrupted Download:** If it persists, delete the model from your Hugging Face cache and try again.")
    print(f"   (e.g., `huggingface-cli delete-cache {model_name}`)")
    llm_pipeline = None


def extract_info_with_llm(sentence, llm_pipeline):
    """
    Uses an LLM to extract performance keywords and their values from a sentence.
    """
    if not llm_pipeline:
        return []

    messages = [
        {"role": "system", "content": "You are an expert at extracting performance metrics from text. Extract keywords and their numerical values."},
        {"role": "user", "content": f"""Extract performance keywords and their corresponding numerical values from the following sentence.
        Examples of performance keywords include "recall", "precision", "accuracy", "f1-score", "throughput", "latency", "response time", "error rate", "conversion rate", "uptime", etc.
        Output the information as a JSON list, where each item is an object with 'keyword' and 'value' fields. The value should be a number (integer or float).
        If no performance keywords and values are found, output an empty JSON list [].

        Sentence: "{sentence}"
        Output:
        """}
    ]

    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    try:
        # --- THE FIX: Pass formatted_prompt as the 'inputs' keyword argument ---
        generated_text = llm_pipeline(
            inputs=formatted_prompt, # <--- THIS IS THE CHANGE
            stop_sequence=["\n\n", "```", "\nOutput:", "\n[]"]
        )
        
        print("\n--- LLM Raw Full Output (for debugging) ---")
        full_conversation_output = generated_text[0]['generated_text']
        print(full_conversation_output) 
        print("---------------------------------------------")

        llm_output = full_conversation_output.replace(formatted_prompt, "").strip()
        
        print("\n--- Extracted LLM Response for Parsing ---")
        print(llm_output)
        print("------------------------------------------")

        # --- Enhanced JSON Parsing (remains the same) ---
        json_code_block_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)
        if json_code_block_match:
            json_string_candidate = json_code_block_match.group(1)
            print("Found JSON in code block.") 
        else:
            json_string_candidate = llm_output
            print("No JSON code block found, trying direct parse.") 

        json_start = json_string_candidate.find('[')
        json_end = json_string_candidate.rfind(']')

        if json_start != -1 and json_end != -1 and json_end > json_start:
            json_string = json_string_candidate[json_start : json_end + 1]
            json_string = json_string.strip()
            print(f"Attempting to parse JSON string: {json_string}") 

            try:
                extracted_data = json.loads(json_string)
                if isinstance(extracted_data, list):
                    valid_extractions = []
                    for item in extracted_data:
                        if isinstance(item, dict) and 'keyword' in item and 'value' in item:
                            try:
                                item['value'] = float(item['value'])
                                if item['value'] == int(item['value']):
                                    item['value'] = int(item['value'])
                                valid_extractions.append(item)
                            except ValueError:
                                print(f"Warning: Invalid value '{item.get('value')}' for keyword '{item.get('keyword')}'. Skipping.") 
                                pass 
                    return valid_extractions
                print("Warning: JSON parsed but not a list.") 
                return []
            except json.JSONDecodeError as jde:
                print(f"Warning: Could not parse JSON from LLM output: {jde}")
                print(f"LLM Raw Output snippet causing error: {json_string_candidate[:200]}...")
                return []
        else:
            print(f"Warning: LLM did not output expected JSON structure ([...]). Raw output snippet: {llm_output[:200]}...")
            return []

    except Exception as e:
        print(f"An error occurred during LLM inference: {e}")
        return []

# --- Rest of your existing code (Document extraction, main, dummy doc creation) ---
def extract_sentences_from_docx(file_path):
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')
    try:
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        document_content = "\n".join(full_text)
        sentences = sentence_endings.split(document_content)
        for sentence in sentences:
            stripped_sentence = sentence.strip()
            if stripped_sentence:
                yield stripped_sentence
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    if not llm_pipeline:
        print("LLM pipeline not initialized. Cannot perform LLM-based extraction.")
        return

    dummy_doc_path = "dummy_doc.docx"
    if not os.path.exists(dummy_doc_path):
        print(f"\n--- Creating a dummy '{dummy_doc_path}' for testing ---")
        doc = Document()
        doc.add_paragraph("The model achieved a recall of 0.92 and a precision of 0.88. Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds.")
        doc.add_paragraph("The error rate was 0.01% in the last quarter. Expected uptime is 99.9% for the server.")
        doc.add_paragraph("This sentence has no specific performance metrics. Accuracy improved to 95.5%.")
        doc.add_paragraph("Project budget was $1,250,000. Client satisfaction improved by 20%.")
        doc.save(dummy_doc_path)
        print(f"Dummy document created at: {dummy_doc_path}")

    file_path = dummy_doc_path 

    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print(f"\nProcessing document '{os.path.basename(file_path)}' for performance metrics using LLM...")
    found_any = False
    
    number_regex = re.compile(r'\d') 

    for sentence in extract_sentences_from_docx(file_path):
        if number_regex.search(sentence):
            print(f"\nProcessing Sentence: \"{sentence}\"")
            extracted_metrics = extract_info_with_llm(sentence, llm_pipeline)
            
            if extracted_metrics:
                found_any = True
                print("Extracted Metrics:")
                for item in extracted_metrics:
                    print(f"  - Keyword: '{item['keyword']}', Value: {item['value']}")
                print("=" * 70)
            else:
                print("No performance metrics extracted for this sentence.")
                print("=" * 70)

    if not found_any:
        print("No performance metrics found in the entire document using the LLM.")

    print("\n--- LLM-based Extraction Notes ---")
    print("This approach leverages the LLM's understanding of instructions and context.")
    print("Performance (speed and accuracy) will depend on:")
    print("1. The chosen LLM model (smaller models are faster but less accurate).")
    print("2. Your hardware (GPU is highly recommended for LLMs).")
    print("3. The clarity and specificity of the prompt engineering.")
    print("Errors in LLM output (e.g., malformed JSON) are possible and handled with error checks.")


if __name__ == "__main__":
    main()

Device set to use cpu


Successfully loaded LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 on CPU.
--------------------------------------------------

Processing document 'dummy_doc.docx' for performance metrics using LLM...

Processing Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."
An error occurred during LLM inference: __call__() missing 1 required positional argument: 'text_inputs'
No performance metrics extracted for this sentence.

Processing Sentence: "Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds."
An error occurred during LLM inference: __call__() missing 1 required positional argument: 'text_inputs'
No performance metrics extracted for this sentence.

Processing Sentence: "The error rate was 0.01% in the last quarter."
An error occurred during LLM inference: __call__() missing 1 required positional argument: 'text_inputs'
No performance metrics extracted for this sentence.

Processing Sentence: "Expected uptime is 99.9% for the se

In [12]:
import re
from docx import Document
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os 

# --- 1. Load the LLM for Extraction (no changes here as loading is successful) ---
llm_pipeline = None

try:
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 

    # Determine device for optimal loading
    if torch.cuda.is_available():
        device = "cuda"
        dtype = torch.bfloat16
        try:
            _ = torch.randn(1, 1).to(device).to(torch.bfloat16)
        except Exception:
            print("Warning: bfloat16 not supported by GPU, falling back to float16.")
            dtype = torch.float16
    else:
        device = "cpu"
        dtype = torch.float32 # Use float32 on CPU for maximum compatibility

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,       # Use determined dtype
        device_map="auto" if device == "cuda" else None # Only use device_map for CUDA
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    llm_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        do_sample=False,
        temperature=0.0,
        top_p=0.9,
    )
    print(f"Successfully loaded LLM: {model_name} on {device.upper()}.")
    print("-" * 50)

except Exception as e:
    print(f"Error loading LLM model '{model_name}': {e}")
    print("\n--- Troubleshooting Tips for LLM Loading ---")
    print("1. **Check RAM/VRAM:** Ensure you have enough memory (e.g., 2GB+ for TinyLlama).")
    print("2. **Compatibility:** If `torch_dtype` or `device_map` cause issues, review the comments in the `try` block for `AutoModelForCausalLM.from_pretrained`.")
    print("3. **Corrupted Download:** If it persists, delete the model from your Hugging Face cache and try again.")
    print(f"   (e.g., `huggingface-cli delete-cache {model_name}`)")
    llm_pipeline = None


def extract_info_with_llm(sentence, llm_pipeline):
    """
    Uses an LLM to extract performance keywords and their values from a sentence.
    """
    if not llm_pipeline:
        return []

    messages = [
        {"role": "system", "content": "You are an expert at extracting performance metrics from text. Extract keywords and their numerical values."},
        {"role": "user", "content": f"""Extract performance keywords and their corresponding numerical values from the following sentence.
        Examples of performance keywords include "recall", "precision", "accuracy", "f1-score", "throughput", "latency", "response time", "error rate", "conversion rate", "uptime", etc.
        Output the information as a JSON list, where each item is an object with 'keyword' and 'value' fields. The value should be a number (integer or float).
        If no performance keywords and values are found, output an empty JSON list [].

        Sentence: "{sentence}"
        Output:
        """}
    ]

    # Manually apply the chat template to get the exact string the model expects.
    # `add_generation_prompt=True` adds the final token (e.g., "<|assistant|>\n")
    # that tells the model it's its turn to generate.
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    try:
        # --- THE FINAL ATTEMPT AT THE FIX: Pass formatted_prompt as a positional argument ---
        generated_text = llm_pipeline(
            formatted_prompt, # <--- PASS THE STRING DIRECTLY AS POSITIONAL ARGUMENT
            stop_sequence=["\n\n", "```", "\nOutput:", "\n[]"]
        )
        
        print("\n--- LLM Raw Full Output (for debugging) ---")
        full_conversation_output = generated_text[0]['generated_text']
        print(full_conversation_output) 
        print("---------------------------------------------")

        # Now, `llm_output` needs to be extracted from the *end* of the conversation.
        # It's everything after the `formatted_prompt` itself.
        llm_output = full_conversation_output.replace(formatted_prompt, "").strip()
        
        print("\n--- Extracted LLM Response for Parsing ---")
        print(llm_output)
        print("------------------------------------------")

        # --- Enhanced JSON Parsing (remains the same) ---
        json_code_block_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)
        if json_code_block_match:
            json_string_candidate = json_code_block_match.group(1)
            print("Found JSON in code block.") 
        else:
            json_string_candidate = llm_output
            print("No JSON code block found, trying direct parse.") 

        json_start = json_string_candidate.find('[')
        json_end = json_string_candidate.rfind(']')

        if json_start != -1 and json_end != -1 and json_end > json_start:
            json_string = json_string_candidate[json_start : json_end + 1]
            json_string = json_string.strip()
            print(f"Attempting to parse JSON string: {json_string}") 

            try:
                extracted_data = json.loads(json_string)
                if isinstance(extracted_data, list):
                    valid_extractions = []
                    for item in extracted_data:
                        if isinstance(item, dict) and 'keyword' in item and 'value' in item:
                            try:
                                item['value'] = float(item['value'])
                                if item['value'] == int(item['value']):
                                    item['value'] = int(item['value'])
                                valid_extractions.append(item)
                            except ValueError:
                                print(f"Warning: Invalid value '{item.get('value')}' for keyword '{item.get('keyword')}'. Skipping.") 
                                pass 
                    return valid_extractions
                print("Warning: JSON parsed but not a list.") 
                return []
            except json.JSONDecodeError as jde:
                print(f"Warning: Could not parse JSON from LLM output: {jde}")
                print(f"LLM Raw Output snippet causing error: {json_string_candidate[:200]}...")
                return []
        else:
            print(f"Warning: LLM did not output expected JSON structure ([...]). Raw output snippet: {llm_output[:200]}...")
            return []

    except Exception as e:
        print(f"An error occurred during LLM inference: {e}")
        return []

# --- Rest of your existing code (Document extraction, main, dummy doc creation) ---
def extract_sentences_from_docx(file_path):
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')
    try:
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        document_content = "\n".join(full_text)
        sentences = sentence_endings.split(document_content)
        for sentence in sentences:
            stripped_sentence = sentence.strip()
            if stripped_sentence:
                yield stripped_sentence
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    if not llm_pipeline:
        print("LLM pipeline not initialized. Cannot perform LLM-based extraction.")
        return

    dummy_doc_path = "dummy_doc.docx"
    if not os.path.exists(dummy_doc_path):
        print(f"\n--- Creating a dummy '{dummy_doc_path}' for testing ---")
        doc = Document()
        doc.add_paragraph("The model achieved a recall of 0.92 and a precision of 0.88. Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds.")
        doc.add_paragraph("The error rate was 0.01% in the last quarter. Expected uptime is 99.9% for the server.")
        doc.add_paragraph("This sentence has no specific performance metrics. Accuracy improved to 95.5%.")
        doc.add_paragraph("Project budget was $1,250,000. Client satisfaction improved by 20%.")
        doc.save(dummy_doc_path)
        print(f"Dummy document created at: {dummy_doc_path}")

    file_path = dummy_doc_path 

    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print(f"\nProcessing document '{os.path.basename(file_path)}' for performance metrics using LLM...")
    found_any = False
    
    number_regex = re.compile(r'\d') 

    for sentence in extract_sentences_from_docx(file_path):
        if number_regex.search(sentence):
            print(f"\nProcessing Sentence: \"{sentence}\"")
            extracted_metrics = extract_info_with_llm(sentence, llm_pipeline)
            
            if extracted_metrics:
                found_any = True
                print("Extracted Metrics:")
                for item in extracted_metrics:
                    print(f"  - Keyword: '{item['keyword']}', Value: {item['value']}")
                print("=" * 70)
            else:
                print("No performance metrics extracted for this sentence.")
                print("=" * 70)

    if not found_any:
        print("No performance metrics found in the entire document using the LLM.")

    print("\n--- LLM-based Extraction Notes ---")
    print("This approach leverages the LLM's understanding of instructions and context.")
    print("Performance (speed and accuracy) will depend on:")
    print("1. The chosen LLM model (smaller models are faster but less accurate).")
    print("2. Your hardware (GPU is highly recommended for LLMs).")
    print("3. The clarity and specificity of the prompt engineering.")
    print("Errors in LLM output (e.g., malformed JSON) are possible and handled with error checks.")


if __name__ == "__main__":
    main()

Device set to use cpu


Successfully loaded LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 on CPU.
--------------------------------------------------

Processing document 'dummy_doc.docx' for performance metrics using LLM...

Processing Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."
An error occurred during LLM inference: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
No performance metrics extracted for this sentence.

Processing Sentence: "Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds."
An error occurred during LLM inference: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
No performance metrics extracted for this sentence.

Processing Sentence: "The error rate was 0.01% in the last quarter."
An error occurred during LLM inference: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
No performance metrics extracted for this sente

In [13]:
import re
from docx import Document
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os 

# --- 1. Load the LLM for Extraction (no changes here as loading is successful) ---
llm_pipeline = None

try:
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 

    # Determine device for optimal loading
    if torch.cuda.is_available():
        device = "cuda"
        dtype = torch.bfloat16
        try:
            _ = torch.randn(1, 1).to(device).to(torch.bfloat16)
        except Exception:
            print("Warning: bfloat16 not supported by GPU, falling back to float16.")
            dtype = torch.float16
    else:
        device = "cpu"
        dtype = torch.float32 # Use float32 on CPU for maximum compatibility
    print(f"Device set to use {device}") # Add this to confirm device


    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,       # Use determined dtype
        device_map="auto" if device == "cuda" else None # Only use device_map for CUDA
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Important: Set pad_token_id for generation, especially if batching or varying lengths
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        print("Set tokenizer.pad_token_id to tokenizer.eos_token_id")
    
    llm_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        do_sample=False,
        temperature=0.0,
        top_p=0.9,
    )
    print(f"Successfully loaded LLM: {model_name} on {device.upper()}.")
    print("-" * 50)

except Exception as e:
    print(f"Error loading LLM model '{model_name}': {e}")
    print("\n--- Troubleshooting Tips for LLM Loading ---")
    print("1. **Check RAM/VRAM:** Ensure you have enough memory (e.g., 2GB+ for TinyLlama).")
    print("2. **Compatibility:** If `torch_dtype` or `device_map` cause issues, review the comments in the `try` block for `AutoModelForCausalLM.from_pretrained`.")
    print("3. **Corrupted Download:** If it persists, delete the model from your Hugging Face cache and try again.")
    print(f"   (e.g., `huggingface-cli delete-cache {model_name}`)")
    llm_pipeline = None


def extract_info_with_llm(sentence, llm_pipeline):
    """
    Uses an LLM to extract performance keywords and their values from a sentence.
    """
    if not llm_pipeline:
        return []

    messages = [
        {"role": "system", "content": "You are an expert at extracting performance metrics from text. Extract keywords and their numerical values."},
        {"role": "user", "content": f"""Extract performance keywords and their corresponding numerical values from the following sentence.
        Examples of performance keywords include "recall", "precision", "accuracy", "f1-score", "throughput", "latency", "response time", "error rate", "conversion rate", "uptime", etc.
        Output the information as a JSON list, where each item is an object with 'keyword' and 'value' fields. The value should be a number (integer or float).
        If no performance keywords and values are found, output an empty JSON list [].

        Sentence: "{sentence}"
        Output:
        """}
    ]

    # Manually apply the chat template to get the exact string.
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    try:
        # --- THE NEW ATTEMPT AT THE FIX: Tokenize manually and pass inputs_ids ---
        # This explicitly prepares the input as numerical IDs, which is what the model's
        # underlying forward pass ultimately expects.
        
        # Tokenize the prompt
        inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True)
        # Move inputs to the correct device (CPU in your case)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        # Now call the pipeline with the tokenized inputs.
        # This forces the pipeline to use your pre-tokenized input,
        # bypassing its internal tokenization logic that might be causing the error.
        generated_text = llm_pipeline(
            **inputs, # Unpack the dictionary of input_ids and attention_mask
            stop_sequence=["\n\n", "```", "\nOutput:", "\n[]"],
            # Ensure the max_new_tokens isn't cutting off the prompt prematurely
            max_new_tokens=100, # Re-confirm this as it might interact
            pad_token_id=tokenizer.pad_token_id # Explicitly pass pad_token_id
        )
        
        print("\n--- LLM Raw Full Output (for debugging) ---")
        full_conversation_output = generated_text[0]['generated_text']
        print(full_conversation_output) 
        print("---------------------------------------------")

        # Now, `llm_output` needs to be extracted from the *end* of the conversation.
        # It's everything after the `formatted_prompt` itself.
        llm_output = full_conversation_output.replace(formatted_prompt, "").strip()
        
        print("\n--- Extracted LLM Response for Parsing ---")
        print(llm_output)
        print("------------------------------------------")

        # --- Enhanced JSON Parsing (remains the same) ---
        json_code_block_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)
        if json_code_block_match:
            json_string_candidate = json_code_block_match.group(1)
            print("Found JSON in code block.") 
        else:
            json_string_candidate = llm_output
            print("No JSON code block found, trying direct parse.") 

        json_start = json_string_candidate.find('[')
        json_end = json_string_candidate.rfind(']')

        if json_start != -1 and json_end != -1 and json_end > json_start:
            json_string = json_string_candidate[json_start : json_end + 1]
            json_string = json_string.strip()
            print(f"Attempting to parse JSON string: {json_string}") 

            try:
                extracted_data = json.loads(json_string)
                if isinstance(extracted_data, list):
                    valid_extractions = []
                    for item in extracted_data:
                        if isinstance(item, dict) and 'keyword' in item and 'value' in item:
                            try:
                                item['value'] = float(item['value'])
                                if item['value'] == int(item['value']):
                                    item['value'] = int(item['value'])
                                valid_extractions.append(item)
                            except ValueError:
                                print(f"Warning: Invalid value '{item.get('value')}' for keyword '{item.get('keyword')}'. Skipping.") 
                                pass 
                    return valid_extractions
                print("Warning: JSON parsed but not a list.") 
                return []
            except json.JSONDecodeError as jde:
                print(f"Warning: Could not parse JSON from LLM output: {jde}")
                print(f"LLM Raw Output snippet causing error: {json_string_candidate[:200]}...")
                return []
        else:
            print(f"Warning: LLM did not output expected JSON structure ([...]). Raw output snippet: {llm_output[:200]}...")
            return []

    except Exception as e:
        print(f"An error occurred during LLM inference: {e}")
        return []

# --- Rest of your existing code (Document extraction, main, dummy doc creation) ---
def extract_sentences_from_docx(file_path):
    sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|!)\s+|(?<=\.|\?|!)"?\s+')
    try:
        document = Document(file_path)
        full_text = []
        for paragraph in document.paragraphs:
            full_text.append(paragraph.text)
        document_content = "\n".join(full_text)
        sentences = sentence_endings.split(document_content)
        for sentence in sentences:
            stripped_sentence = sentence.strip()
            if stripped_sentence:
                yield stripped_sentence
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while reading the Word document: {e}")
        return

def main():
    if not llm_pipeline:
        print("LLM pipeline not initialized. Cannot perform LLM-based extraction.")
        return

    dummy_doc_path = "dummy_doc.docx"
    if not os.path.exists(dummy_doc_path):
        print(f"\n--- Creating a dummy '{dummy_doc_path}' for testing ---")
        doc = Document()
        doc.add_paragraph("The model achieved a recall of 0.92 and a precision of 0.88. Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds.")
        doc.add_paragraph("The error rate was 0.01% in the last quarter. Expected uptime is 99.9% for the server.")
        doc.add_paragraph("This sentence has no specific performance metrics. Accuracy improved to 95.5%.")
        doc.add_paragraph("Project budget was $1,250,000. Client satisfaction improved by 20%.")
        doc.save(dummy_doc_path)
        print(f"Dummy document created at: {dummy_doc_path}")

    file_path = dummy_doc_path 

    if not os.path.exists(file_path):
        print(f"Error: File does not exist at the specified path: {file_path}")
        return
    
    if not file_path.lower().endswith('.docx'):
        print(f"Error: The provided file is not a .docx document. Please provide a Word document.")
        return

    print(f"\nProcessing document '{os.path.basename(file_path)}' for performance metrics using LLM...")
    found_any = False
    
    number_regex = re.compile(r'\d') 

    for sentence in extract_sentences_from_docx(file_path):
        if number_regex.search(sentence):
            print(f"\nProcessing Sentence: \"{sentence}\"")
            extracted_metrics = extract_info_with_llm(sentence, llm_pipeline)
            
            if extracted_metrics:
                found_any = True
                print("Extracted Metrics:")
                for item in extracted_metrics:
                    print(f"  - Keyword: '{item['keyword']}', Value: {item['value']}")
                print("=" * 70)
            else:
                print("No performance metrics extracted for this sentence.")
                print("=" * 70)

    if not found_any:
        print("No performance metrics found in the entire document using the LLM.")

    print("\n--- LLM-based Extraction Notes ---")
    print("This approach leverages the LLM's understanding of instructions and context.")
    print("Performance (speed and accuracy) will depend on:")
    print("1. The chosen LLM model (smaller models are faster but less accurate).")
    print("2. Your hardware (GPU is highly recommended for LLMs).")
    print("3. The clarity and specificity of the prompt engineering.")
    print("Errors in LLM output (e.g., malformed JSON) are possible and handled with error checks.")


if __name__ == "__main__":
    main()

Device set to use cpu


Device set to use cpu


Successfully loaded LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 on CPU.
--------------------------------------------------

Processing document 'dummy_doc.docx' for performance metrics using LLM...

Processing Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."
An error occurred during LLM inference: __call__() missing 1 required positional argument: 'text_inputs'
No performance metrics extracted for this sentence.

Processing Sentence: "Our system's throughput reached 1200 requests per second, with a latency of only 50 milliseconds."
An error occurred during LLM inference: __call__() missing 1 required positional argument: 'text_inputs'
No performance metrics extracted for this sentence.

Processing Sentence: "The error rate was 0.01% in the last quarter."
An error occurred during LLM inference: __call__() missing 1 required positional argument: 'text_inputs'
No performance metrics extracted for this sentence.

Processing Sentence: "Expected uptime is 99.9% for the se

In [15]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import re
import json

# Load a small, instruction-tuned model
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def extract_metrics_with_flan(sentences):
    results = []

    for sentence in sentences:
        prompt = f"""Extract all performance metrics and their numeric values from the following sentence. Return as JSON with metric names as keys and values as floats. Convert percentages to float (e.g., 95.5% → 95.5).

Sentence: "{sentence}"
"""

        try:
            response = generator(prompt, max_new_tokens=128)[0]["generated_text"]

            # Try to extract a valid JSON object from the response
            match = re.search(r"\{.*\}", response, re.DOTALL)
            if match:
                result = json.loads(match.group())
            else:
                result = {}
        except Exception as e:
            print(f"Error: {e}")
            result = {}

        results.append(result)

    return results

# --- Example usage ---
sentences = [
    "The model achieved a recall of 0.92 and a precision of 0.88.",
    "Accuracy improved to 95.5%.",
    "F1-score was 0.81 while ROC AUC reached 0.93.",
    "RMSE is 3.5 and R2 score is 0.89.",
    "The model’s sensitivity dropped to 74.2% while specificity remained at 89%.",
    "Area under curve is around 91.7 percent with a mean absolute error of 1.2."
]

results = extract_metrics_with_flan(sentences)
for sent, metrics in zip(sentences, results):
    print(f"Sentence: {sent}\nExtracted: {metrics}\n")


Device set to use cpu


Sentence: The model achieved a recall of 0.92 and a precision of 0.88.
Extracted: {}

Sentence: Accuracy improved to 95.5%.
Extracted: {}

Sentence: F1-score was 0.81 while ROC AUC reached 0.93.
Extracted: {}

Sentence: RMSE is 3.5 and R2 score is 0.89.
Extracted: {}

Sentence: The model’s sensitivity dropped to 74.2% while specificity remained at 89%.
Extracted: {}

Sentence: Area under curve is around 91.7 percent with a mean absolute error of 1.2.
Extracted: {}



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import re
import json

# Use Phi-2 with the correct class
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)

def extract_metric_names(sentences):
    results = []

    for sentence in sentences:
        prompt = f"""Extract only the names of performance metrics mentioned in the following sentence. Ignore values. Return the result as a Python list of strings.

Sentence: "{sentence}"
"""

        try:
            response = generator(prompt, do_sample=False)[0]["generated_text"]

            # Extract Python-style list
            match = re.search(r"\[.*?\]", response, re.DOTALL)
            if match:
                result = eval(match.group(), {"__builtins__": None}, {})
                if isinstance(result, list):
                    results.append(result)
                else:
                    results.append([])
            else:
                results.append([])
        except Exception as e:
            print(f"Error: {e}")
            results.append([])

    return results

# --- Example Usage ---
sentences = [
    "The model achieved a recall of 0.92 and a precision of 0.88.",
    "Accuracy improved to 95.5%.",
    "F1-score was 0.81 while ROC AUC reached 0.93.",
    "RMSE is 3.5 and R2 score is 0.89.",
    "The model’s sensitivity dropped to 74.2% while specificity remained at 89%.",
    "Area under curve is around 91.7 percent with a mean absolute error of 1.2."
]

results = extract_metric_names(sentences)
for sent, metrics in zip(sentences, results):
    print(f"Sentence: {sent}\nExtracted Metric Names: {metrics}\n")


### Here, restart the kernel... Following 2 Cell Takes ***TREMENDOUSLY*** long to execute

### ✅ Example Prompt with Few-Shot Learning

In [17]:
prompt = """
Extract only the names of performance metrics mentioned in each sentence. Ignore values. 
Return the result as a Python list of strings.

Example 1:
Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."
Output: ["recall", "precision"]

Example 2:
Sentence: "Accuracy improved to 95.5%."
Output: ["accuracy"]

Example 3:
Sentence: "F1-score was 0.81 while ROC AUC reached 0.93."
Output: ["f1-score", "roc auc"]

Now extract from this sentence:
Sentence: "The model’s sensitivity dropped to 74.2% while specificity remained at 89%."
Output:
"""


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import re, json

#model_name = "microsoft/phi-2"
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=64)

few_shot_prefix = """
Extract only the names of performance metrics mentioned in each sentence. Ignore values. 
Return the result as a Python list of strings.

Example 1:
Sentence: "The model achieved a recall of 0.92 and a precision of 0.88."
Output: ["recall", "precision"]

Example 2:
Sentence: "Accuracy improved to 95.5%."
Output: ["accuracy"]

Example 3:
Sentence: "F1-score was 0.81 while ROC AUC reached 0.93."
Output: ["f1-score", "roc auc"]

Now extract from this sentence:
"""

def extract_metric_names_fewshot(sentences):
    results = []
    for sentence in sentences:
        prompt = few_shot_prefix + f'Sentence: "{sentence}"\nOutput:'

        try:
            output = generator(prompt, do_sample=False)[0]["generated_text"]
            match = re.search(r"\[.*?\]", output)
            result = eval(match.group(), {"__builtins__": None}, {}) if match else []
        except:
            result = []
        results.append(result)
    return results

# --- Example ---
sentences = [
    "The model’s sensitivity dropped to 74.2% while specificity remained at 89%.",
    "Area under curve is around 91.7 percent with a mean absolute error of 1.2."
]

results = extract_metric_names_fewshot(sentences)
for s, r in zip(sentences, results):
    print(f"Sentence: {s}\nExtracted Metric Names: {r}\n")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
import transformers

transformers.__version__

1. Choose a Base Model
Recommended small LLMs:

google/flan-t5-small or flan-t5-base (Seq2Seq, good for summarization)

MBZUAI/LaMini-Flan-T5-783M (instruction-tuned, compact)

microsoft/phi-2 (causal LM, use for generation, not summarization directly)

2. Prepare Your Dataset
For summarization:

json
Copy
Edit
{"text": "Long text here...", "summary": "Concise summary."}
For text generation:

json
Copy
Edit
{"prompt": "Write a poem about stars.", "output": "Stars shine bright in the night..."}
Format: Use Hugging Face datasets library or CSV/JSONL.

3. Fine-Tune (with or without LoRA)
🔁 Option A: Full Fine-Tuning (simple, but heavy)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load your dataset
dataset = load_dataset("json", data_files={"train": "performance_metrics_dataset.json"})

# Tokenize
def preprocess(example):
    input_text = example["text"]
    target_text = example["summary"]
    model_inputs = tokenizer(input_text, max_length=512, truncation=True)
    labels = tokenizer(target_text, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True)

# Training setup
args = Seq2SeqTrainingArguments(
    output_dir="./flan-summarizer",
    #evaluation_strategy="no",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    fp16=False,  # Set True if using GPU with mixed precision
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

trainer.train()


🔁 Option B: Parameter-Efficient Tuning with LoRA (Recommended for low-resource)
Use peft + trl + transformers (works great with T5, FLAN, etc.).

bash
Copy
Edit
pip install peft transformers accelerate datasets trl
Would you like a full LoRA script template?

4. Inference After Fine-Tuning

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint_path = "./flan-summarizer/checkpoint-75"  # use actual checkpoint path

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")  # or use checkpoint_path if tokenizer was saved too
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Test
text = "The model achieved a recall of 0.92 and a precision of 0.88."
print(summarizer(text, max_length = 4))

In [None]:
import json

# Load the test dataset
with open("test_performance_metrics_dataset.json", "r") as f:
    test_data = json.load(f)

    
texts = [sample["text"] for sample in test_data]


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Load the model (adjust path as needed)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("./flan-summarizer/checkpoint-75")
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Run inference
predictions = [summarizer(text)[0]["summary_text"] for text in texts]


In [None]:
for i, sample in enumerate(test_data[:5]):  # show first 5 examples
    print("Input:", sample["text"])
    print("Expected:", sample["summary"])
    print("Predicted:", predictions[i])
    print("-" * 50)


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

text = "The model achieved a recall of 0.92 and a precision of 0.88."
print(summarizer(text, max_length = 5))

Consider you are an expert in python and LLM fine-tuning. Provide code to fine-tune the NER LLM model. Keep in mind the NER should tag the performance metrics keyword as "PERF_KEY". This list should be be dynamic and should not be hardcoded

1. 📦 Install Required Packages (if not already installed)
   pip install transformers datasets seqeval

2. 🧠 Prepare the Training Script

In [None]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

# Example data - Replace with your own dataset
examples = [
    {
        "tokens": ["The", "model", "achieved", "a", "recall", "of", "0.92", "and", "precision", "of", "0.88", "."],
        "ner_tags": ["O", "O", "O", "O", "B-PERF_KEY", "O", "O", "O", "B-PERF_KEY", "O", "O", "O"]
    },
    {
        "tokens": ["Accuracy", "improved", "to", "95.5", "%", "."],
        "ner_tags": ["B-PERF_KEY", "O", "O", "O", "O", "O"]
    },
    {
        "tokens": ["The", "results", "are", "under", "review", "."],
        "ner_tags": ["O", "O", "O", "O", "O", "O"]
    }
]

# Create Hugging Face dataset
dataset = Dataset.from_list(examples)
label_list = ["O", "B-PERF_KEY"]

# Tokenizer and model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["ner_tags"][word_idx])
        else:
            label_ids.append(example["ner_tags"][word_idx] if True else -100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# Load model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Training arguments
args = TrainingArguments(
    output_dir="./ner_perf_key_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Use seqeval metric
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()


The error you're encountering is due to trying to store a Python object ('O')—likely a non-integer label like a string—into a column that is expected to be an integer (specifically, the "labels" field in your dataset). The Hugging Face datasets library (which uses Apache Arrow under the hood) is strict about data types.

Here’s how to fix it:

🛠 The Problem
In your tokenize_and_align_labels function, you're likely assigning labels that are still strings (like "O", "B-LOC", etc.) rather than their corresponding integer IDs.



✅ The Fix
You must convert each label to its integer ID using the label_to_id mapping from your model config or dataset.

✅ Solution – Ensure Proper Label Mapping
Here’s how to properly handle this in tokenize_and_align_labels:

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            label = example["ner_tags"][word_idx]  # Assuming labels are already IDs
            labels.append(label)
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



🔁 If example["ner_tags"] are strings, you need to map them:
label2id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2, ...}  # example
label = label2id[example["ner_tags"][word_idx]]



🔍 What To Do Next
Inspect your ner_tags column:
print(dataset[0]["ner_tags"])  # Are these strings or integers?


If they are strings, convert them using label2id.
Retry the mapping step:
    tokenized_dataset = dataset.map(tokenize_and_align_labels)


In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

# 1. Example data: replace with your actual dataset
data = {
    "tokens": [
        ["The", "model", "achieved", "95", "%", "accuracy", "on", "CIFAR-10"],
        ["Precision", "was", "around", "90", "%"]
    ],
    "ner_tags": [
        ["O", "O", "O", "B-PERF_VAL", "I-PERF_VAL", "B-PERF_KEY", "O", "O"],
        ["B-PERF_KEY", "O", "O", "B-PERF_VAL", "I-PERF_VAL"]
    ]
}


data = {
    "tokens": [
        ["The", "model", "achieved", "95", "%", "accuracy", "on", "CIFAR-10"],
        ["Precision", "was", "around", "90", "%"]
    ],
    "ner_tags": [
        ["O", "O", "O", "O", "O", "B-PERF_KEY", "O", "O"],
        ["B-PERF_KEY", "O", "O", "O", "O"]
    ]
}

dataset = Dataset.from_dict(data)

# 2. Define label list and mapping
label_list = sorted(list(set(tag for seq in data["ner_tags"] for tag in seq)))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# 3. Convert string labels to integer IDs in dataset
def convert_tags_to_ids(example):
    example["ner_tags"] = [label2id[label] for label in example["ner_tags"]]
    return example

dataset = dataset.map(convert_tags_to_ids)

# 4. Tokenizer & Model
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 5. Tokenize & align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 6. Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 7. Evaluation metric
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return seqeval.compute(predictions=true_predictions, references=true_labels)

# 8. Training Arguments
training_args = TrainingArguments(
    output_dir="./ner_model_perf_key",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train
trainer.train()


3. 📈 How to Predict
After training, you can use this model for inference:

In [None]:
from transformers import pipeline

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
example = "The model achieved 97% accuracy and 0.88 precision."
ner_pipeline(example)


In [1]:
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from datasets import load_dataset, ClassLabel
from seqeval.metrics import classification_report, f1_score

# 1. Load Dataset from JSON
dataset = load_dataset("json", data_files="ner_performance_metrics_dataset.json")["train"]

# 2. Extract unique labels and create mappings
label_list = sorted({label for example in dataset for label in example["ner_tags"]})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# 3. Map NER tags to label IDs
def encode_labels(example):
    example["label_ids"] = [label_to_id[label] for label in example["ner_tags"]]
    return example

dataset = dataset.map(encode_labels)

# 4. Load tokenizer and model checkpoint
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), id2label=id_to_label, label2id=label_to_id)

# 5. Tokenization and alignment
label_all_tokens = True

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["label_ids"][word_idx])
        else:
            label_ids.append(example["label_ids"][word_idx] if label_all_tokens else -100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 6. Split dataset
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 7. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metric computation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id_to_label[label] for (pred, label) in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    true_predictions = [
        [id_to_label[pred] for (pred, label) in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    return {
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=True),
    }

# 9. Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model_perf_key",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

# 10. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train
trainer.train()





Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
import json

# Load the JSON file
with open("ner_performance_metrics_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Check for mismatches
mismatches = [
    i for i, item in enumerate(data)
    if len(item["tokens"]) != len(item["ner_tags"])
]

# Print results
if mismatches:
    print(f"Found {len(mismatches)} mismatched entries at indices: {mismatches}")
else:
    print("✅ All token and ner_tag lists are of equal length.")


In [None]:
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score

# 1. Load Dataset from JSON
dataset = load_dataset("json", data_files="ner_performance_metrics_dataset.json")["train"]

# 2. Extract unique labels and create mappings
label_list = sorted({label for example in dataset for label in example["ner_tags"]})
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# 3. Map NER tags to label IDs
def encode_labels(example):
    example["label_ids"] = [label_to_id[label] for label in example["ner_tags"]]
    return example

dataset = dataset.map(encode_labels)

# 4. Load tokenizer and model checkpoint
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# 5. Tokenization and alignment
label_all_tokens = True

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        max_length=128,
    )
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["label_ids"][word_idx])
        else:
            label_ids.append(example["label_ids"][word_idx] if label_all_tokens else -100)
        previous_word_idx = word_idx

    # Manually ensure label_ids match input_ids length
    input_len = len(tokenized_inputs["input_ids"])
    if len(label_ids) < input_len:
        label_ids += [-100] * (input_len - len(label_ids))
    elif len(label_ids) > input_len:
        label_ids = label_ids[:input_len]

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

def validate_and_fix_labels(example):
    labels = example["labels"]
    input_len = len(example["input_ids"])

    # Flatten nested lists if any
    if any(isinstance(lbl, list) for lbl in labels):
        flat_labels = []
        for lbl in labels:
            if isinstance(lbl, list):
                flat_labels.extend(lbl)
            else:
                flat_labels.append(lbl)
        labels = flat_labels

    # Ensure all labels are ints
    labels = [int(lbl) for lbl in labels]

    # Pad with -100 if shorter than input_ids
    if len(labels) < input_len:
        labels = labels + [-100] * (input_len - len(labels))
    # Truncate if longer
    else:
        labels = labels[:input_len]

    example["labels"] = labels

    # Optional validation print (comment out in production)
    print(f"Input length: {input_len}, Labels length: {len(labels)}")
    print(f"Labels sample: {labels[:20]}")

    return example

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)
# Apply this function to your tokenized dataset
tokenized_dataset = tokenized_dataset.map(validate_and_fix_labels)



print(tokenized_dataset[0]['labels'])
print(type(tokenized_dataset[0]['labels']))


# 6. Split dataset
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(train_dataset[0].keys()) 


# 7. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metric computation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id_to_label[label] for (pred, label) in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    true_predictions = [
        [id_to_label[pred] for (pred, label) in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    return {
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=True),
    }

# 9. Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model_perf_key",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    eval_strategy="epoch",  # <- fixed typo
    save_strategy="epoch",
    report_to="none"
)

# 10. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train
trainer.train()


In [None]:
for i in range(10):
    print(len(tokenized_dataset[0]['labels']), len(tokenized_dataset[0]['input_ids']))

In [None]:
len(tokenized_dataset)

In [None]:
all(isinstance(x, int) for x in tokenized_dataset[0]['labels'])

In [None]:
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification, # This will handle padding
)
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score

# 1. Load Dataset from JSON
dataset = load_dataset("json", data_files="ner_performance_metrics_dataset.json")["train"]

# 2. Extract unique labels and create mappings
label_list = sorted(list(set(label for example in dataset for label in example["ner_tags"])))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# 3. Map NER tags to label IDs
def encode_labels(example):
    example["label_ids"] = [label_to_id[label] for label in example["ner_tags"]]
    return example

dataset = dataset.map(encode_labels)

# 4. Load tokenizer and model checkpoint
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# 5. Tokenization and alignment
label_all_tokens = True # Whether to label all sub-tokens or only the first one

def tokenize_and_align_labels(examples):
    # Do NOT apply padding="max_length" here. Let the DataCollator handle it.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        #truncation=True, # Truncate if longer than max_length
        is_split_into_words=True,
        padding=True,
        #max_length=128, # Set max_length for truncation
        # padding=False or padding='do_not_pad' is default behavior without 'padding=True'
    )

    labels = []
    for batch_index, label_ids_example in enumerate(examples["label_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_idx = None
        current_labels = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special token (CLS, SEP, PAD) or token not associated with a word
                current_labels.append(-100)
            elif word_idx != previous_word_idx:
                # Start of a new word, label with the corresponding word's label
                current_labels.append(label_ids_example[word_idx])
            else:
                # Continuation of a word (subword token)
                # Label according to label_all_tokens
                current_labels.append(label_ids_example[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        
        labels.append(current_labels)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply this function to your dataset
# Using batched=True is efficient and allows processing multiple examples at once
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Verification prints (good for debugging)
print("Labels for the first example after tokenization and alignment (before DataCollator):")
print(tokenized_dataset[0]['labels'])
print("Type of labels for the first example:", type(tokenized_dataset[0]['labels']))
print("Length of input_ids for the first example:", len(tokenized_dataset[0]['input_ids']))
print("Length of labels for the first example:", len(tokenized_dataset[0]['labels']))


# 6. Split dataset
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print("\nKeys in the first training example:")
print(train_dataset[0].keys())

# 7. Data collator
# This is where padding happens now.
# By default, DataCollatorForTokenClassification will pad to the longest sequence in the batch.
# If you want to force padding to max_length (e.g., 128), you can pass padding="max_length" here,
# but usually it's fine to let it pad dynamically per batch.
data_collator = DataCollatorForTokenClassification(tokenizer) 

# 8. Metric computation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (where label is -100)
    true_labels = [
        [id_to_label[label] for (pred, label) in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    true_predictions = [
        [id_to_label[pred] for (pred, label) in zip(prediction, label_seq) if label != -100]
        for prediction, label_seq in zip(predictions, labels)
    ]
    
    # Handle cases where true_labels or true_predictions might be empty
    if not any(true_labels) and not any(true_predictions):
        return {"f1": 1.0, "report": "No true labels or predictions found, returning 1.0 F1"} # Perfect score if nothing to predict
    elif not any(true_labels) or not any(true_predictions):
        return {"f1": 0.0, "report": "Missing true labels or predictions for F1 computation"} # Can't compute F1 if one is empty and other isn't
    
    return {
        "f1": f1_score(true_labels, true_predictions),
        "report": classification_report(true_labels, true_predictions, output_dict=True),
    }

# 9. Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model_perf_key",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

# 10. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # This is crucial
    compute_metrics=compute_metrics,
)

# 11. Train
print("\nStarting training...")
trainer.train()
print("Training complete.")

In [2]:
from datasets import DatasetDict

label_all_tokens = True

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",  # Ensure consistent length
        max_length=128,        # Adjust as needed
        return_tensors=None
    )

    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["label_ids"][word_idx])
        else:
            label_ids.append(example["label_ids"][word_idx] if label_all_tokens else -100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Tokenize and align
tokenized_dataset = dataset.map(tokenize_and_align_labels, remove_columns=dataset.column_names)

# Ensure splits are applied after tokenization
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [3]:
def validate_tokenized_dataset(dataset, max_len=128):
    for idx, example in enumerate(dataset):
        labels = example["labels"]
        input_ids = example["input_ids"]

        # Check 1: labels is a list of ints
        if not isinstance(labels, list) or not all(isinstance(x, int) for x in labels):
            print(f"Example {idx}: 'labels' is not a flat list of ints")
            return False

        # Check 2: input_ids is same length as labels
        if len(labels) != len(input_ids):
            print(f"Example {idx}: length mismatch - labels ({len(labels)}) vs input_ids ({len(input_ids)})")
            return False

        # Check 3: length does not exceed max token length
        if len(labels) > max_len or len(input_ids) > max_len:
            print(f"Example {idx}: length exceeds max_length ({max_len})")
            return False

    print("✅ All examples are correctly formatted.")
    return True

# Run validation
validate_tokenized_dataset(train_dataset)
validate_tokenized_dataset(eval_dataset)


✅ All examples are correctly formatted.
✅ All examples are correctly formatted.


True

In [6]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 8
})

In [14]:
for i in range(8):
    print(train_dataset[i]['input_ids'])
    print(train_dataset[i]['token_type_ids'])
    print(train_dataset[i]['attention_mask'])
    print(train_dataset[i]['labels'])
    print("\n\n")



[101, 1109, 2235, 3890, 1126, 22341, 2794, 1104, 121, 119, 5966, 1113, 1103, 9221, 1891, 1383, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
for i in range(2):
    print(len(eval_dataset[i]['input_ids']))
    print(len(eval_dataset[i]['token_type_ids']))
    print(len(eval_dataset[i]['attention_mask']))
    print(len(eval_dataset[i]['labels']))
    print("\n\n")

128
128
128
128



128
128
128
128





In [15]:
max_length = 128  # or any length suitable for your model

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        is_split_into_words=True
    )
    word_ids = tokenized_inputs.word_ids()
    label_ids = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["label_ids"][word_idx])
        else:
            label_ids.append(example["label_ids"][word_idx] if label_all_tokens else -100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)

In [26]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

# 1. Example data - Replace with your own dataset.  This is a small example.
examples = [
    {
        "tokens": ["John", "Smith", "went", "to", "New", "York", "."],
        "ner_tags": ["B-PER", "I-PER", "O", "O", "B-LOC", "I-LOC", "O"]
    },
    {
        "tokens": ["Apple", "Inc.", "is", "based", "in", "California", "."],
        "ner_tags": ["B-ORG", "I-ORG", "O", "O", "O", "B-LOC", "O"]
    },
    {
        "tokens": ["The", "meeting", "is", "on", "Monday", "."],
        "ner_tags": ["O", "O", "O", "O", "B-DATE", "O"]
    }
]

# 2. Create Hugging Face dataset
dataset = Dataset.from_list(examples)

# 3. Define the label list.  Crucially, get this from your data.
label_list = sorted(list(set(label for example in examples for label in example["ner_tags"])))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()} #important for metrics

# 4. Tokenizer and model.  Using bert-base-cased here.
model_checkpoint = "bert-base-cased"  # Change to "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# 5. Tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]])
        else:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]] if True else -100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 6. Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,  # Add id2label and label2id for correct metrics.
    label2id=label_to_id,
)

# 7. Training arguments
args = TrainingArguments(
    output_dir="./ner_bert_model",  # Changed output directory
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 8. Use seqeval metric
seqeval = evaluate.load("seqeval")


# 9. Define compute_metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# 10. Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train
trainer.train()


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.83683,0.090909,0.2,0.125,0.333333
2,No log,1.709494,0.333333,0.2,0.25,0.666667
3,No log,1.645328,0.5,0.2,0.285714,0.714286


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3, training_loss=1.8981281916300456, metrics={'train_runtime': 77.421, 'train_samples_per_second': 0.116, 'train_steps_per_second': 0.039, 'total_flos': 45933562080.0, 'train_loss': 1.8981281916300456, 'epoch': 3.0})

In [28]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

# 1. Load data from the JSON file
json_file_path = "ner_dataset.json"  # Path to your JSON file
dataset = load_dataset("json", data_files=json_file_path)["train"]

# 2. Define the label list.  Crucially, get this from the dataset.
label_list = sorted(list(set(label for example in dataset for label in example["ner_tags"])))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# 3. Tokenizer and model.  Using bert-base-cased here.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 4. Tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]])
        else:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]] if True else -100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 5. Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# 6. Training arguments
args = TrainingArguments(
    output_dir="./ner_bert_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 7. Use seqeval metric
seqeval = evaluate.load("seqeval")

# 8. Define compute_metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# 9. Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 10. Train
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.933368,0.0,0.0,0.0,0.441176
2,No log,1.757539,0.0,0.0,0.0,0.676471
3,No log,1.684891,0.0,0.0,0.0,0.647059


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=6, training_loss=2.0197776158650718, metrics={'train_runtime': 24.6398, 'train_samples_per_second': 0.609, 'train_steps_per_second': 0.244, 'total_flos': 75535873848.0, 'train_loss': 2.0197776158650718, 'epoch': 3.0})

In [34]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

# 1. Load data from the JSON file
json_file_path = "ner_performance_metrics_dataset.json"  # Path to your JSON file
dataset = load_dataset("json", data_files=json_file_path)["train"]

# 2. Define the label list.  Crucially, get this from the dataset.
label_list = sorted(list(set(label for example in dataset for label in example["ner_tags"])))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# 3. Tokenizer and model.  Using bert-base-cased here.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 4. Tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]])
        else:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]] if True else -100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)

# 5. Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# 6. Training arguments
args = TrainingArguments(
    output_dir="./ner_bert_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 7. Use seqeval metric
seqeval = evaluate.load("seqeval")

# 8. Define compute_metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# 9. Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 10. Train
trainer.train()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.64274,0.394366,0.491228,0.4375,0.585714
2,No log,1.398331,0.790698,0.596491,0.68,0.757143
3,No log,1.310027,0.868421,0.578947,0.694737,0.757143


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=9, training_loss=1.66794925265842, metrics={'train_runtime': 42.778, 'train_samples_per_second': 0.701, 'train_steps_per_second': 0.21, 'total_flos': 274578144324.0, 'train_loss': 1.66794925265842, 'epoch': 3.0})

In [38]:
# 11. Inference function
def predict_ner(text, model, tokenizer, id_to_label):
    """
    Predicts NER tags for a given text using the fine-tuned model.

    Args:
        text (str): The input text to predict NER tags for.
        model (AutoModelForTokenClassification): The fine-tuned NER model.
        tokenizer (AutoTokenizer): The tokenizer used for the model.
        id_to_label (dict): A dictionary mapping label IDs to label names.

    Returns:
        list: A list of tuples, where each tuple contains a word and its predicted NER tag.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")  # Tokenize and return PyTorch tensors

    # Get the model's predictions
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)  # Get the predicted label IDs

    # Convert the predicted IDs to label names
    predicted_labels = [id_to_label[t.item()] for t in predictions[0]]  # Use .item()

    # Get the input tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])  # Use [0] to get the first (and only) sequence

    # Filter out special tokens (e.g., [CLS], [SEP]) and align labels
    word_tokens = []
    aligned_labels = []
    for i, token in enumerate(tokens):
        if token not in tokenizer.special_tokens_map.values():
            word_tokens.append(token)
            aligned_labels.append(predicted_labels[i])
    
    return list(zip(word_tokens, aligned_labels))



# 12. Example usage of the inference function
import torch
new_text = "Accuracy was not so good"
ner_predictions = predict_ner(new_text, model, tokenizer, id_to_label)
print(f"\nNER Predictions for: '{new_text}'")
for word, label in ner_predictions:
    print(f"{word}: {label}")


NER Predictions for: 'Accuracy was not so good'
accuracy: I-MODEL
was: O
not: B-METRIC
so: B-METRIC
good: O


In [37]:
label_list, ner_predictions

(['B-METRIC', 'B-MODEL', 'B-VALUE', 'I-METRIC', 'I-MODEL', 'I-VALUE', 'O'],
 [('accuracy', 'I-MODEL'),
  ('was', 'O'),
  ('not', 'B-METRIC'),
  ('so', 'B-METRIC'),
  ('good', 'O')])