# In [1]:  (notebook cell marker left over from the export)
import spacy
import pandas as pd
import json
from spacy.matcher import PhraseMatcher
from datasets import Dataset

# Load the spaCy pipeline used both to lemmatize the animal names below and
# to tokenize the sentences that are annotated later on.
nlp = spacy.load("en_core_web_trf")

# Load dataset from a .csv file
data = pd.read_csv("animals_sentences_full.csv")

# Keep only the 'text' column. read_csv already returns a DataFrame, so no
# extra wrapping is needed; the explicit copy gives an independent frame that
# can safely receive new columns later.
df = data[['text']].copy()

# Animal classes to identify in the text
animal_classes = {'beaver', 'dolphin', 'otter', 'seal', 'fox', 'spider', 'elephant', 'bear', 'rabbit', 'tiger'}

# Initialize the PhraseMatcher for custom entity detection.
# NOTE(review): no `attr` is given, so matching uses the matcher's default
# attribute (the verbatim token text). Inflected forms such as "foxes" are
# therefore not matched by the singular patterns built below.
matcher = PhraseMatcher(nlp.vocab)

# Build one pattern Doc per animal from the lemmas of its lowercased name.
# Sorting only makes the pattern order reproducible between runs.
patterns = []
for animal in sorted(animal_classes):
    lemmatized = " ".join(token.lemma_ for token in nlp(animal.lower()))
    patterns.append(nlp.make_doc(lemmatized))  # tokenize only; no full pipeline run

# Register every pattern under a single key, passing the Docs as one list
# (the documented PhraseMatcher.add signature).
matcher.add("ANIMALS", patterns)

def extract_and_label_entities(text):
    """Tag every token of *text* with a BIO-style animal label.

    The text is lowercased, processed with the module-level ``nlp`` pipeline
    and scanned with the module-level ``matcher``.

    Args:
        text: Sentence to annotate.

    Returns:
        A list of ``(token_text, label)`` tuples, one per token of the
        lowercased text. ``label`` is ``"B-ANIMAL"`` for the first token of a
        match, ``"I-ANIMAL"`` for any further token of the same match, and
        ``"O"`` for tokens outside every match.
    """
    # The matcher patterns are built from lowercased names, so the text is
    # lowercased as well before tokenization.
    doc = nlp(text.lower())

    tags = ["O"] * len(doc)
    for _match_id, start, end in matcher(doc):
        for position in range(start, end):
            # If matches overlap, the match reported first keeps the token.
            if tags[position] == "O":
                tags[position] = "B-ANIMAL" if position == start else "I-ANIMAL"

    return [(token.text, tag) for token, tag in zip(doc, tags)]

# Report how many sentences are about to be annotated
print(len(df["text"]))

# Annotate every sentence; each cell of the new column holds the list of
# (token, label) tuples returned by extract_and_label_entities
df["entities"] = df["text"].apply(extract_and_label_entities)

# Save the annotated data to a CSV file
df.to_csv("annotated_animals_with_labels_fix.csv", index=False, encoding="utf-8")

# The check mark was stored as mis-decoded UTF-8 bytes; restored here.
print("✅ Annotation completed! File saved as 'annotated_animals_with_labels_fix.csv'.")

def create_tokenized_data(text, entities):
    """Project token-level entity labels onto the whitespace words of *text*.

    ``entities`` is walked in order and each token is located, ignoring case,
    at or after the position where the previous token ended. This keeps
    repeated words apart and lets a word with attached punctuation
    (``"fox,"``) inherit the label of the token it contains (``"fox"``).

    Args:
        text: The original sentence; it is split on whitespace.
        entities: Iterable of ``(token_text, label)`` pairs in sentence
            order. It may cover every token of the sentence or only the
            labelled ones.

    Returns:
        ``{'tokens': words, 'labels': labels}`` where ``words`` are the
        whitespace-separated words of ``text`` (original casing) and
        ``labels`` holds one label per word; ``'O'`` marks a non-entity.
    """
    words = text.split()
    labels = ['O'] * len(words)
    lowered = [word.lower() for word in words]

    word_idx = 0  # word currently being consumed
    char_pos = 0  # first not-yet-consumed character inside that word

    for token, label in entities:
        needle = token.strip().lower()
        if not needle:
            continue  # whitespace-only tokens never belong to a word

        # Look for the token in the rest of the current word first, then in
        # the following words from their beginning.
        hit_idx = hit_pos = -1
        search_from = char_pos
        for idx in range(word_idx, len(words)):
            found = lowered[idx].find(needle, search_from)
            if found != -1:
                hit_idx, hit_pos = idx, found
                break
            search_from = 0
        if hit_idx == -1:
            continue  # token cannot be aligned; keep the cursor where it is

        # A word made of several tokens keeps its first entity label, so
        # trailing punctuation tagged 'O' cannot erase it.
        if label != 'O' and labels[hit_idx] == 'O':
            labels[hit_idx] = label

        # Advance the cursor just past the token that was placed.
        word_idx, char_pos = hit_idx, hit_pos + len(needle)
        if char_pos >= len(lowered[word_idx]):
            word_idx, char_pos = word_idx + 1, 0

    return {'tokens': words, 'labels': labels}

# Build one {'tokens': ..., 'labels': ...} record per row. Only two columns
# are needed, so they are zipped directly instead of going through iterrows().
train_data = [
    create_tokenized_data(text, entities)
    for text, entities in zip(df['text'], df['entities'])
]

# Convert the processed data into a Hugging Face Dataset
dataset = Dataset.from_list(train_data)

# Convert dataset into a DataFrame for verification
df_from_dataset = pd.DataFrame(dataset)

# Save the dataset as a CSV file
df_from_dataset.to_csv('./dataset.csv', index=False)

# Display the first entry of the dataset. A bare expression is only echoed
# when it is the last statement of a notebook cell, so print it explicitly;
# the guard avoids an IndexError when there are no rows.
if len(dataset) > 0:
    print(dataset[0])





# Flatten the per-row label lists of the dataset into one list
all_labels = [label for row in dataset for label in row['labels']]

# Collapse to the distinct label values
unique_labels = set(all_labels)

# Report the distinct labels and how many there are
print("Unique labels:", unique_labels)
print("Number of unique labels:", len(unique_labels))

# Sanity check: report whether exactly 3 distinct labels were found
print(
    "You have exactly 3 unique labels."
    if len(unique_labels) == 3
    else "Warning! The number of unique labels is not 3."
)

# KeyboardInterrupt:  (leftover notebook output, not code)