In [1]:
import spacy
import pandas as pd
from textblob import TextBlob
from transformers import pipeline
import numpy as np
from spacy.matcher import PhraseMatcher

  return torch._C._cuda_getDeviceCount() > 0
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the prompt dataset and load the small English spaCy pipeline.
df = pd.read_csv("../dataset/user-prompts.csv")
nlp = spacy.load("en_core_web_sm")

# Prompt sentences as an array, plus the distinct values of the two label columns.
texts = df["text"].to_numpy()
species, allergies = (df[column].unique() for column in ("species", "allergies"))

In [3]:
# Parse the first prompt and print every token tagged as a noun.
doc = nlp(texts[0])

for token in doc:
    if token.pos_ != 'NOUN':
        continue
    print(token.text, '->', token.pos_)

# Render the dependency parse of the same sentence inside the notebook.
spacy.displacy.render(doc, style='dep', jupyter=True)

guinea -> NOUN
pig -> NOUN
spinach -> NOUN


In [5]:
# Lemmas treated as a negative cue when they occur near a food phrase.
negative_words = [
    "hate",
    "dislike",
    "awful",
    "terrible",
    "bad",
    "not",
]

# Lemmas treated as a positive cue when they occur near a food phrase.
positive_words = [
    "love",
    "like",
    "enjoy",
    "great",
    "awesome",
    "delicious",
]

def is_negated(token):
    """Return True when ``token`` signals negation or negative sentiment.

    A token counts if its dependency label is "neg" or if its lower-cased
    lemma appears in the notebook-level ``negative_words`` list.
    """
    # The dependency check comes first so the lexicon is only consulted when needed.
    return token.dep_ == "neg" or token.lemma_.lower() in negative_words

# Function to check if a word is positive sentiment
def is_positive_sentiment(token):
    """Return True when the token's lower-cased lemma is in ``positive_words``.

    ``positive_words`` is the notebook-level list defined in this cell.
    """
    lemma = token.lemma_.lower()
    return lemma in positive_words

# Function to extract food and apply sentiment rules
def extract_food_and_sentiment(text, labels, ps):
    """Find known food phrases in ``text`` and tag each with a rule-based sentiment.

    The text is parsed with the notebook-level ``nlp`` pipeline and every
    phrase in ``labels`` is looked up with a ``PhraseMatcher``. For each match
    a window spanning three tokens before the phrase, the phrase itself and
    three tokens after it is inspected with the one-argument helpers
    ``is_negated(token)`` and ``is_positive_sentiment(token)``.

    Negation takes priority over positive cues: "does not like spinach" is
    reported as "negative" even though "like" is a positive word. (Previously
    the positive check ran last and overwrote the negation.)

    Args:
        text: Sentence to analyse.
        labels: Iterable of phrase strings to search for.
        ps: Key under which the patterns are registered in the matcher.

    Returns:
        A list of ``(phrase, sentiment)`` tuples, one per match, where
        ``phrase`` is the matched text lower-cased and ``sentiment`` is one of
        "negative", "positive" or "neutral".
    """
    doc = nlp(text)

    # Initialize PhraseMatcher and add patterns for food items
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(label) for label in labels]
    # Legacy positional form: None fills the on_match callback slot.
    matcher.add(ps, None, *patterns)

    context_range = 3  # Look at 3 words before and after the food phrase

    detected_foods = []
    for _match_id, start, end in matcher(doc):
        # The window already contains the tokens directly adjacent to the
        # phrase, so no separate neighbour check is needed.
        window = doc[max(0, start - context_range):min(len(doc), end + context_range)]

        if any(is_negated(token) for token in window):
            sentiment = "negative"
        elif any(is_positive_sentiment(token) for token in window):
            sentiment = "positive"
        else:
            sentiment = "neutral"  # Default assumption is neutral sentiment

        # Add the detected food phrase and its sentiment to the list
        detected_foods.append((doc[start:end].text.lower(), sentiment))

    return detected_foods

# Smoke-test the extractor on the first prompt of the dataset,
# using the distinct allergy values as the phrases to look for.
text = texts[0]
print(text)
result = extract_food_and_sentiment(text=text, labels=allergies, ps="ALLERGIES")
print(result)

My guinea pig cannot eat spinach.
[('spinach', 'negative')]


In [52]:
# Negative cue lemmas; extends the earlier list with allergy-related terms.
negative_words = [
    "hate",
    "dislike",
    "awful",
    "terrible",
    "bad",
    "not",
    "sensitive",
    "allergic",
    "allergies",
]

# Positive cue lemmas.
positive_words = [
    "love",
    "like",
    "enjoy",
    "great",
    "awesome",
    "delicious",
]

def is_negated(token, negative_words):
    """Return True when ``token`` is a negation or a listed negative word.

    Args:
        token: Object exposing ``dep_`` and ``lemma_`` string attributes.
        negative_words: Collection of lower-case lemmas counted as negative.
    """
    if token.dep_ == "neg":
        return True
    return token.lemma_.lower() in negative_words

def is_positive_sentiment(token, positive_words):
    """Return True when the token's lower-cased lemma is in ``positive_words``.

    Args:
        token: Object exposing a ``lemma_`` string attribute.
        positive_words: Collection of lower-case lemmas counted as positive.
    """
    lemma = token.lemma_.lower()
    return lemma in positive_words

# Main function to extract species first, then allergens, and their sentiment
def extract_species_and_allergens(text, allergens, valid_species, category_name="ALLERGENS"):
    """Pair each allergen mentioned in ``text`` with the species named before it.

    The text is parsed with the notebook-level ``nlp`` pipeline. Allergens and
    species are located with two case-insensitive (``attr="LOWER"``) phrase
    matchers. Each distinct allergen phrase gets a sentiment from the tokens
    within three positions of it, using the notebook-level ``negative_words``
    and ``positive_words`` lists; negation wins over positive cues. When a
    phrase occurs several times, the last occurrence determines both its
    sentiment and its reported position.

    Args:
        text: Sentence to analyse.
        allergens: Iterable of allergen phrase strings.
        valid_species: Iterable of species phrase strings (may be multi-word).
        category_name: Key under which allergen patterns are registered.

    Returns:
        A list of 7-tuples
        ``(species_name, allergen_phrase, sentiment, species_start_idx,
        species_end_idx, allergen_start, allergen_end)``.
        Allergens with no species before them are omitted. The ``*_start``
        values are the character offset of the phrase's first token; the
        ``*_end`` values are the character offset at which the phrase's LAST
        token begins.
        NOTE(review): the end offsets are therefore not the end of the phrase
        (a one-word allergen reports start == end) -- confirm whether callers
        expect an exclusive end offset instead.
    """
    doc = nlp(text)

    # Initialize the PhraseMatcher for allergens
    allergen_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    allergen_patterns = [nlp.make_doc(allergen) for allergen in allergens]
    # Legacy positional form: None fills the on_match callback slot.
    allergen_matcher.add(category_name, None, *allergen_patterns)

    # Initialize the PhraseMatcher for species (valid species could be multi-word)
    species_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    species_patterns = [nlp.make_doc(species) for species in valid_species]
    species_matcher.add("SPECIES", None, *species_patterns)

    # (token start, token end, lower-cased name, first-token offset, last-token offset)
    species_positions = [
        (start, end, doc[start:end].text.lower(), doc[start].idx, doc[end - 1].idx)
        for _match_id, start, end in species_matcher(doc)
    ]

    context_range = 3

    # phrase -> (sentiment, token start, token end). Keeping the token span
    # next to the sentiment is the fix: the reported offsets used to come from
    # loop variables left over from a separate scan, so every allergen received
    # the offsets of the last allergen match in the document.
    allergen_info = {}
    for _match_id, tok_start, tok_end in allergen_matcher(doc):
        phrase = doc[tok_start:tok_end].text.lower()
        context = doc[max(0, tok_start - context_range):min(len(doc), tok_end + context_range)]

        if any(is_negated(token, negative_words) for token in context):
            sentiment = "negative"
        elif any(is_positive_sentiment(token, positive_words) for token in context):
            sentiment = "positive"
        else:
            sentiment = "neutral"

        # A repeated phrase keeps its first-seen ordering but takes the data
        # of its latest occurrence.
        allergen_info[phrase] = (sentiment, tok_start, tok_end)

    detected_species_allergens = []
    for phrase, (sentiment, tok_start, tok_end) in allergen_info.items():
        allergen_start_pos = doc[tok_start].idx

        # Find the closest species before the allergen.
        # Assumes species_positions is in document order, so the scan can stop
        # at the first species that is not before the allergen -- TODO confirm
        # the matcher's ordering guarantee.
        closest_species = None
        for _s_start, _s_end, candidate, cand_start_idx, cand_end_idx in species_positions:
            if cand_end_idx < allergen_start_pos:  # Species appears before allergen
                closest_species = (candidate, cand_start_idx, cand_end_idx)
            else:
                break  # No need to check further species

        if closest_species is not None:
            species_name, species_start_idx, species_end_idx = closest_species
            detected_species_allergens.append((
                species_name,
                phrase,
                sentiment,
                species_start_idx,
                species_end_idx,
                allergen_start_pos,
                doc[tok_end - 1].idx,
            ))

    return detected_species_allergens

# Matcher vocabularies as plain Python lists.
allergens = allergies.tolist()
valid_species = species.tolist()

# Draw one prompt at random and show it before analysing it.
text = np.random.choice(texts, 1)[0]
print(text)

result = extract_species_and_allergens(text, allergens, valid_species)

print("Species with Allergens and Sentiment (with positions):")
for item in result:
    (species_name, allergen, sentiment,
     species_start_idx, species_end_idx,
     allergen_start, allergen_end) = item
    print(f"Species: {species_name}, Allergen: {allergen}, Sentiment: {sentiment}, "
          f"Species position: ({species_start_idx}, {species_end_idx}), "
          f"Allergen position: ({allergen_start}, {allergen_end})")


My guinea pig cannot eat milk.
Species with Allergens and Sentiment (with positions):
Species: guinea pig, Allergen: milk, Sentiment: negative, Species position: (3, 10), Allergen position: (25, 25)


In [1]:
# Display the species and allergen vocabularies.
# Both names are bound only by the cell that calls extract_species_and_allergens,
# so that cell must have been run in the current kernel; on a fresh kernel this
# expression raises NameError.
valid_species, allergens

NameError: name 'valid_species' is not defined