In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import spacy
import subprocess
from nltk.stem import WordNetLemmatizer
import json
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt_tab')

In [2]:
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("downloading...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

In [3]:
file_path = '/Users/stephenzhang/Downloads/yelp.csv'
df = pd.read_csv(file_path)

# Extract dish names

In [4]:
def get_action_verbs(df, col_name, top_n_verbs=30):
    text_data_list = df[col_name].tolist()
    verbs = []
    for doc in nlp.pipe(text_data_list):
        for token in doc:
            if token.pos_ == "VERB" and token.dep_ in ("ROOT", "dobj"):
                verbs.append(token.lemma_)
        action_verbs = set([v for v, cnt in Counter(verbs).most_common(top_n_verbs)])
        
    return action_verbs

In [5]:

def get_menu_items(df, col_name, top_n_menu_items=100):
    text_data_list = df[col_name].tolist()
    menu_candidates = []
    for doc in nlp.pipe(text_data_list):
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) >= 2:  # 提取复合名词
                menu_candidates.append(chunk.text.lower())
    menu_items = set([item for item, cnt in Counter(menu_candidates).most_common(100)])
    return menu_items


In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [7]:
df["text_clean"] = df["text"].apply(clean_text)

In [8]:
ACTION_VERBS = get_action_verbs(df, 'text_clean', 20)
menu_items = get_action_verbs(df, 'text_clean', 20)

In [9]:
lemmatizer = WordNetLemmatizer()
def lemmatize_dish(dish):
    doc = nlp(dish)
    return " ".join([token.lemma_ for token in doc])

In [10]:
def extract_dishes_spacy(text):
    doc = nlp(text)
    dishes = []
    
    # 规则1：通过动词的直接宾语提取
    for token in doc:
        if token.text.lower() in ACTION_VERBS and token.dep_ == "ROOT":
            for child in token.children:
                if child.dep_ == "dobj":  # 直接宾语
                    dish = " ".join([w.lemma_ for w in child.subtree])
                    dishes.append(dish)
    
    # 规则2：名词短语（过滤短短语）
    for chunk in doc.noun_chunks:
        if len(chunk.text.split()) >= 2:
            dish = " ".join([w.lemma_ for w in chunk])
            dishes.append(dish)
    
    return list(set(dishes))

In [11]:
def filter_dishes_spacy(texts, min_freq=2):
    all_dishes = []
    for text in texts:
        dishes = extract_dishes_spacy(text)
        # 过滤停用词和通用词
        stop_words = {"food", "service", "restaurant", "place"}
        filtered = [
            dish for dish in dishes
            if not any(word in dish.split() for word in stop_words)
        ]
        all_dishes.extend(filtered)
    
    # 统计高频词
    counter = Counter(all_dishes)
    return [dish for dish, freq in counter.items() if freq >= min_freq]

# Execute

In [None]:
def apply_spacy_to_df(df, filter_validate_menue=False):    
    # 提取菜品并词形还原
    df["dishes_raw"] = df["text_clean"].apply(extract_dishes_spacy)
    df["dishes_lemmatized"] = df["dishes_raw"].apply(
        lambda dishes: [lemmatize_dish(dish) for dish in dishes]
    )
    if filter_validate_menue:
        df["valid_dishes"] = df["dishes_lemmatized"].apply(
            lambda dishes: [dish for dish in dishes if dish in menu_items]
        )
    
    return df

In [13]:

df = apply_spacy_to_df(df)

In [14]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,text_clean,dishes_raw,dishes_lemmatized,valid_dishes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0,my wife took me here on my birthday for breakf...,"[a favor, the place, an absolute pleasure, our...","[a favor, the place, an absolute pleasure, our...",[]
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,i have no idea why some people give bad review...,"[all these bad reviewer, the beef pizza, many ...","[all these bad reviewer, the beef pizza, many ...",[]
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0,love the gyro plate rice is so good and i also...,"[their candy selection, the gyro plate rice]","[their candy selection, the gyro plate rice]",[]
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0,rosie dakota and i love chaparral dog park its...,"[trash can, the dog, a lake, the park, a wonde...","[trash can, the dog, a lake, the park, a wonde...",[]
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,general manager scott petello is a good egg no...,"[his awesome staff, a customer, your case, a g...","[his awesome staff, a customer, your case, a g...",[]
