In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import spacy
import subprocess
from nltk.stem import WordNetLemmatizer
import json
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt_tab')

In [2]:
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("downloading...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

In [3]:
file_path = '/Users/stephenzhang/Downloads/yelp.csv'
df = pd.read_csv(file_path)

# Extract dish names

In [4]:
def get_action_verbs(df, col_name, top_n_verbs=30):
    text_data_list = df[col_name].tolist()
    verbs = []
    for doc in nlp.pipe(text_data_list):
        for token in doc:
            if token.pos_ == "VERB" and token.dep_ in ("ROOT", "dobj"):
                verbs.append(token.lemma_)
        action_verbs = set([v for v, cnt in Counter(verbs).most_common(top_n_verbs)])
        
    return action_verbs

In [5]:

def get_menu_items(df, col_name, top_n_menu_items=100):
    text_data_list = df[col_name].tolist()
    menu_candidates = []
    for doc in nlp.pipe(text_data_list):
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) >= 2:  # 提取复合名词
                menu_candidates.append(chunk.text.lower())
    menu_items = set([item for item, cnt in Counter(menu_candidates).most_common(100)])
    return menu_items


In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [7]:
df["text_clean"] = df["text"].apply(clean_text)

In [8]:
ACTION_VERBS = get_action_verbs(df, 'text_clean', 20)
menu_items = get_action_verbs(df, 'text_clean', 20)

In [9]:
lemmatizer = WordNetLemmatizer()
def lemmatize_dish(dish):
    doc = nlp(dish)
    return " ".join([token.lemma_ for token in doc])

In [10]:
def extract_dishes_spacy(text):
    doc = nlp(text)
    dishes = []
    
    # 规则1：通过动词的直接宾语提取
    for token in doc:
        if token.text.lower() in ACTION_VERBS and token.dep_ == "ROOT":
            for child in token.children:
                if child.dep_ == "dobj":  # 直接宾语
                    dish = " ".join([w.lemma_ for w in child.subtree])
                    dishes.append(dish)
    
    # 规则2：名词短语（过滤短短语）
    for chunk in doc.noun_chunks:
        if len(chunk.text.split()) >= 2:
            dish = " ".join([w.lemma_ for w in chunk])
            dishes.append(dish)
    
    return list(set(dishes))

In [11]:
def filter_dishes_spacy(texts, min_freq=2):
    all_dishes = []
    for text in texts:
        dishes = extract_dishes_spacy(text)
        # 过滤停用词和通用词
        stop_words = {"food", "service", "restaurant", "place"}
        filtered = [
            dish for dish in dishes
            if not any(word in dish.split() for word in stop_words)
        ]
        all_dishes.extend(filtered)
    
    # 统计高频词
    counter = Counter(all_dishes)
    return [dish for dish, freq in counter.items() if freq >= min_freq]

# Execute

In [None]:
def apply_spacy_to_df(df, filter_validate_menue=False):    
    # 提取菜品并词形还原
    df["dishes_raw"] = df["text_clean"].apply(extract_dishes_spacy)
    df["dishes_lemmatized"] = df["dishes_raw"].apply(
        lambda dishes: [lemmatize_dish(dish) for dish in dishes]
    )
    if filter_validate_menue:
        df["valid_dishes"] = df["dishes_lemmatized"].apply(
            lambda dishes: [dish for dish in dishes if dish in menu_items]
        )
    
    return df

In [15]:

df_grouped = df.groupby('business_id').apply(apply_spacy_to_df)

  df_grouped = df.groupby('business_id').apply(apply_spacy_to_df)


In [16]:
df_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,text_clean,dishes_raw,dishes_lemmatized,valid_dishes
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
-0QBrNvhrPQCaeo7mTo0zQ,6256,-0QBrNvhrPQCaeo7mTo0zQ,2011-06-26,dx-uyV1hWZ8jQ5szxqeorg,4,Best shrimp burro's. If it wasn't in the hood ...,review,DghMl81bONCh5ELnrgn4Fw,0,0,0,best shrimp burros if it wasnt in the hood ma...,"[a couple tv, your mind, the place, the staff,...","[a couple tv, your mind, the place, the staff,...",[]
-0bUDim5OGuv8R0Qqq6J4A,240,-0bUDim5OGuv8R0Qqq6J4A,2012-01-13,ex4pODOWrfzx1k89FEE0Kg,3,"This place is busy, but the kids love the panc...",review,okcNd96gHHf_83wly93bpQ,0,0,1,this place is busy but the kids love the panca...,"[so much food, the pancake, the kid, the price...","[so much food, the pancake, the kid, the price...",[]
-1N0Z3uM8xbxKS8XiAnaog,3469,-1N0Z3uM8xbxKS8XiAnaog,2009-04-20,IvTP2fHcGOG_GkwLOhqh1g,1,"""The office space, layout, presentation gets 5...",review,IzMeF5f2043jgDjhzNeDbg,3,3,3,the office space layout presentation gets 5 st...,"[the office space layout presentation, their o...","[the office space layout presentation, their o...",[]
-34jE_5dujSWMIOBudQsiQ,6795,-34jE_5dujSWMIOBudQsiQ,2009-11-02,Ydc1XwgrFaEWKo0jR-hp1w,5,"Dr. Dairiki is terrific. She is professional, ...",review,Rr1q5BfuV9u6sJX98aOgCw,0,2,0,dr dairiki is terrific she is professional lis...,"[test result, professional listen, exactly wha...","[test result, professional listen, exactly wha...",[]
-3WVw1TNQbPBzaKCaQQ1AQ,657,-3WVw1TNQbPBzaKCaQQ1AQ,2007-01-20,5vaFU2g9t88ge4Fm_JnM2A,3,Had lunch here today after hearing all the col...,review,htC49ZwXiKNka5cp0GKBfQ,2,4,1,had lunch here today after hearing all the col...,"[the hot sour soup, pizzeria bianco, the pla...","[the hot sour soup, pizzeria bianco, the pl...",[]
