In [343]:
import re
import nltk
import tinysegmenter
import glob
import json
import numpy as np
import pandas as pd
from nltk.collocations import *

In [260]:
def colloc_filter(text):
    '''Tell whether a token or collocation should be discarded.

    Returns True (discard) when either condition holds:
      * ``text`` contains at least one character outside the accepted set:
        hiragana U+3041-U+3093, katakana U+30A1-U+30F3, kanji U+4E00-U+9FA5
        and the long-vowel mark U+30FC;
      * ``text`` is exactly one character long and that character is a
        hiragana or katakana letter from the ranges above.
    Returns False (keep) otherwise, including for the empty string.

    NOTE(review): the kana ranges end at U+3093 / U+30F3, so U+30F4-U+30F6
    (the katakana VU, small KA and small KE) count as "outside" and cause
    the text to be discarded -- confirm this is intended.
    '''
    # Any character that is not kana, kanji or the long-vowel mark.
    has_foreign_char = re.search("[^ぁ-んァ-ン一-龥ー]+", text) is not None
    # A lone kana letter (the long-vowel mark on its own does not count).
    is_lone_kana = len(text) == 1 and re.search("[ぁ-んァ-ン]", text) is not None
    return has_foreign_char or is_lone_kana

In [261]:
# Read every file matched by data/japanese/* and collect its lines; each line
# is treated as one recipe entry.
list_recipe = []

# glob.glob() returns paths in a filesystem-dependent order. Sorting makes the
# order of list_recipe -- and anything later that depends on the order in
# which tokens are first encountered -- the same on every run and machine.
all_files = sorted(glob.glob('data/japanese/*'))
for file in all_files:
    with open(file, 'r', encoding="utf-8") as f:
        contents = f.read()
    # The file is fully read at this point, so it is closed before splitting.
    list_recipe.extend(contents.split('\n'))

# One token list per recipe line, in the same order as list_recipe.
list_list_tokens = [tinysegmenter.tokenize(recipe) for recipe in list_recipe]

# Flatten all token lists, dropping every token for which colloc_filter
# returns True (its "remove" verdict).
tokens = [
    token
    for list_tokens in list_list_tokens
    for token in list_tokens
    if not colloc_filter(token)
]

In [262]:
# Frequency distribution of the filtered tokens (token -> occurrence count).
token_dist = nltk.FreqDist(tokens)

In [269]:
# Keep the 1500 most frequent tokens as keywords, most frequent first;
# most_common() yields (token, count) pairs and only the token is kept.
list_keywords = [keyword for keyword, _count in token_dist.most_common(1500)]

In [270]:
# Display the extracted keywords (ordered from most to least frequent).
list_keywords

['簡単',
 'サラダ',
 '焼き',
 '炒め',
 '味噌',
 'パスタ',
 '風',
 'トマト',
 '肉',
 '煮',
 'スープ',
 '野菜',
 '大根',
 '飯',
 '丼',
 '和風',
 'たっぷり',
 'キャベツ',
 '豚肉',
 '中華',
 '汁',
 'どん',
 '鍋',
 '炊き込み',
 '味',
 'カレー',
 '菜',
 '白菜',
 '醤油',
 '豆腐',
 'ツナ',
 '鮭',
 '美味しい',
 '鶏',
 '和え',
 '風味',
 '入り',
 'チーズ',
 '家',
 '揚げ',
 'ベーコン',
 '豚',
 '絶品',
 '卵',
 'アボカド',
 'はん',
 '生姜',
 '我',
 'レンジ',
 'そば',
 '草',
 'じゃ',
 '巻き',
 '鶏肉',
 'さっぱり',
 'ねぎ',
 'かぼちゃ',
 '海老',
 '分',
 'ピリ',
 '甘辛',
 'ほうれん',
 '寿司',
 '漬け',
 'さん',
 'バター',
 '茄子',
 'グラタン',
 'この',
 'ブロッコリー',
 '冷製',
 'もの',
 'ソース',
 '春',
 'ピーマン',
 'ヘルシー',
 'そうめん',
 'ない',
 '豚バラ',
 'ポテトサラダ',
 'だけ',
 '納豆',
 '人参',
 'にぎり',
 '牡蠣',
 'マヨ',
 '煮込',
 '豆乳',
 '煮物',
 'あんかけ',
 'なす',
 'イカ',
 '酢',
 '美味',
 'キムチ',
 '玉ねぎ',
 'たけ',
 '本格',
 'とろ',
 'フライパン',
 'チキン',
 '牛肉',
 'ポタージュ',
 '小松菜',
 '塩',
 '塩麹',
 'サーモン',
 'がい',
 '照り',
 '大葉',
 '鶏むね',
 'カルボナーラ',
 'ランチ',
 'ごぼう',
 'クリームパスタ',
 '超簡単',
 'あさり',
 '濃厚',
 '水菜',
 'アスパラ',
 'ハンバーグ',
 'しめじ',
 '柚子',
 'ペペロンチーノ',
 '作る',
 'もやし',
 'レタス',
 'オクラ',
 'エビ',
 'なし',
 'ニラ',
 '時短',
 'お弁当',
 

# Testing

In [392]:
# Switch for how a "rule" is built from an element: when True the element's
# 'properties' are used together with 'tag' and 'parents'; when False only
# 'tag' and 'parents' are used.
use_dom_properties = False

In [401]:
# Load the test page's elements from JSON. The cells that follow read the
# 'text', 'tag', 'parents' and (optionally) 'properties' keys of each entry.
with open('data/test/torikizoku.json','r',encoding = 'utf-8') as f:
    all_elems = json.load(f)

In [402]:
# Testing membership in the keyword list scans the whole list for every
# token; a set gives the same answers with constant-time lookups.
keyword_set = set(list_keywords)

for elem in all_elems:
    tokenized = tinysegmenter.tokenize(elem['text'])
    # score = fraction of the element's tokens that are known keywords.
    # An element whose text produces no tokens gets 0.0 instead of raising
    # ZeroDivisionError, so it is simply never treated as a candidate.
    if tokenized:
        score = sum(token in keyword_set for token in tokenized) / len(tokenized)
    else:
        score = 0.0
    elem['score'] = score

In [403]:
# Candidate menu items: elements with a non-zero keyword score whose text is
# shorter than 30 characters.
filtered_elems = [
    candidate
    for candidate in all_elems
    if not (candidate['score'] == 0 or len(candidate['text']) >= 30)
]

In [405]:
# Choose the "rule" (an element's tag + parents, optionally + properties)
# whose candidate elements have the highest total keyword score.
rule_fields = ('tag', 'parents', 'properties') if use_dom_properties else ('tag', 'parents')

# One row per candidate element: its rule (a dict) and its score.
df_rule_score = pd.DataFrame(
    [
        {'rule': {field: elem[field] for field in rule_fields}, 'score': elem['score']}
        for elem in filtered_elems
    ],
    columns=['rule', 'score'],
)
if df_rule_score.empty:
    # Fail with a clear message rather than an obscure error further down.
    raise ValueError('no candidate elements to derive a rule from (filtered_elems is empty)')

# The rules are dicts, and dicts are unhashable, so pandas cannot count or
# group them directly. Each rule is therefore serialised to a canonical JSON
# string (sorted keys) that serves as its hashable identity.
rule_keys = df_rule_score['rule'].map(
    lambda rule: json.dumps(rule, sort_keys=True, ensure_ascii=False)
)

# Distinct rules ordered from most to least frequent (value_counts sorts by
# count), so that a tie on total score resolves to the more frequent rule.
unique_keys = rule_keys.value_counts().index
list_unique_rules = [json.loads(key) for key in unique_keys]

# Total score per rule in a single grouped pass, aligned to the order above.
score_totals = df_rule_score['score'].groupby(rule_keys).sum().reindex(unique_keys)
max_idx = int(np.argmax(score_totals.to_numpy()))

# best_rule is a plain dict keyed by the entries of rule_fields.
best_rule = list_unique_rules[max_idx]

In [406]:
# Apply the winning rule to the whole page: keep the text of every element
# whose rule fields all equal those of best_rule. 'properties' takes part in
# the comparison only when use_dom_properties is set.
match_fields = ['tag', 'parents']
if use_dom_properties:
    match_fields.append('properties')

menu_extract = [
    elem['text']
    for elem in all_elems
    if all(elem[field] == best_rule[field] for field in match_fields)
]

In [407]:
# Display the texts of the elements that matched best_rule.
menu_extract

['トリキの唐揚',
 'ポテトフライ',
 'とり天',
 '北海道産和風ポテトさらだ',
 '粗挽ポークソーセージ串焼',
 'チキン南蛮',
 'カマンベールコロッケ',
 'ひざなんこつ唐揚',
 'ふんわり山芋の鉄板焼']

# Experimental

In [None]:
# Association-measure helpers for 2-, 3- and 4-grams; their .pmi scorers are
# what the collocation finders in this section are scored with.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
quadgram_measures = nltk.collocations.QuadgramAssocMeasures()

In [None]:
# Number of lines collected from the recipe files (this counts any empty
# strings produced by splitting on '\n' as well).
len(list_recipe)

In [None]:
# Flatten the recipe lines into one sequence for the collocation finders.
# NOTE(review): list_recipe holds plain strings, so iterating over each one
# yields single characters, not tokenizer output -- the n-grams found from
# `words` are therefore character n-grams. Confirm that this is intended.
words = [char for line in list_recipe for char in line]

In [None]:
# Print bigrams from `words` that occur at least 30 times, have a PMI score
# above 4, and are not rejected by colloc_filter.
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(30)
for ngram in finder.above_score(bigram_measures.pmi, 4):
    joined = ''.join(ngram)
    if not colloc_filter(joined):
        print(joined)

In [None]:
# Print trigrams from `words` that occur at least 30 times, have a PMI score
# above 4, and are not rejected by colloc_filter.
finder = TrigramCollocationFinder.from_words(words)
finder.apply_freq_filter(30)
for ngram in finder.above_score(trigram_measures.pmi, 4):
    joined = ''.join(ngram)
    if not colloc_filter(joined):
        print(joined)

In [None]:
# Print 4-grams from `words` that occur at least 30 times, have a PMI score
# above 4, and are not rejected by colloc_filter.
finder = QuadgramCollocationFinder.from_words(words)
finder.apply_freq_filter(30)
for ngram in finder.above_score(quadgram_measures.pmi, 4):
    joined = ''.join(ngram)
    if not colloc_filter(joined):
        print(joined)