In [1]:
"""
準備
"""
import pandas as pd
import MeCab
from tqdm import tqdm as tq

import re
clean_text = re.compile('[!"#$＄%&\'\\\\()*+,-./:：;<=>?@[\\]^_`{|}~「」｣〔〕＜＞～“”〈〉『』【】＆＊✴・（）↑＄＃＠Ⓣ。、？…！｀＋￥％⚫︎⭕〇◯△◇☆✩☆✧･　※ ◎○▼▲■◆♦●★♡┗└┌✿ё◉♪♫♬]')

In [5]:
"""
rakutenのjsonデータから学習データの生成
"""
import glob
import json
import MeCab

files = glob.glob("../data/train_data/rakuten_json/*.json")

rakuten_scray_recipes = []

for file_url in files:
  json_open = open(file_url, 'r')
  tmp = json.load(json_open)
  rakuten_scray_recipes += tmp

print(len(rakuten_scray_recipes))

rakuten_train_text = ""
m = MeCab.Tagger("-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd -Owakati")

for recipe in tq(rakuten_scray_recipes,total=len(rakuten_scray_recipes)):
  tmp = ""
  tmp += recipe["title"].replace(" \n","").replace("レシピ・作り方","") + "\n"
  tmp += recipe["comment"].replace(" \n","") + "\n"
  tmp += "\n".join(recipe["step"]) + "\n"
  tmp += recipe["sub_comment"].replace(" \n","") + "\n"
  tmp += recipe["hint"].replace(" \n","") + "\n"
  tmp = re.sub(clean_text,"",tmp)
  rakuten_train_text += m.parse(tmp)

print(len(rakuten_train_text))

file = open("../data/train_data/rakuten_scray_train.txt",'w',encoding="utf-8")
file.write(rakuten_train_text)
file.close()

1380094


100%|██████████| 1380094/1380094 [04:25<00:00, 5197.89it/s]


445367645


In [2]:
"""
学習データの作成
"""
rakuten_recipes = pd.read_table("../data/train_data/recipe03_process_20160112.txt",names=["id","position","step"])
rakuten_recipes = rakuten_recipes.dropna(subset=["step"])
cleaned_data = rakuten_recipes["step"].apply(lambda data:re.sub(clean_text,"",data))
cleaned_data = pd.DataFrame(cleaned_data)
cleaned_data.head()

Unnamed: 0,step
0,栗を圧力鍋で８分蒸し半分に切って中身をスプーンで取り出す
1,厚手でぴっちり蓋の閉まるn鍋に移して水を加え沸騰したら弱火で２３分煮て火を止めて蓋をし３０分蒸らす
2,ポテトマッシャーでつぶし砂糖塩を加える
3,弱火にかけ鍋底に膜が付いたら火を止め少し生地で膜をふやかしてから剥がすという作業を繰り返して...
4,栗の色が白っぽくなったら味を決めてさらに少し練り鍋肌に生地を付けて乾燥させながら冷ます


In [3]:
cleaned_data["str_len"] = cleaned_data["step"].apply(lambda data:len(data))
output = cleaned_data[cleaned_data["str_len"] != 0]
output.describe()

Unnamed: 0,str_len
count,3031711.0
mean,36.17919
std,401.2472
min,1.0
25%,22.0
50%,33.0
75%,47.0
max,697976.0


In [4]:
# 合計語彙数
num = 0
# データの総文字数
str_len = output["str_len"].sum()
# 学習データ
corpas = ""

m = MeCab.Tagger("-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd -Owakati")

for step in tq(output["step"],total=len(output)):
    wakati = m.parse(step)
    num += len(wakati.split(' '))
    corpas += wakati

with open("../data/train_data/rakuten_step_corpas.txt",'w') as out_corpas:
    out_corpas.write(corpas)

print("平均語彙サイズ:",str_len/num)
print("平均語彙数:",num/len(output))

100%|██████████| 3031711/3031711 [01:07<00:00, 44658.76it/s]


平均語彙サイズ: 1.8180813600902384
平均語彙数: 19.899653034210715


In [None]:
from gensim.models import word2vec
import logging
sentences = word2vec.Text8Corpus('../data/train_data/rakuten_scray_train.txt')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = word2vec.Word2Vec(sentences, epochs=5,vector_size=600,sg=1,min_count=5, window=4, workers=8,hs=0)
model.wv.save_word2vec_format("../data/trained_data/rakuten_m1_v600_min5_w4.vec.pt", binary=True)