In [1]:
import numpy as np
import pandas as pd
import pickle
from gensim.models import word2vec
import MeCab
import re


# Module-level MeCab tagger used by tokenize_ja. It is configured with the rc
# file /etc/mecabrc (-r) and the mecab-ipadic-neologd dictionary directory (-d).
# NOTE(review): both are absolute, machine-specific paths — adjust per install.
tagger = MeCab.Tagger(" -r /etc/mecabrc -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")
# Alternative configuration kept for reference (Homebrew dictionary path,
# -Owakati output format):
# tagger = MeCab.Tagger("-Owakati -d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd")

def tokenize_ja(text, lower):
    """Yield the surface forms of nouns and adjectives found in ``text``.

    The text is coerced with ``str()`` and parsed by the module-level MeCab
    ``tagger``; a node is kept when the first comma-separated field of its
    feature string is "名詞" (noun) or "形容詞" (adjective).

    Args:
        text: Input to tokenize (anything accepted by ``str()``).
        lower: If true, yield each surface lower-cased; otherwise yield it
            unchanged.

    Yields:
        str: One token per matching node, in parse order.
    """
    node = tagger.parseToNode(str(text))
    while node:
        # Part-of-speech filter. This must not depend on ``lower``: the flag
        # used to be AND-ed into this condition, so lower=False produced no
        # tokens at all.
        if node.feature.split(',')[0] in ("名詞", "形容詞"):
            surface = node.surface
            yield surface.lower() if lower else surface
        node = node.next


def tokenize(content, token_min_len, token_max_len, lower):
    """Tokenize ``content`` with ``tokenize_ja`` and filter the tokens.

    A token is kept when its length lies in the inclusive range
    ``[token_min_len, token_max_len]`` and it does not start with an
    underscore.

    Args:
        content: Text handed to ``tokenize_ja``.
        token_min_len: Minimum token length (inclusive).
        token_max_len: Maximum token length (inclusive).
        lower: Passed through to ``tokenize_ja``.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    kept = []
    for word in tokenize_ja(content, lower):
        size = len(word)
        if size < token_min_len or size > token_max_len:
            continue
        if word.startswith('_'):
            continue
        kept.append(str(word))
    return kept


# Load the training data: read the review and store CSVs and join them on "ID".
df = pd.read_csv('./data/ramen_review.csv')
df2 = pd.read_csv('./data/ramen_store.csv')
df = df.merge(df2, on="ID")

# Collapse to one row per (ID, name, score, review_count) group, with all of
# the group's reviews joined into a single space-separated string, ordered by ID.
df_ramen = (
    df.groupby(['ID', 'name', 'score', 'review_count'])['review']
    .apply(list)
    .apply(' '.join)
    .reset_index()
    .sort_values('ID')
)



In [2]:
# Write the name and concatenated-review columns to confirm.csv
# (presumably for a manual check of the merge/grouping result).
df_ramen[['name', 'review']].to_csv('confirm.csv')

In [3]:
# コーパス作成
def preprocessing_text(text):
    """Normalize a review string before tokenization.

    Steps, in order:
      1. Remove carriage returns, line feeds, full-width spaces (U+3000) and
         half-width spaces.
      2. Lower-case the text.
      3. Replace every digit run (optionally containing one comma, e.g.
         "1,200") with a single "0".

    Args:
        text (str): Raw review text.

    Returns:
        str: The normalized text.
    """
    # One pass over a character class removes the same four characters that
    # were previously stripped by four separate substitutions.
    text = re.sub('[\r\n\u3000 ]', '', text)
    text = text.lower()
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    # NOTE(review): only one comma is absorbed per match, so "1,000,000"
    # becomes "0,0" rather than "0" — unchanged from the previous behavior.
    text = re.sub(r'\d+,?\d*', '0', text)
    return text
                  
# Build the corpus: one token list per row of df_ramen['review'], keeping
# tokens of length 2..10000 and lower-casing them.
wakati_ramen_text = [
    tokenize(preprocessing_text(review), 2, 10000, True)
    for review in df_ramen['review']
]

# Save the corpus once (the write used to be issued twice with identical
# arguments). The token lists have different lengths, and handing such a
# ragged list to np.savetxt only worked with a deprecation warning on older
# NumPy and fails on newer releases, so the file is written directly instead.
# Each line is the Python list repr of one token list, which is believed to
# match what the earlier np.savetxt call emitted for this data.
with open("./data/ramen_corpus.txt", "w", encoding="UTF-8") as corpus_file:
    for tokens in wakati_ramen_text:
        corpus_file.write(f"{tokens}\n")


  return array(a, dtype, copy=False, order=order)


In [4]:
# Train the model.
# Parameters: sg (0: CBOW, 1: skip-gram), vector_size (dimensionality of the
# word vectors), window (number of surrounding words used for training),
# min_count (words appearing fewer than n times are discarded).
# The number of training iterations is not passed, so the library default applies.
word2vec_ramen_model = word2vec.Word2Vec(
    wakati_ramen_text,
    sg=1,
    vector_size=100,
    window=5,
    min_count=5,
    workers=3,
)

# Save the model.
word2vec_ramen_model.save("./model/word2vec_ramen_model.model")

In [5]:
# Load the model (ver1) — the file saved by the training cell above —
# and list the words most similar to "二郎" as (word, similarity) pairs.
word2vec_ramen_model =word2vec.Word2Vec.load("./model/word2vec_ramen_model.model")
word2vec_ramen_model.wv.most_similar("二郎")

[('ラーメン二郎', 0.8203260898590088),
 ('上野毛', 0.7678502202033997),
 ('野猿', 0.751919150352478),
 ('ヤサイ', 0.7494155764579773),
 ('小岩', 0.7356836795806885),
 ('野猿街道', 0.7286577820777893),
 ('千住大橋', 0.7243333458900452),
 ('二郎系', 0.7242286801338196),
 ('デロ', 0.7236515879631042),
 ('乳化', 0.7102108597755432)]

In [6]:
# Load the model (ver2). This rebinds word2vec_ramen_model, so the queries
# below run against ver2 rather than the model trained above.
# NOTE(review): no cell visible here writes word2vec_ramen_model_v2.model —
# confirm where this file comes from so the notebook can be re-run from scratch.
word2vec_ramen_model =word2vec.Word2Vec.load("./model/word2vec_ramen_model_v2.model")

In [6]:
# Words most similar to "二郎" in the currently loaded model.
word2vec_ramen_model.wv.most_similar("二郎")

[('ラーメン二郎', 0.8167045712471008),
 ('ヤサイ', 0.7779735922813416),
 ('直系', 0.7680529356002808),
 ('野猿', 0.761073887348175),
 ('千住大橋', 0.754515528678894),
 ('桜台', 0.7495788931846619),
 ('上野毛', 0.7495309710502625),
 ('三田', 0.7449679374694824),
 ('小岩', 0.7415353655815125),
 ('二郎系', 0.7391459345817566)]

In [7]:
# Words most similar to "山岸" in the currently loaded model.
word2vec_ramen_model.wv.most_similar("山岸")

[('松本', 0.9421358108520508),
 ('ミュージシャン', 0.9397774338722229),
 ('泉谷しげる', 0.9395560026168823),
 ('北海道出身', 0.9388940334320068),
 ('小川', 0.9361956119537354),
 ('郡山', 0.9358366131782532),
 ('孫弟子', 0.9341264963150024),
 ('初代', 0.9333155751228333),
 ('小野里', 0.9324114322662354),
 ('鵠沼海岸', 0.9294072389602661)]

In [8]:
# Words most similar to the combination of the two positive terms
# "ラーメン" and "北海道".
word2vec_ramen_model.wv.most_similar(positive=["ラーメン", "北海道"])

[('龍上海', 0.822999894618988),
 ('筑後', 0.8091440796852112),
 ('旭川', 0.8010036945343018),
 ('和歌山ラーメン', 0.8008827567100525),
 ('頂点', 0.7997747659683228),
 ('確立', 0.7990833520889282),
 ('久しく', 0.7964422702789307),
 ('育ち', 0.7900978922843933),
 ('order', 0.789796769618988),
 ('なんつッ亭', 0.7885482311248779)]