In [2]:
"""
食材を読み込んで簡単な前処理をする。

"""
import pandas as pd
import MeCab
from tqdm import tqdm as progress

# 食材名データの読み込み
data = pd.read_csv("../data/fixed_recommend_ingredients.csv")
# 変換表の読み込み
exchange = pd.read_csv("../data/exchange_before.csv",names=["id","name","plus","unit","g"])
# カナに統一した変換表の読み込み
exchange_kana = pd.read_csv("../data/exchanged_map.csv")

# 必要な情報のみ抽出
exchange_kana = exchange_kana[["id","name"]]
data = data[["id","name"]]

data = data.head(1000)

m = MeCab.Tagger("-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")

#　空欄のデータは削除
print("生データの総数",len(data))
data = data.dropna(subset=["name"])
print("削除後",len(data))
data["result"] = "empty"

  data = pd.read_csv("../data/fixed_recommend_ingredients.csv")


生データの総数 1000
削除後 1000


In [4]:
"""
食材名から名詞部分を抜き出す関数
名詞が一つの場合はその読みを返す。
入力が文字列でない場合はNoneを返す
複数の名詞がある場合には区切り文字を;にして連結する
名詞ではあるが、,を区切り文字とした場合に辞書の8番目に読みがない場合は
先頭に?をつけて返している。
名詞がない場合は引数の先頭に#を付けて返す
"""

def pick_ingredients(words,m):
  # 分かち書きの正常結果を格納
  tmp_ingredients = []

  # 値がNoneのものを除外
  if type(words) != str:
    return None
  
  parsed_text = m.parseToNode(words)
  while parsed_text:
    node = parsed_text.feature.split(',')
    # 正常な名詞のみ抽出
    if node[0] == "名詞" and node[2] !="組織":
      try:
        tmp_ingredients.append(node[6])
      except IndexError:
        tmp_ingredients.append( "?" + parsed_text.surface)
    else:
      pass
    parsed_text = parsed_text.next
  
  length = len(tmp_ingredients)
  if length == 1:
    return tmp_ingredients[0]
  elif length >= 2:
    return ';'.join(tmp_ingredients)
  else:
    return '#'+words


# 進捗を確認
progress.pandas()
# 関数を適応
data['wakati'] = data['name'].progress_apply(pick_ingredients,m=m)

100%|██████████| 1000/1000 [00:00<00:00, 52735.99it/s]


In [7]:
"""
step1,2
"""

def exchange_map_ingre(ingredients,exchange):
  # 食材名がない場合
  if ingredients == None:
    return ingredients
  elif '#' in ingredients:
    return ingredients

  ingre = pd.DataFrame([])

  # 複数の食材名候補がある場合
  if ";" in ingredients:
    tmp_ingredients = ingredients.split(";")
    for tmp in tmp_ingredients:
      if "?" in tmp:
        pass
      else:
        # バグを発見
        match_ingre = exchange[exchange["name"] == tmp]
        match_ingre = match_ingre[~match_ingre.duplicated(subset='name')]
        ingre = pd.concat([ingre,match_ingre])
  else:
    # 食材名が一つの時
    if "?" in ingredients:
      return ingredients
    ingre = exchange[exchange["name"] == ingredients]
    ingre = ingre[~ingre.duplicated(subset='name')]
  
  # マッチしてないときのサポート
  if ingre.empty:
    return "empty"

  return ';'.join(map(str,ingre["id"].tolist()))

In [None]:
"""
step1
"""
step1 = data
step1["result"] = data["wakati"].progress_apply(exchange_map_ingre, exchange=exchange)
print(len(step1[~step1["result"].str.contains("empty")]))
step1[step1["result"].str.contains("empty")].head(50)

In [8]:
step2 = step1[step1["result"].str.contains("empty")]
step2["result"] =  step2["name"].progress_apply(exchange_map_ingre, exchange=exchange_kana)
print(len(step2[~step2["result"].str.contains("empty")]))
step2[step2["result"].str.contains("empty")].head(50)

100%|██████████| 359/359 [00:00<00:00, 926.22it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  step2["result"] =  step2["name"].progress_apply(exchange_map_ingre, exchange=exchange_kana)


25


Unnamed: 0,id,name,result,wakati
3,ad7d585b06850f8437ff5fb97d3c7a823ff21bb1,豚肉,empty,豚肉
10,4afce5687dc173ad4fef943b686582a1cd06e264,豚肉,empty,豚肉
11,4afce5687dc173ad4fef943b686582a1cd06e264,おろしだれ,empty,だれ
15,030833ed4e8dab3aa1e9d75edc1681efb368434f,チキンブイヨン,empty,チキンブイヨン
17,12d462eae4db630ea5280380ebc99da3318080ae,よもぎ,empty,よもぎ
20,12d462eae4db630ea5280380ebc99da3318080ae,熱湯,empty,熱湯
23,d05671227be64db32f383ba61af225c354b0167f,熱湯,empty,熱湯
26,10ba7960f7fbb088ec7f3b79467a2db269662f4c,グランマニエ,empty,グランマニエ
34,8f95726c9cf6c0fc23b38b8a858b0318d631b770,サラダオイル,empty,サラダオイル
36,8f95726c9cf6c0fc23b38b8a858b0318d631b770,ドライイースト,empty,ドライイースト


In [9]:
"""
step3
変換先をidじゃなくて食材名に変更
"""

def exchange_map_ingre(ingredients,exchange):
  # 食材名がない場合
  if ingredients == None:
    return ingredients
  elif '#' in ingredients:
    return ingredients

  ingre = pd.DataFrame([])

  # 複数の食材名候補がある場合
  if ";" in ingredients:
    tmp_ingredients = ingredients.split(";")
    for tmp in tmp_ingredients:
      if "?" in tmp:
        pass
      else:
        # バグを発見
        match_ingre = exchange[exchange["name"] == tmp]
        match_ingre = match_ingre[~match_ingre.duplicated(subset='name')]
        ingre = pd.concat([ingre,match_ingre])
  else:
    # 食材名が一つの時
    if "?" in ingredients:
      return ingredients
    ingre = exchange[exchange["name"] == ingredients]
    ingre = ingre[~ingre.duplicated(subset='name')]
  
  # マッチしてないときのサポート
  if ingre.empty:
    return "empty"
  
  # step2との変更箇所
  return ';'.join(map(str,ingre["result"].tolist()))

# 正解データセットをインポート
correct_data = pd.read_excel("../data/correct_data_set.xlsx")

In [10]:
step3 = step2[step2["result"].str.contains("empty")]
step3["result"] = step3["wakati"].progress_apply(exchange_map_ingre, exchange=correct_data)
print(len(step3[~step3["result"].str.contains("empty")]))
step3[step3["result"].str.contains("empty")].head(50)

100%|██████████| 334/334 [00:00<00:00, 553.03it/s]

186



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  step3["result"] = step3["wakati"].progress_apply(exchange_map_ingre, exchange=correct_data)


Unnamed: 0,id,name,result,wakati
11,4afce5687dc173ad4fef943b686582a1cd06e264,おろしだれ,empty,だれ
17,12d462eae4db630ea5280380ebc99da3318080ae,よもぎ,empty,よもぎ
20,12d462eae4db630ea5280380ebc99da3318080ae,熱湯,empty,熱湯
23,d05671227be64db32f383ba61af225c354b0167f,熱湯,empty,熱湯
26,10ba7960f7fbb088ec7f3b79467a2db269662f4c,グランマニエ,empty,グランマニエ
40,4e4f3b5b64b915b67c7deca5dd182a01b0d89f9f,小豆,empty,小豆
43,4e4f3b5b64b915b67c7deca5dd182a01b0d89f9f,布巾,empty,布巾
63,ed4f81e0a05c77c078bd0b6a1c687f41452f2116,だし汁,empty,だし汁
71,bb5e7bee5031d7640e236a6fd3e11ed88a181f29,だし汁,empty,だし汁
83,71f57f9b806fa0ed849be7178769a27479aa4fbe,ごまあぶら,empty,まあ;ブラ
