In [1]:
# Install MeCab, its development headers and the UTF-8 IPA dictionary, then the pinned Python binding.
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  aptitude-common libcwidget4 libsigc++-2.0-0v5 libxapian30
Suggested packages:
  apt-xapian-index aptitude-doc-en | aptitude-doc debtags tasksel libcwidget-dev xapian-tools
The following NEW packages will be installed:
  aptitude aptitude-common libcwidget4 libsigc++-2.0-0v5 libxapian30
0 upgraded, 5 newly installed, 0 to remove and 45 not upgraded.
Need to get 3,838 kB of archives.
After this operation, 17.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 aptitude-common all 0.8.13-3ubuntu1 [1,719 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsigc++-2.0-0v5 amd64 2.10.4-2ubuntu3 [12.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcwidget4 amd64 0.5.18-5build1 [306 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libxapian30 amd64 1

In [2]:
import MeCab
import math
import glob
import os
import re
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pickle

In [3]:
# Shared MeCab tagger used by selectNVA().
# NOTE(review): the argument is a bare path with no `-d` option flag, and the install cell does not
# install a "neologd" dictionary — confirm which dictionary MeCab actually loads before changing this,
# because any saved vocabulary depends on the resulting tokenization.
tagger = MeCab.Tagger("/var/lib/mecab/dic/mecab-ipadic-neologd")

In [4]:
# Vocabulary: maps each word to its integer id. The reserved "_id" key holds the next id to assign.
word_dictionary = {"_id":0}
# Document frequency per word id; filled in by make_dt().
dt = {}
# One list of word ids per document; appended to by add_to_files().
files = []

In [5]:
def selectNVA(text):
  """Tokenize ``text`` with MeCab and keep only nouns, verbs and adjectives.

  Nouns are returned as their surface form. Verbs and adjectives are returned
  as the 7th feature field of the MeCab row (presumably the dictionary/base
  form in the IPADIC feature layout — confirm against the dictionary in use).

  Args:
    text: Raw text to analyse.

  Returns:
    list[str]: The selected words, in order of appearance. Rows that cannot be
    interpreted (no tab separator, too few feature fields) are skipped.
  """
  result = []
  # Strip punctuation that would otherwise collide with the comma-separated feature list.
  text = re.sub(r"[,!?:;'（）]","",text)
  parsed = tagger.parse(text)
  if not parsed:
    # Nothing to analyse (empty output or a failed parse).
    return result
  for row in parsed.split("\n"):
    # Token rows look like "surface<TAB>feature,feature,...". The "EOS" marker and
    # blank lines carry no tab, so they are skipped here instead of by counting rows.
    surface, separator, feature_text = row.partition("\t")
    if not separator:
      continue
    features = feature_text.split(',')
    speech = features[0]
    if speech == '名詞':
      word = surface
    elif speech in ('動詞','形容詞'):
      if len(features) <= 6:
        # Row has no 7th feature field; skip it rather than raise IndexError.
        continue
      word = features[6]
    else:
      continue
    result.append(word)
  return result

In [6]:
def words_to_id(words,add=True):
  """Translate words into integer ids using the global ``word_dictionary``.

  Args:
    words: Iterable of words.
    add: When true, unseen words are registered under the next free id
      (taken from ``word_dictionary["_id"]``, which is then incremented).
      When false, unseen words are dropped from the result.

  Returns:
    list: The id of every word that is, or has just been, registered.
  """
  ids = []
  for token in words:
    if token not in word_dictionary:
      if not add:
        continue
      # Register the unseen word under the next free id, then advance the counter.
      word_dictionary[token] = word_dictionary["_id"]
      word_dictionary["_id"] += 1
    ids.append(word_dictionary[token])
  return ids

In [7]:
def add_to_files(text):
  """Tokenize ``text``, convert it to word ids (registering new words) and append it to ``files``."""
  selected_words = selectNVA(text)
  files.append(words_to_id(selected_words))

In [8]:
def calculate_tf(t,d):
  """Term frequency of word id ``t`` in document ``d``.

  Args:
    t: Word id to count.
    d: Document as a list of word ids.

  Returns:
    float: Occurrences of ``t`` divided by the document length, or ``0.0``
    for an empty document (previously a ZeroDivisionError).
  """
  if len(d) == 0:
    # An empty document contains no terms at all.
    return 0.0
  return d.count(t) / len(d)

In [9]:
def calculate_dt(t):
  """Look up the stored document frequency of word id ``t`` in the global ``dt`` table."""
  document_frequency = dt[t]
  return document_frequency

In [None]:
def make_dt():
  """Store in ``dt`` the number of documents that contain each word id.

  Every id in ``range(len(word_dictionary))`` receives an entry. Because
  ``word_dictionary`` also holds the reserved "_id" key, that range is one
  longer than the number of real words; the extra entry stays at 0.

  Each document is visited once and its distinct ids are counted, instead of
  scanning every document once per vocabulary id.
  """
  vocab_size = len(word_dictionary)
  counts = [0] * vocab_size
  for file in files:
    # set() so a word repeated within one document is counted once.
    for word_id in set(file):
      if 0 <= word_id < vocab_size:
        counts[word_id] += 1
  for word_id, count in enumerate(counts):
    dt[word_id] = count

In [10]:
def calculate_idf(t,D):
  """Inverse document frequency of word id ``t``.

  Args:
    t: Word id.
    D: Total number of documents.

  Returns:
    float: Natural logarithm of ``D`` divided by the document frequency of ``t``.
  """
  document_frequency = calculate_dt(t)
  ratio = D / document_frequency
  return math.log(ratio)

In [11]:
def calculate_tf_idf():
  """Compute the tf-idf vector of every document in ``files``.

  Returns:
    list[list[float]]: One row per document, one column per word id in
    ``range(word_dictionary["_id"])``. An empty document yields an all-zero
    row (previously a ZeroDivisionError).
  """
  from collections import Counter

  D = len(files)
  if D == 0:
    return []
  vocab_size = word_dictionary["_id"]
  # idf depends only on the word, so compute it once rather than once per document.
  idf_by_id = [calculate_idf(i,D) for i in range(vocab_size)]
  result = []
  for d in files:
    d_result = [0.0] * vocab_size
    doc_len = len(d)
    # Only ids that occur in the document have a non-zero term frequency.
    for word_id, occurrences in Counter(d).items():
      if 0 <= word_id < vocab_size:
        d_result[word_id] = (occurrences / doc_len) * idf_by_id[word_id]
    result.append(d_result)
  return result

In [12]:
def read_file(path):
  """Read one UTF-8 text file and register its contents as a document via add_to_files()."""
  with open(path, mode="r", encoding="utf-8") as source:
    contents = source.read()
  add_to_files(contents)

In [13]:
def save_dictionary(file_name):
  """Pickle ``[word_dictionary, dt, files]`` to ``file_name``.

  Args:
    file_name: Destination path; the file is overwritten.
  """
  # The context manager closes (and flushes) the handle; the original left it open.
  with open(file_name,"wb") as out_file:
    pickle.dump([word_dictionary,dt,files],out_file)

In [14]:
def load_dictionary(filename):
  """Load the data written by save_dictionary().

  Args:
    filename: Path of the pickle file.

  Returns:
    tuple: ``(word_dictionary, dt, files)`` as stored in the file. The module
    globals are not modified; callers assign the returned values themselves.
  """
  # SECURITY: pickle can execute arbitrary code while loading; only open files you created yourself.
  with open(filename,"rb") as in_file:
    word_dictionary,dt,files = pickle.load(in_file)
  return word_dictionary,dt,files

# ここからはデータを一から作成する場合にのみ実行(labels.npy,dictionary.dicを利用しない場合)
* 収集済のデータを利用する場合は飛ばして可

In [46]:
# The `prefectures` directory in the path holds the data collected from Wikipedia.
# LABELS: class names, appended by read_all_files() in the order the class directories are visited.
LABELS = []
# `labels` receives one class index per document read by read_files(); `datas` is assigned later.
datas,labels = [],[]
def read_files(path, label):
    """Register every ``*.txt`` file directly under ``path`` as a document of class ``label``.

    For each file, read_file() is called and ``label`` is appended to the
    global ``labels`` list.
    """
    for text_path in glob.glob(path + "/*.txt"):
        read_file(text_path)
        labels.append(label)

def read_all_files(path):
  """Read every class directory under ``path`` and assign it a class index.

  Each entry matched by ``path + "/*"`` becomes one class: its final path
  component is appended to ``LABELS`` and its text files are loaded through
  read_files() with the running index as label.

  Args:
    path: Directory that contains one sub-directory per class.

  Returns:
    int: Number of entries processed (the class count).
  """
  count_label = 0
  # NOTE: glob returns entries in arbitrary, platform-dependent order and that order
  # defines the class indices. It is intentionally not sorted here, so that the
  # mapping stays compatible with label lists and models produced earlier.
  for entry in glob.glob(path + "/*"):
    print(entry)
    # Take the directory name itself instead of stripping a hard-coded './prefectures/'
    # prefix, which left the full path in LABELS for any other `path`.
    LABELS.append(os.path.basename(os.path.normpath(entry)))
    read_files(entry,count_label)
    count_label += 1
  return count_label

# Load every class directory under ./prefectures; the number of entries visited becomes the class count.
nb_classes = read_all_files('./prefectures')

/content/drive/MyDrive/programming/python/cross_word/prefectures/トウキヨウ
/content/drive/MyDrive/programming/python/cross_word/prefectures/サイタマ
/content/drive/MyDrive/programming/python/cross_word/prefectures/カナガワ
/content/drive/MyDrive/programming/python/cross_word/prefectures/グンマ
/content/drive/MyDrive/programming/python/cross_word/prefectures/アイチ
/content/drive/MyDrive/programming/python/cross_word/prefectures/アオモリ
/content/drive/MyDrive/programming/python/cross_word/prefectures/カゴシマ
/content/drive/MyDrive/programming/python/cross_word/prefectures/イバラキ
/content/drive/MyDrive/programming/python/cross_word/prefectures/イシカワ
/content/drive/MyDrive/programming/python/cross_word/prefectures/アキタ
/content/drive/MyDrive/programming/python/cross_word/prefectures/イワテ
/content/drive/MyDrive/programming/python/cross_word/prefectures/エヒメ
/content/drive/MyDrive/programming/python/cross_word/prefectures/オオサカ
/content/drive/MyDrive/programming/python/cross_word/prefectures/オカヤマ
/content/drive/MyDri

In [48]:
# Convert the collected class indices to an ndarray.
# NOTE: this rebinds `labels`; read_files() calls labels.append(), so it cannot be re-run
# after this cell unless `labels` is reset to a list first.
labels = np.array(labels)

In [49]:
# One-hot encode the class indices into nb_classes columns.
labels = keras.utils.to_categorical(labels,nb_classes)

In [None]:
# Persist the one-hot labels (np.save appends the ".npy" suffix, producing ./labels.npy).
np.save("./labels",labels)

In [None]:
# Compute the document frequency of every word id over the documents loaded so far.
make_dt()

In [None]:
# Pickle the vocabulary, document frequencies and documents for later reuse.
save_dictionary("./dictionary.dic")

# ここからは収集済みデータを利用する場合に実行(labels.npyとdictionary.dicを用いる)

In [16]:
# Load the previously saved one-hot labels.
labels = np.load("./labels.npy")
# NOTE(review): hard-coded class count; presumably equals labels.shape[1] — confirm.
nb_classes = 47
# Load the previously built vocabulary, document frequencies and documents.
word_dictionary,dt,files = load_dictionary("./dictionary.dic")

In [18]:
# Build the tf-idf features: one row per document, one column per word id.
datas = calculate_tf_idf()

In [19]:
# Convert the nested lists returned by calculate_tf_idf() into an ndarray.
datas = np.array(datas)

In [20]:
# Cache the feature matrix on disk (np.save appends the ".npy" suffix, producing ./datas.npy).
np.save("./datas",datas)

# ここからは機械学習を一から実装する場合にのみ実行
* 学習済みモデルを使用する場合は飛ばしてok

In [55]:
# Reload the cached feature matrix and the one-hot labels.
datas = np.load("./datas.npy")
labels = np.load("./labels.npy")

In [21]:
# Width of the network input = length of one feature vector.
input_size = datas[0].shape[0]

In [22]:
# Fully connected classifier: two 512-unit ReLU layers, each followed by 20% dropout,
# and a softmax output with one unit per class.
inputs = keras.Input(shape=(input_size,))
x = layers.Dense(512,activation="relu")(inputs)
x = layers.Dropout(0.2)(x)
x = layers.Dense(512,activation="relu")(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(nb_classes,activation="softmax")(x)
model = keras.Model(inputs=inputs,outputs=outputs)

# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(optimizer="rmsprop",loss="categorical_crossentropy",metrics=["accuracy"])

In [64]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 47208)]           0         
                                                                 
 dense_3 (Dense)             (None, 512)               24171008  
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_4 (Dense)             (None, 512)               262656    
                                                                 
 dropout_3 (Dropout)         (None, 512)               0         
                                                                 
 dense_5 (Dense)             (None, 47)                24111     
                                                                 
Total params: 24457775 (93.30 MB)
Trainable params: 2445777

In [23]:
# Train on the full data set. No validation data is passed to fit(), so the logged
# metrics describe the training data only.
history = model.fit(datas,labels,batch_size=128,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Save the full model, then the weights separately.
model.save('./cross_word_model.keras')
# NOTE(review): model.save() above presumably already stores the weights, and some Keras
# versions restrict the file suffix accepted by save_weights() — confirm this call is
# needed and works on the target Keras version.
model.save_weights('./cross_word_weight.keras')

# 作成済みのデータ、モデルを使用する。




In [24]:
LABELS = ['トウキヨウ', 'サイタマ', 'カナガワ', 'グンマ', 'アイチ', 'アオモリ', 'カゴシマ', 'イバラキ', 'イシカワ', 'アキタ', 'イワテ', 'エヒメ', 'オオサカ', 'オカヤマ', 'カガワ', 'オキナワ', 'キヨウト', 'クマモト', 'ギフ', 'コウチ', 'サガ', 'チバ', 'シガ', 'シマネ', 'シズオカ', 'トクシマ', 'ワカヤマ', 'トチギ', 'トヤマ', 'トツトリ', 'ナガサキ', 'ニイガタ', 'ナガノ', 'ナラ', 'ヒヨウゴ', 'フクイ', 'フクオカ', 'ヒロシマ', 'フクシマ', 'ホツカイドウ', 'ミエ', 'ミヤギ', 'ヤマガタ', 'ヤマグチ', 'ヤマナシ', 'ミヤザキ', 'オオイタ']

In [68]:
# Load the previously built vocabulary, document frequencies and documents.
word_dictionary,dt,files = load_dictionary("./dictionary.dic")

In [None]:
# Use the previously trained model: load the saved model, then the separately saved weights.
model = keras.models.load_model('./cross_word_model.keras')
model.load_weights('./cross_word_weight.keras')

# モデルから予測を行う

In [25]:
def calculate_text_tf_idf(text):
  """Compute the tf-idf vector of ``text`` without modifying the stored data.

  Intended for test / query text: words that are not in ``word_dictionary``
  are ignored rather than registered.

  Args:
    text: Raw text to vectorize.

  Returns:
    numpy.ndarray: Vector of length ``word_dictionary['_id']``. It is all
    zeros when the text contains no known word (previously a
    ZeroDivisionError).
  """
  from collections import Counter

  vocab_size = word_dictionary['_id']
  result = np.zeros(vocab_size)
  d = words_to_id(selectNVA(text),False)
  if not d:
    return result
  D = len(files)
  doc_len = len(d)
  # Only ids that occur in the text can be non-zero, so skip the rest of the vocabulary.
  for word_id, occurrences in Counter(d).items():
    if 0 <= word_id < vocab_size:
      result[word_id] = (occurrences / doc_len) * calculate_idf(word_id,D)
  return result

In [26]:
def prediction(text):
  """Classify ``text`` and report the most probable label.

  Prints the label and its score, then returns them.

  Returns:
    tuple: ``(label, score)`` for the highest-scoring class.
  """
  features = calculate_text_tf_idf(text)
  batch = np.array([features])
  scores = model.predict(batch)[0]
  best = scores.argmax()
  label, score = LABELS[best], scores[best]
  print(label,"(",score,")")
  return label,score

In [None]:
import re
import unicodedata

def join_diacritic(text, mode="NFC"):
    """
    Join base characters with voiced / semi-voiced sound marks.

    Stand-alone marks (U+309B, U+FF9E, U+309C, U+FF9F) are first replaced by
    the combining marks U+3099 / U+309A at the UTF-8 byte level, then the
    text is Unicode-normalized with ``mode``.
    """
    # (stand-alone mark, combining mark) pairs as UTF-8 byte sequences.
    replacements = (
        (b"\xe3\x82\x9b", b"\xe3\x82\x99"),  # voiced, full-width
        (b"\xef\xbe\x9e", b"\xe3\x82\x99"),  # voiced, half-width
        (b"\xe3\x82\x9c", b"\xe3\x82\x9a"),  # semi-voiced, full-width
        (b"\xef\xbe\x9f", b"\xe3\x82\x9a"),  # semi-voiced, half-width
    )

    encoded = text.encode()
    for standalone, combining in replacements:
        encoded = encoded.replace(standalone, combining)

    return unicodedata.normalize(mode, encoded.decode())

In [None]:
def can_put_w(x,y,word,board):
  """Check whether ``word`` fits the horizontal slot that starts at ``(x, y)``.

  The answer is True only when ``(x, y)`` is the first cell of a horizontal
  run of non-black cells, the run is exactly ``len(word)`` cells long, and
  every cell is either empty ('□') or already holds the matching letter.

  Args:
    x: Column of the first letter (0 = leftmost).
    y: Row of the first letter (0 = top).
    word: Candidate answer.
    board: List of rows, each a list of single-character cells.

  Returns:
    bool: True if the word can be placed. Coordinates outside the board
    return False instead of raising IndexError or wrapping to the other edge.
  """
  def is_open(col, row):
    # A cell is open when it lies on the board and is not a black square.
    # Each row's own length is used so boards with uneven rows do not raise.
    return (0 <= row < len(board)
            and 0 <= col < len(board[row])
            and board[row][col] != '■')

  if not is_open(x, y):
    return False
  if is_open(x - 1, y):
    # The cell to the left is usable, so (x, y) is not where the slot begins.
    return False
  count = 0
  while is_open(x, y):
    if count == len(word):
      # The slot is longer than the word.
      return False
    cell = board[y][x]
    if cell != '□' and cell != word[count]:
      return False
    x += 1
    count += 1
  return count == len(word)


def can_put_h(x,y,word,board):
  """Check whether ``word`` fits the vertical slot that starts at ``(x, y)``.

  The answer is True only when ``(x, y)`` is the first cell of a vertical
  run of non-black cells, the run is exactly ``len(word)`` cells long, and
  every cell is either empty ('□') or already holds the matching letter.

  Args:
    x: Column of the first letter (0 = leftmost).
    y: Row of the first letter (0 = top).
    word: Candidate answer.
    board: List of rows, each a list of single-character cells.

  Returns:
    bool: True if the word can be placed. Coordinates outside the board
    (including rows too short to contain column ``x``) return False instead
    of raising IndexError or wrapping to the other edge.
  """
  def is_open(col, row):
    # A cell is open when it lies on the board and is not a black square.
    return (0 <= row < len(board)
            and 0 <= col < len(board[row])
            and board[row][col] != '■')

  if not is_open(x, y):
    return False
  if is_open(x, y - 1):
    # The cell above is usable, so (x, y) is not where the slot begins.
    return False
  count = 0
  while is_open(x, y):
    if count == len(word):
      # The slot is longer than the word.
      return False
    cell = board[y][x]
    if cell != '□' and cell != word[count]:
      return False
    y += 1
    count += 1
  return count == len(word)


def put_w(x,y,word,board):
  """Write ``word`` horizontally into ``board`` starting at ``(x, y)`` if can_put_w() allows it.

  The board is modified in place and also returned; it is returned unchanged
  when the word does not fit.
  """
  if not can_put_w(x,y,word,board):
    return board
  for offset, letter in enumerate(word):
    board[y][x + offset] = letter
  return board


def put_h(x,y,word,board):
  """Write ``word`` vertically into ``board`` starting at ``(x, y)`` if can_put_h() allows it.

  The board is modified in place and also returned; it is returned unchanged
  when the word does not fit.
  """
  if not can_put_h(x,y,word,board):
    return board
  for offset, letter in enumerate(word):
    board[y + offset][x] = letter
  return board


def max(l,k):
  """Return the index of the ``k``-th largest value in ``l`` (``k`` = 1 is the largest).

  Equal values are ranked by ascending index, so every index is returned for
  exactly one ``k``. The previous lookup by value returned the first matching
  index for every tied rank, so tied entries after the first were never reached.

  NOTE: this function shadows the built-in ``max`` in this namespace; the name
  is kept because other cells call it.

  Args:
    l: One-dimensional array-like of numeric scores.
    k: 1-based rank.

  Returns:
    Integer index into ``l``.
  """
  # A stable sort on the negated scores orders them highest first, ties by index.
  ranking = np.argsort(-np.asarray(l), kind="stable")
  return ranking[k - 1]


def make_answer_h(b,x,y,q):
  """Fill the vertical slot at ``(x, y)`` with the best-scoring label that fits.

  The clue ``q`` is vectorized and scored by the model; labels are then tried
  from highest to lowest score until one passes can_put_h().

  Every class is tried. The previous ``while k < 47`` loop stopped one
  candidate short and hard-coded the class count.

  Args:
    b: Board (list of rows of single-character cells); modified in place.
    x: Column of the first letter.
    y: Row of the first letter.
    q: Clue text.

  Returns:
    The board ``b``, unchanged if no label fits.
  """
  data = calculate_text_tf_idf(q)
  pre = np.asarray(model.predict(np.array([data]))[0])
  # A stable sort on the negated scores visits classes best-first, ties by index.
  for n in np.argsort(-pre, kind="stable"):
    an = join_diacritic(LABELS[n])
    if can_put_h(x,y,an,b):
      put_h(x,y,an,b)
      break
  return b


def make_answer_w(b,x,y,q):
  """Fill the horizontal slot at ``(x, y)`` with the best-scoring label that fits.

  The clue ``q`` is vectorized and scored by the model; labels are then tried
  from highest to lowest score until one passes can_put_w().

  Every class is tried. The previous ``while k < 47`` loop stopped one
  candidate short and hard-coded the class count.

  Args:
    b: Board (list of rows of single-character cells); modified in place.
    x: Column of the first letter.
    y: Row of the first letter.
    q: Clue text.

  Returns:
    The board ``b``, unchanged if no label fits.
  """
  data = calculate_text_tf_idf(q)
  pre = np.asarray(model.predict(np.array([data]))[0])
  # A stable sort on the negated scores visits classes best-first, ties by index.
  for n in np.argsort(-pre, kind="stable"):
    an = join_diacritic(LABELS[n])
    if can_put_w(x,y,an,b):
      put_w(x,y,an,b)
      break
  return b

Gradioアプリにする

In [None]:
# Install Gradio for the web UI. NOTE(review): the version is not pinned.
!pip install gradio



In [None]:
import gradio as gr
from tqdm import tqdm

In [None]:
# Texts shown on the Gradio page: heading, short description and usage instructions.
title = "都道府県クロスワード"
description = "都道府県クロスワードを解くAIを作成しました。"
article = """boardにはクロスワードの盤面を入力します。文字を入れることができるマスには「□」、そうでないマスには「■」を入力します。盤面が複数行ある場合は改行で区切ってください。
questionsには「質問 縦のカギか横のカギか 一文字目が入るx座標 一文字目が入るy座標」を入力します。質問が複数行ある場合は改行で区切ります。縦のカギならh、横のカギならwを入力します。クロスワードの盤面は左上のx座標とy座標を0とし、一マス右に移動するとx座標が+1、一マス下に移動するとy座標が+1されるものとします。
これらを入力してsubmitボタンを押すとoutputの欄に回答が出力されます。
"""
# Example inputs offered in the UI: *_1 is a board, *_2 the matching list of clues
# (one "<clue> <w|h> <x> <y>" entry per line).
example1_1 = "□□□□\n■□■■\n■□■■\n□□□□"
example1_2 = "浦和があるよ。 w 0 0\nアクアワールドがあるよ。 h 1 0\nちんすこうが有名 w 0 3"
example2_1 = "■■■□■\n□□□□□\n□■■□■\n□■■□■"
example2_2 = "日本の首都だよ。 w 0 1\n餃子が有名。 h 0 1\n神戸牛が食べられるよ。 h 3 0"

def make_str_board(board):
  """Render the board as text, one line per row.

  Board markers ('■' and '□') are emitted as they are; every other cell is
  followed by a single space. Each row ends with a newline.
  """
  markers = ('■','□')
  rendered_rows = []
  for row in board:
    cells = [cell if cell in markers else cell + " " for cell in row]
    rendered_rows.append("".join(cells) + "\n")
  return "".join(rendered_rows)

def greet(board,questions):
  """Solve a crossword from the two Gradio text inputs and return the filled board.

  Args:
    board: Board rows separated by line breaks ('□' = fillable, '■' = blocked).
    questions: One clue per line in the form "<clue text> <w|h> <x> <y>",
      where ``w`` is a horizontal and ``h`` a vertical entry and ``(x, y)``
      is the cell of the first letter. The clue text may contain spaces;
      blank lines are ignored. Lines with any other direction are skipped.

  Returns:
    str: The board rendered by make_str_board().

  Raises:
    ValueError: If a non-blank clue line does not have four fields or its
      coordinates are not integers.
  """
  # The original created an unused gr.Progress(track_tqdm=True) local here; it was
  # never referenced (Gradio documents Progress as a default parameter value), so it is dropped.
  board = [list(row) for row in board.splitlines()]
  for line in questions.splitlines():
    line = line.strip()
    if not line:
      # Tolerate blank lines, e.g. a trailing newline in the text box.
      continue
    # Split from the right so that spaces inside the clue text are preserved.
    fields = line.rsplit(None, 3)
    if len(fields) != 4:
      raise ValueError("expected '<clue> <w|h> <x> <y>', got: " + repr(line))
    clue, direction, x_text, y_text = fields
    if direction == "w":
      board = make_answer_w(board,int(x_text),int(y_text),clue)
    elif direction == "h":
      board = make_answer_h(board,int(x_text),int(y_text),clue)
  return make_str_board(board)


# Build the Gradio interface: two text inputs (board, questions) mapped to greet(),
# one text output, plus the example inputs and page texts defined in this cell.
demo = gr.Interface(fn=greet,
                    inputs = ["text","text"],
                    outputs = "text",
                    examples = [[example1_1,example1_2],
                                [example2_1,example2_2]],
                    title = title,
                    description = description,
                    article = article
                    )
# Enable request queuing, then start the app.
demo.queue()
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9e0d195d4b7d035aa8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


