In [1]:
import os
import json
import torch
import shutil
import numpy as np
from tqdm import tqdm

from my_utils import *
from sentence_luke_japanese import SentenceLukeJapanese
# Display the name of the current CUDA device as a quick check that a GPU is visible.
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 2080 Ti'

In [2]:
model_luke = SentenceLukeJapanese()

# Encode a sample sentence once and reuse the result: its length is used
# below to size the per-spot accumulator, and its shape is shown as the
# cell output.
sample_vector = model_luke.encode("このパソコン終わってます")[0]
vector_length = sample_vector.shape[0]

sample_vector.shape

cuda


torch.Size([768])

In [None]:
# --- Configuration ---------------------------------------------------------
# Input root: contains one sub-directory per category, each holding the
# per-spot review JSON files.
review_dir = "jalan_review3/"
# Output root for the per-spot / per-category / per-prefecture vectors.
target_dir = "jalan_review_aligned_v3/"
# Spots with fewer reviews than this are skipped.
min_reviews = 128

# --- Reset the output directory --------------------------------------------
# WARNING: any previous results under target_dir are deleted so that every
# run starts from an empty directory.
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)
os.mkdir(target_dir)

In [4]:
# Unweighted aggregates: every review contributes equally.
#   *_vectors[key] -> running sum of review embeddings
#   *_counts[key]  -> number of reviews summed
category_counts = {}
category_vectors = {}
prefecture_counts = {}
prefecture_vectors = {}

# Weighted aggregates: every spot contributes equally.
#   weighted_*_vectors[key] -> running sum of per-spot mean embeddings
#   weighted_*_counts[key]  -> number of spots summed
weighted_category_counts = {}
weighted_category_vectors = {}
weighted_prefecture_counts = {}
weighted_prefecture_vectors = {}

# Encode every review of every sufficiently-reviewed spot, write one JSON
# file per spot with its mean review embedding, and accumulate the
# per-category / per-prefecture totals that are saved afterwards.

batch_size = 256  # number of reviews passed to the encoder per call


def _sum_review_vectors(reviews):
    """Return the float64 sum of the embeddings of all ``reviews``.

    ``reviews`` is the list of review dicts of one spot; the ``"review"``
    text of each entry is encoded with ``model_luke`` in chunks of
    ``batch_size``.
    """
    total = np.zeros(shape=(vector_length,), dtype=np.float64)
    for start in range(0, len(reviews), batch_size):
        texts = [entry["review"] for entry in reviews[start:start + batch_size]]
        # .cpu() is a no-op for CPU tensors and keeps .numpy() valid should
        # encode() return a CUDA tensor.
        embeddings = model_luke.encode(texts).detach().cpu().numpy()
        # Sum in float64 so long accumulations do not depend on the
        # encoder's output dtype.
        total += embeddings.sum(axis=0, dtype=np.float64)
        torch.cuda.empty_cache()
    return total


def _accumulate(vector_sums, counts, key, vector, count):
    """Add ``vector`` and ``count`` to the running totals stored under ``key``."""
    if key in vector_sums:
        vector_sums[key] += vector
        counts[key] += count
    else:
        # Copy: the same array is handed to several accumulators, and the
        # in-place ``+=`` above must never mutate a shared buffer.
        vector_sums[key] = np.array(vector, dtype=np.float64, copy=True)
        counts[key] = count


progress_bar = tqdm(os.listdir(review_dir))
for category in progress_bar:
    category_dir = f"{review_dir}{category}/"
    # Ignore stray files that sit next to the category directories.
    if not os.path.isdir(category_dir):
        continue

    for file_name in os.listdir(category_dir):
        # Files whose name contains "last_page" are not spot files.
        if "last_page" in file_name:
            continue

        # NOTE(review): relies on the platform default text encoding; pass an
        # explicit encoding if the input files are not ASCII-escaped — confirm.
        with open(category_dir + file_name, "r") as f:
            spot = json.load(f)

        reviews = spot["reviews"]
        num_reviews = len(reviews)
        # The explicit zero check avoids a division by zero when
        # min_reviews is set to 0.
        if num_reviews == 0 or num_reviews < min_reviews:
            continue

        name = spot["name"].replace("\"", "")
        # The key is spelled "adress" in the input files.
        prefecture = extract_prefecture(spot["adress"])
        if prefecture is None:
            continue

        progress_bar.set_postfix({"name" : name,
                                  "category" : category,
                                  "prefecture" : prefecture,
                                  "num_review" : num_reviews},
                                 refresh=True)

        spot_sum = _sum_review_vectors(reviews)
        spt_vector = spot_sum / num_reviews  # mean review embedding of the spot

        # A path separator inside the spot name would point into a
        # non-existent sub-directory, so replace it in the file name only.
        file_stem = name
        for separator in (os.sep, os.altsep):
            if separator:
                file_stem = file_stem.replace(separator, "_")

        with open(f"{target_dir}{file_stem}_{category}_{prefecture}.json", "w") as f:
            json.dump({
                "name" : name,
                "category" : category,
                "prefecture" : prefecture,
                "vector" : spt_vector.tolist()
            }, f)

        # Totals are updated once per spot, after all of its batches were
        # encoded, so a failure mid-spot never leaves partial sums behind.
        # Unweighted: sum over reviews / number of reviews.
        _accumulate(category_vectors, category_counts, category, spot_sum, num_reviews)
        _accumulate(prefecture_vectors, prefecture_counts, prefecture, spot_sum, num_reviews)
        # Weighted: sum over spot means / number of spots.
        _accumulate(weighted_category_vectors, weighted_category_counts, category, spt_vector, 1)
        _accumulate(weighted_prefecture_vectors, weighted_prefecture_counts, prefecture, spt_vector, 1)

def _save_mean_vectors(subdir, label_key, vector_sums, counts):
    """Write one JSON file per key holding ``vector_sums[key] / counts[key]``.

    Files are written to ``{target_dir}{subdir}/{key}.json``. Each file stores
    the mean vector under ``"vector"`` and the key itself under ``label_key``.
    """
    out_dir = f"{target_dir}{subdir}/"
    # exist_ok avoids the separate exists-then-mkdir check.
    os.makedirs(out_dir, exist_ok=True)

    for key, vector_sum in vector_sums.items():
        with open(f"{out_dir}{key}.json", "w") as f:
            json.dump({
                "vector" : (vector_sum / counts[key]).tolist(),
                label_key : key
            }, f)


# Save the unweighted category / prefecture vectors
_save_mean_vectors("categories", "category", category_vectors, category_counts)
_save_mean_vectors("prefectures", "prefecture", prefecture_vectors, prefecture_counts)

# Save the weighted category / prefecture vectors
_save_mean_vectors("weighted_categories", "category",
                   weighted_category_vectors, weighted_category_counts)
_save_mean_vectors("weighted_prefectures", "prefecture",
                   weighted_prefecture_vectors, weighted_prefecture_counts)

100%|██████████| 17/17 [35:31<00:00, 125.38s/it, name=照葉大吊橋, category=近代建築, prefecture=宮崎県, num_review=162]                                                                            
