In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("../output/comments/comments_Cleaned.csv")

In [None]:
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
torch.cuda.is_available()

True

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertModel.from_pretrained("bert-base-chinese")
model.eval()
model = model.to(device)

In [18]:
comments = df["content_clean"].tolist()
comments

['他們家的餐點價格實惠、份量也足夠。非常推薦😄 …',
 '適合跟朋友聊天聚會的好地方 食物份量偏少，但烏龍豆乾好吃～ 珍奶是古早味',
 '傳說中的泡沫紅茶老店 原來就在一中街附近欸 話說以前曾經經過而已 …',
 '漲價漲這麼兇也就罷了 開發票還愛開不開  不問就不開 是想被查是不是',
 '這是我們老台中人懷念的老味道帶朋友去吃他讚不絕口因為他是豆乾王他說無與倫比的好吃給一百分',
 '好爛 薛店 東西不好吃',
 '評價餐飲',
 '從小看到大 今天終於第一次來啦 真的超棒耶 …',
 '對煙味敏感，門口座椅區抽煙的客人煙味直接飄進來全店，似乎沒有躲藏的地方。大門開著的，裡面外面似乎沒有隔開的意義了',
 '與朋友聚會好地方，小食很開胃，個人喜歡白蘭地紅茶與芬蘭果汁。',
 '外帶ㄧ塊黑豆干70元、被搶劫的感覺',
 nan,
 '在地老店,餐點多,味道好,燈光美,氣氛佳,若不要漏開發票逃漏稅的話會更好',
 '小孩想要睡覺，讓他趴下來，服務人員過來告知這裡不能睡覺，不太懂這個規定',
 '物價上漲可理解 房租漲價可理解  但是這次真的差很多  炒飯份量比路邊攤還少 味道也沒好到哪裡 還不如買路邊攤回家吃  不會再來 反正在貴的店都有人會來 那我寧可選 春水堂吧。翁只剩豆干 厚片 能吃了！',
 nan,
 '太貴',
 nan,
 '懷念的好地方，豆乾超好吃',
 '食物好吃 一樣的味道 現在到底有開發票還是沒開發票？ …',
 nan,
 '記憶中的忘記豆乾最好吃',
 '好貴。 一點心70元、飲料也差不多價位，而且份量都不多。 兩個人吃兩杯茶四樣點心440。',
 '這吃的應該是時代的眼淚吧！ 可能我剛好嘴破所以吃豆乾被辣到了 但是芬蘭果汁很好喝～ 牛肉燴飯就一般般 臨時起意想吃泡沫紅茶店 結果幾年前常吃的綠＊茶居然全部收掉了 不過還好翁記還在🤣🤣🤣',
 nan,
 '很傳統，飲料很不錯，辣味豆乾入味也夠辣。惟價格稍高，但仍值得一試',
 '老店賺飽了 態度有夠差  吃的也ㄧ般般宵張什麼',
 nan,
 '烏龍豆干好好吃',
 '喜歡跟朋友一起去的地方，吃點心喝茶豆干必點👍 …',
 '平常不喜歡吃豆乾 但這家辣烏龍豆乾無敵 一個人可以吃三個',
 '豆干不能錯過 不過辣度很高 不是吃很辣的人可以考慮微辣就好',
 '台中老字號茶藝館！飲料好喝又

In [None]:
comments_lte_128, comments_gt_128_lte_256, comments_gt_256 = [], [], []
for comment in comments:
    if pd.isna(comment):
        pass
    elif len(comment) <= 128:
        comments_lte_128.append(comment)
    elif len(comment) <= 256:
        comments_gt_128_lte_256.append(comment)
    else:
        comments_gt_256.append(comment)

In [None]:
batch_size = 10000
for i in range(0, len(comments_lte_128), batch_size):
    batch_end = min(i + batch_size, len(comments_lte_128))
    batch = comments_lte_128[i:batch_end]
    batch_records = []

    for text in tqdm(batch, desc=f"Batch {i // batch_size + 1}"):
        try:
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=128,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()

            record = {"text": text}
            for j, val in enumerate(cls_embedding):
                record[f"v{j}"] = val
            batch_records.append(record)

        except Exception as e:
            print(f"Error on text: {text[:30]} — {e}")
            continue

    df = pd.DataFrame(batch_records)
    df.to_parquet(f"../output/comments/comment_lte128_part_{i // batch_size + 1}.parquet")

    del batch_records
    torch.cuda.empty_cache()

Batch 1: 100%|██████████| 10000/10000 [02:19<00:00, 71.56it/s]
Batch 2: 100%|██████████| 10000/10000 [02:23<00:00, 69.55it/s]
Batch 3: 100%|██████████| 10000/10000 [02:23<00:00, 69.63it/s]
Batch 4: 100%|██████████| 10000/10000 [02:24<00:00, 69.20it/s]
Batch 5: 100%|██████████| 10000/10000 [02:19<00:00, 71.82it/s]
Batch 6: 100%|██████████| 10000/10000 [02:22<00:00, 70.24it/s]
Batch 7: 100%|██████████| 10000/10000 [02:29<00:00, 66.93it/s]
Batch 8: 100%|██████████| 10000/10000 [02:27<00:00, 67.61it/s]
Batch 9: 100%|██████████| 10000/10000 [02:26<00:00, 68.38it/s]
Batch 10: 100%|██████████| 10000/10000 [02:31<00:00, 65.85it/s]
Batch 11: 100%|██████████| 10000/10000 [02:27<00:00, 67.70it/s]
Batch 12: 100%|██████████| 10000/10000 [02:21<00:00, 70.63it/s]
Batch 13: 100%|██████████| 10000/10000 [02:22<00:00, 70.05it/s]
Batch 14: 100%|██████████| 10000/10000 [02:14<00:00, 74.51it/s]
Batch 15: 100%|██████████| 10000/10000 [02:17<00:00, 72.70it/s]
Batch 16: 100%|██████████| 10000/10000 [02:18<00:

In [None]:
batch_size = 10000
for i in range(0, len(comments_gt_128_lte_256), batch_size):
    batch_end = min(i + batch_size, len(comments_gt_128_lte_256))
    batch = comments_gt_128_lte_256[i:batch_end]
    batch_records = []

    for text in tqdm(batch, desc=f"Batch {i // batch_size + 1}"):
        try:
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=256,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()

            record = {"text": text}
            for j, val in enumerate(cls_embedding):
                record[f"v{j}"] = val
            batch_records.append(record)

        except Exception as e:
            print(f"Error on text: {text[:30]} — {e}")
            continue

    df = pd.DataFrame(batch_records)
    df.to_parquet(f"../output/comments/comment_gt128_lte256_part_{i // batch_size + 1}.parquet")

    del batch_records
    torch.cuda.empty_cache()

Batch 1: 100%|██████████| 10000/10000 [03:45<00:00, 44.25it/s]
Batch 2: 100%|██████████| 9044/9044 [03:27<00:00, 43.60it/s]


In [None]:
batch_size = 10000
for i in range(0, len(comments_gt_256), batch_size):
    batch_end = min(i + batch_size, len(comments_gt_256))
    batch = comments_gt_256[i:batch_end]
    batch_records = []

    for text in tqdm(batch, desc=f"Batch {i // batch_size + 1}"):
        try:
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_attention_mask=True,
            )
            inputs = {
                k: v.to(device)
                for k, v in inputs.items()
                if k in ["input_ids", "attention_mask"]
            }

            cls_embeddings = []
            for chunk_idx in range(inputs["input_ids"].shape[0]):
                input_ids = inputs["input_ids"][chunk_idx].unsqueeze(0)
                attention_mask = inputs["attention_mask"][chunk_idx].unsqueeze(0)

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    cls_vec = outputs.last_hidden_state[:, 0, :]
                    cls_embeddings.append(cls_vec.squeeze().cpu().numpy())

            final_embedding = np.mean(cls_embeddings, axis=0)

            record = {"text": text}
            for j, val in enumerate(final_embedding):
                record[f"v{j}"] = val

            batch_records.append(record)

        except Exception as e:
            print(f"Error on text: {text[:30]}... => {e}")
            continue

    df = pd.DataFrame(batch_records)
    df.to_parquet(f"../output/comments/comment_gte256_part_{i // batch_size + 1}.parquet")

    del batch_records
    torch.cuda.empty_cache()

Batch 1: 100%|██████████| 5638/5638 [03:42<00:00, 25.35it/s]
