In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the cleaned comments table; later cells read its "content_clean" column.
df = pd.read_csv(
    filepath_or_buffer="../output/comments/comments_Cleaned.csv",
)

In [None]:
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

In [None]:
# Sanity check shown as the cell output: can PyTorch see a CUDA device in this kernel?
torch.cuda.is_available()

In [None]:
# Run on the GPU when PyTorch reports one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the pretrained "bert-base-chinese" tokenizer and encoder once; every
# embedding cell below reuses these two objects.
pretrained_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
# Put the encoder in evaluation mode first, then move it to the selected device.
model = BertModel.from_pretrained(pretrained_name).eval().to(device)

In [None]:
# Pull the cleaned comment text out of the frame as a plain Python list;
# missing values stay in the list and are filtered out when bucketing.
comments = df["content_clean"].tolist()
# Show only a short preview: echoing the whole list floods the notebook
# output and may expose raw user comments when the notebook is shared.
comments[:5]

In [None]:
# Split the comments into three buckets by length so each bucket can be
# tokenized with its own max_length. Length here is the number of characters
# in the string (len), not the number of tokenizer tokens.
comments_lte_128, comments_gt_128_lte_256, comments_gt_256 = [], [], []
for comment in comments:
    # Missing values (NaN/None) carry no text and are dropped.
    if pd.isna(comment):
        continue

    n_chars = len(comment)
    if n_chars <= 128:
        bucket = comments_lte_128
    elif n_chars <= 256:
        bucket = comments_gt_128_lte_256
    else:
        bucket = comments_gt_256
    bucket.append(comment)

In [None]:
# Embed every comment of the <=128-character bucket with BERT and write the
# resulting vectors to disk as parquet shards of `batch_size` comments each.
batch_size = 10000
for i in range(0, len(comments_lte_128), batch_size):
    part_no = i // batch_size + 1  # 1-based shard number used in the progress bar and file name
    batch = comments_lte_128[i : i + batch_size]  # slicing clamps at the end of the list
    batch_records = []

    for text in tqdm(batch, desc=f"Batch {part_no}"):
        try:
            # One comment per forward pass, padded/truncated to exactly 128 tokens.
            # NOTE(review): the bucket is sized in characters while max_length is
            # in tokens (presumably including the special tokens the tokenizer
            # adds), so comments close to 128 characters may be truncated —
            # confirm this is acceptable.
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=128,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                # Hidden state at position 0 of the single sequence in the batch.
                cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()

            # One row per comment: the raw text plus one column per embedding dimension.
            record = {"text": text}
            record.update({f"v{j}": val for j, val in enumerate(cls_embedding)})
            batch_records.append(record)

        except Exception as e:
            # Best effort: report the failing comment and carry on with the shard.
            print(f"Error on text: {text[:30]} — {e}")

    # Use a dedicated name: assigning to `df` would overwrite the comments
    # frame loaded at the top of the notebook and break re-running the cells
    # that read `df["content_clean"]`.
    batch_df = pd.DataFrame(batch_records)
    batch_df.to_parquet(f"../output/comments/comment_lte128_part_{part_no}.parquet")

    # Release this shard before starting the next one.
    del batch_records, batch_df
    torch.cuda.empty_cache()

In [None]:
# Embed every comment of the 129–256-character bucket with BERT and write the
# resulting vectors to disk as parquet shards of `batch_size` comments each.
batch_size = 10000
for i in range(0, len(comments_gt_128_lte_256), batch_size):
    part_no = i // batch_size + 1  # 1-based shard number used in the progress bar and file name
    batch = comments_gt_128_lte_256[i : i + batch_size]  # slicing clamps at the end of the list
    batch_records = []

    for text in tqdm(batch, desc=f"Batch {part_no}"):
        try:
            # One comment per forward pass, padded/truncated to exactly 256 tokens.
            # NOTE(review): the bucket is sized in characters while max_length is
            # in tokens (presumably including the special tokens the tokenizer
            # adds), so comments close to 256 characters may be truncated —
            # confirm this is acceptable.
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=256,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                # Hidden state at position 0 of the single sequence in the batch.
                cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()

            # One row per comment: the raw text plus one column per embedding dimension.
            record = {"text": text}
            record.update({f"v{j}": val for j, val in enumerate(cls_embedding)})
            batch_records.append(record)

        except Exception as e:
            # Best effort: report the failing comment and carry on with the shard.
            print(f"Error on text: {text[:30]} — {e}")

    # Use a dedicated name: assigning to `df` would overwrite the comments
    # frame loaded at the top of the notebook and break re-running the cells
    # that read `df["content_clean"]`.
    batch_df = pd.DataFrame(batch_records)
    batch_df.to_parquet(f"../output/comments/comment_gt128_lte256_part_{part_no}.parquet")

    # Release this shard before starting the next one.
    del batch_records, batch_df
    torch.cuda.empty_cache()

In [None]:
# Embed every comment of the >256-character bucket with BERT. Each comment is
# split by the tokenizer into overlapping 512-token windows; every window is
# encoded separately and the per-window vectors are averaged into a single
# embedding. Results are written as parquet shards of `batch_size` comments.
batch_size = 10000
for i in range(0, len(comments_gt_256), batch_size):
    part_no = i // batch_size + 1  # 1-based shard number used in the progress bar and file name
    batch = comments_gt_256[i : i + batch_size]  # slicing clamps at the end of the list
    batch_records = []

    for text in tqdm(batch, desc=f"Batch {part_no}"):
        try:
            # return_overflowing_tokens=True makes the tokenizer return one row
            # per window instead of dropping what does not fit in max_length;
            # stride=128 is the overlap requested between consecutive windows.
            inputs = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_attention_mask=True,
            )
            # Keep only the tensors that are forwarded to the model below.
            inputs = {
                k: v.to(device)
                for k, v in inputs.items()
                if k in ["input_ids", "attention_mask"]
            }

            # Encode the windows one at a time (batch of 1 per forward pass).
            cls_embeddings = []
            for chunk_idx in range(inputs["input_ids"].shape[0]):
                input_ids = inputs["input_ids"][chunk_idx].unsqueeze(0)
                attention_mask = inputs["attention_mask"][chunk_idx].unsqueeze(0)

                with torch.no_grad():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    # Hidden state at position 0 of the single window in the batch.
                    cls_vec = outputs.last_hidden_state[0, 0, :]
                    cls_embeddings.append(cls_vec.cpu().numpy())

            # Unweighted mean over the windows gives one vector per comment.
            final_embedding = np.mean(cls_embeddings, axis=0)

            # One row per comment: the raw text plus one column per embedding dimension.
            record = {"text": text}
            record.update({f"v{j}": val for j, val in enumerate(final_embedding)})
            batch_records.append(record)

        except Exception as e:
            # Best effort: report the failing comment and carry on with the shard.
            print(f"Error on text: {text[:30]}... => {e}")

    # Use a dedicated name: assigning to `df` would overwrite the comments
    # frame loaded at the top of the notebook and break re-running the cells
    # that read `df["content_clean"]`.
    batch_df = pd.DataFrame(batch_records)
    # NOTE(review): the file prefix says "gte256" although this bucket holds
    # comments strictly longer than 256 characters; the name is kept unchanged
    # because other code may load these files by name — confirm before renaming.
    batch_df.to_parquet(f"../output/comments/comment_gte256_part_{part_no}.parquet")

    # Release this shard before starting the next one.
    del batch_records, batch_df
    torch.cuda.empty_cache()