from google.colab import userdata
# Read the 'HF_text' secret from Colab's secret store
# (presumably a Hugging Face access token — confirm against its use).
HF_text = userdata.get('HF_text')
# Never echo the secret itself: cell output is saved and shared with the
# notebook. Only report whether a value was retrieved.
print("HF_text loaded." if HF_text else "HF_text is empty.")

In [None]:
import re
import torch
import json
import pandas as pd # type: ignore
import torch.nn as nn
import torch.nn.functional as F
import collections
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BertTokenizerFast,AutoModel,AutoConfig, get_cosine_schedule_with_warmup
from datasets import Dataset
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from collections import Counter

In [None]:
# 取得額外資料
import time
import googlemaps
from googlemaps.exceptions import ApiError

# Google Maps client used by places_nearby_with_retry below.
# NOTE: the key string is a placeholder ("fill in API key"); supply a real key.
gmaps = googlemaps.Client(key='填入API key')

# Rate-limit and retry parameters
REQUEST_INTERVAL = 0.6  # seconds to pause after each request (conservative)
MAX_RETRIES = 5         # maximum number of attempts per request

def places_nearby_with_retry(location, radius, keyword, language='zh-Hant'):
    """Run a Places "nearby search", retrying transient API failures.

    Args:
        location: (latitude, longitude) pair passed to ``gmaps.places_nearby``.
        radius: Search radius passed to ``gmaps.places_nearby``.
        keyword: Search keyword.
        language: Language code for the results.

    Returns:
        The response from ``gmaps.places_nearby``, or ``None`` when all
        ``MAX_RETRIES`` attempts failed with a retryable status.

    Raises:
        ApiError: For any API status that is not considered transient.
    """
    # Statuses worth retrying; anything else (bad key, invalid request, ...)
    # will fail the same way again, so it is re-raised immediately.
    retryable_statuses = ('OVER_QUERY_LIMIT', 'UNKNOWN_ERROR')

    for attempt in range(MAX_RETRIES):
        try:
            result = gmaps.places_nearby(
                location=location,
                radius=radius,
                keyword=keyword,
                language=language
            )
        except ApiError as e:
            if getattr(e, 'status', None) not in retryable_statuses:
                raise  # non-transient API errors propagate to the caller
            print(f"[API Error] {e}")
            # Back off a little longer after every failed attempt.
            time.sleep(REQUEST_INTERVAL * (attempt + 1))
        else:
            time.sleep(REQUEST_INTERVAL)  # throttle to stay under the rate limit
            return result
    print("Max retries exceeded.")
    return None


# A 500 m radius corresponds to a change of about 0.0045 in latitude and 0.0049 in longitude.
# Taipei / New Taipei search area: roughly from Shulin (24.980421, 121.409978)
# to the Taipei-New Taipei border (25.096335, 121.623674).
# N, E is the grid origin; the search loop steps once before its first query.
N, E = 24.971421, 121.400178 #28.41842100000013 121.52757799999998
# Keywords queried at every grid point.
Types = ['Cold noodle restaurant', 'Mandarin restaurant', 'Hot pot restaurant',
         'Dim sum restaurant', 'Sichuan restaurant', 'Barbecue restaurant',
         'Seafood restaurant', 'Chinese restaurant', 'Noodle shop', 'Breakfast restaurant',
         'Porridge restaurant', 'Deli', 'Restaurant', 'American restaurant', 'Syokudo and Teishoku restaurant',
         'Japanese restaurant', 'Yakitori restaurant', 'Teppanyaki restaurant', 'Indian restaurant',
         'Taiwanese restaurant', 'Uyghur cuisine restaurant', 'Snack bar', 'Spanish restaurant', 'Chinese noodle restaurant',
         'Vietnamese restaurant', 'Cantonese restaurant', 'Asian restaurant', 'Sushi restaurant', 'Fine dining restaurant',
         'Hamburger restaurant', 'Pizza restaurant', 'Dumpling restaurant', 'Steamed bun shop', 'Italian restaurant',
         'Sukiyaki restaurant', 'Shabu-shabu restaurant', 'Jiangsu restaurant', 'Pie shop', 'Bistro', 'Hakka restaurant',
         'Chop bar', 'Malaysian restaurant', 'Korean restaurant', 'Beijing restaurant', 'Zhejiang restaurant','Juice shop',
         'Shanghainese restaurant', 'Pastry shop', 'Pho restaurant', 'Box lunch supplier', 'Chicken restaurant',
         'Hunan restaurant', 'Unagi restaurant', 'Korean barbecue restaurant', 'Mongolian barbecue restaurant',
         'Thai restaurant', 'Takeout restaurant', 'Fish restaurant'
        ]

# Starting from the grid origin, sweep north (14 rows) and east (23 columns)
# for every keyword and append each returned place_id to the output file.
with open('/content/drive/MyDrive/place_ids.txt', 'a', encoding='utf-8') as f:
    for kw in Types:
        lat = N  # every keyword restarts from the same origin
        for _row in range(14):
            lat += 0.009
            lng = E  # every row restarts from the western edge
            for _col in range(23):
                lng += 0.0098
                result = places_nearby_with_retry(location=(lat, lng), radius=500, keyword=kw, language='zh-Hant')
                if result is not None:
                    for place in result.get('results', []):
                        f.write(place['place_id'] + '\n')
                        f.flush()  # persist immediately after every write

In [None]:
import googlemaps
# Load the restaurant table and take its Place_ID column.
csv_path = "/content/drive/MyDrive/Res_data_set.csv"
df = pd.read_csv(csv_path)
Place_ID = df['Place_ID']


gmaps = googlemaps.Client(key='填入API key')
all_reviews = []
for place_id in Place_ID:
    # Skip missing or blank IDs. The previous `i is not []` check compared
    # identity against a brand-new list and was therefore always True.
    if pd.isna(place_id) or not str(place_id).strip():
        continue
    results = gmaps.place(place_id, language='zh-Hant')
    if 'result' in results:
        # A place without reviews contributes an empty list.
        all_reviews.append(results['result'].get('reviews', []))

# Flatten the per-place review lists into parallel text / rating lists.
Reviews = []
Ratings = []

for place_reviews in all_reviews:
    for review in place_reviews:
        Reviews.append(review['text'])
        Ratings.append(review['rating'])

In [None]:
# Save the training data as JSONL: the first line holds every review text,
# the second line holds every rating.

data = [{"Reviews": Reviews}, {"Ratings": Ratings}]

with open("Train_data.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in data)

In [None]:
#讀取訓練資料

def clean_chinese_text(text):
    """Normalize a review string for tokenization.

    Steps, in order: drop newline / carriage-return / tab characters, remove
    URLs, remove e-mail-like tokens, delete every character outside the
    allowed set (CJK ideographs, ASCII letters and digits, selected
    punctuation, space), then collapse runs of the same punctuation mark
    into a single occurrence.
    """
    for control_char in ('\n', '\r', '\t'):
        text = text.replace(control_char, '')

    # (pattern, replacement) pairs applied sequentially.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'\S+@\S+', ''),
        (r"[^\u4e00-\u9fffA-Za-z0-9.,!?;:()，。！？、；：（）【】「」『』《》〈〉\-\"\' ]", ""),
        (r'([！？。，、～\-.])\1{1,}', r'\1'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Parse the JSONL file line by line; line 1 carries "Reviews", line 2 "Ratings".
with open("/content/drive/MyDrive/Train_data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Clean every review text; ratings are used as stored.
Reviews = list(map(clean_chinese_text, data[0]["Reviews"]))
Ratings = data[1]["Ratings"]

In [None]:
# Show how many entries there are for each rating value.
collections.Counter(Ratings)

Counter({5: 2256, 4: 1143, 1: 101, 2: 102, 3: 383})

In [None]:
#選擇模型
"""
bert-base-chinese
ckiplab/bert-base-chinese
hfl/chinese-roberta-wwm-ext（大部分中文任務都提升明顯）
hfl/chinese-macbert-base（在多個中文任務上比 BERT 表現更好）
IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment（情感分類專用模型）
"""
#從原來模型拿取參數
# config = BertConfig.from_pretrained("bert-base-chinese", num_labels=5)

#AutoConfig設定模型超參數
"""
config = AutoConfig.from_pretrained(model_name)
config.num_labels = 5
config.problem_type = "single_label_classification"
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.3
"""
#依照模型類別設定tokenizer
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
#指定模型任務
# model = BertForSequenceClassification.from_pretrained('bert-base-chinese', config=config, device_map="auto", torch_dtype="auto")

In [None]:
# EMD Loss 函數檢查
def weighted_emd_loss(preds, targets, class_weights, r=2):
    """Weighted Earth Mover's Distance style loss between two distributions.

    The cumulative distribution of ``preds`` is compared with the cumulative
    distribution of the one-hot encoded ``targets``. The absolute gap in each
    class column is scaled by that column's weight, raised to the power
    ``r`` and averaged over every element.

    Args:
        preds: (batch_size, num_classes) class probabilities (softmax output).
        targets: (batch_size,) integer labels in ``0 .. num_classes - 1``.
        class_weights: (num_classes,) weight applied to each class column.
        r: Exponent applied to the weighted gap.

    Returns:
        Scalar tensor holding the loss.
    """
    num_classes = preds.size(1)
    target_dist = F.one_hot(targets, num_classes=num_classes).float()
    cdf_gap = (torch.cumsum(preds, dim=1) - torch.cumsum(target_dist, dim=1)).abs()
    # Broadcast the per-class weights across the batch dimension.
    column_weights = class_weights.to(preds.device).unsqueeze(0)
    return (cdf_gap * column_weights).pow(r).mean()

# 模型建構（Ordinal Regression）
class BertForOrdinalRegression(nn.Module):
    """Pretrained encoder with a linear head, trained with weighted_emd_loss.

    Args:
        model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        class_weights: Optional (num_labels,) tensor of class weights;
            all-ones when omitted.
        config: Optional encoder config; loaded from ``model_name`` when falsy.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name, class_weights=None, config=None, num_labels=5):
        super().__init__()
        encoder_config = config or AutoConfig.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name, config=encoder_config)
        self.dropout = nn.Dropout(0.1)
        self.num_labels = num_labels
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.num_labels)
        if class_weights is None:
            class_weights = torch.ones(self.num_labels)
        # Registered as a buffer so the weights move with the module.
        self.register_buffer("class_weights", class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits"}``, plus ``"loss"`` when ``labels`` is given."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(self.dropout(encoded.pooler_output))  # (batch_size, num_labels)
        if labels is None:
            return {"logits": logits}

        probs = F.softmax(logits, dim=-1)
        # The loss one-hot encodes the labels, which requires integer (long) labels.
        targets = labels if labels.dtype == torch.long else labels.long()
        loss = weighted_emd_loss(probs, targets, self.class_weights)
        return {"loss": loss, "logits": logits}

# 預測星等
def logits_to_star_rating(logits, threshold=0.5):
    """Convert class logits into star ratings starting at 1.

    The softmax probabilities are accumulated into a CDF along the last
    dimension. The predicted rating is one plus the number of classes whose
    cumulative probability is still at or below ``threshold``; with the
    default of 0.5 this picks the median class of the distribution.

    Args:
        logits: (..., num_classes) raw model outputs.
        threshold: Cumulative-probability cut-off; tune it to shift
            predictions towards lower or higher ratings.

    Returns:
        Integer tensor of ratings in ``1 .. num_classes``.
    """
    cdf = torch.cumsum(F.softmax(logits, dim=-1), dim=-1)
    predictions = torch.sum(cdf <= threshold, dim=-1) + 1
    # A threshold at or above the final CDF value (about 1.0) would otherwise
    # count every class and produce the out-of-range rating num_classes + 1.
    return torch.clamp(predictions, max=logits.size(-1))

# 計算類別權重
def compute_class_weights(Ratings, num_classes=5):
    """Compute normalized inverse-frequency weights for ratings 1..num_classes.

    Args:
        Ratings: Iterable of integer ratings, with 1 as the lowest class.
        num_classes: Number of rating classes. Defaults to 5; pass the
            model's ``num_labels`` when it differs so the shapes match.

    Returns:
        Float tensor of shape (num_classes,) that sums to 1. Entry ``i`` is
        proportional to ``1 / count(rating == i + 1)``. A rating that never
        occurs is counted as one occurrence to avoid division by zero.
    """
    counts_by_rating = collections.Counter(Ratings)
    counts = torch.tensor(
        [counts_by_rating.get(star, 1) for star in range(1, num_classes + 1)],
        dtype=torch.float,
    )
    inverse_freq = 1.0 / counts
    return inverse_freq / inverse_freq.sum()
# Class weights derived from the loaded ratings.
class_weights = compute_class_weights(Ratings)

# Set up the tokenizer and the model.
# NOTE(review): the tokenizer comes from 'ckiplab/bert-base-chinese' while the
# encoder comes from 'bert-base-chinese' — confirm both use the same vocabulary.
tokenizer = BertTokenizerFast.from_pretrained('ckiplab/bert-base-chinese')
model = BertForOrdinalRegression(model_name='bert-base-chinese', class_weights=class_weights)


In [None]:
#定義函式將文字轉為 token 與 Dataset
def preprocess(texts, labels):
    """Tokenize ``texts`` and bundle them with zero-based labels in a Dataset.

    Args:
        texts: Review strings passed to the module-level ``tokenizer``.
        labels: Ratings starting at 1, aligned with ``texts``.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns; every sequence is truncated / padded to 128 tokens.
    """
    tokenized = tokenizer(texts, truncation=True, padding='max_length', max_length=128)
    zero_based = [rating - 1 for rating in labels]  # shift 1..5 to 0..4
    return Dataset.from_dict({
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': zero_based,
    })


# Tokenize the reviews and wrap them, with their labels, in a Dataset.
dataset = preprocess(Reviews, Ratings)
# Have these columns returned as torch.Tensor objects.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Split into training and validation parts with a fixed seed.
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = splits['train'], splits['test']
# Build the loaders: shuffle only the training data.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Select the compute device and move the model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Device copy of the class weights. `class_weights` is already a tensor, so it
# is cloned instead of being re-wrapped with torch.tensor(), which warns about
# copy-construction. The model keeps its own copy as a registered buffer; the
# loop below does not read this variable.
weights = class_weights.clone().detach().to(device=device, dtype=torch.float)

# Alternative kept for reference: class-weighted cross-entropy instead of the
# weighted EMD loss computed inside the model.
# label_counts = Counter(Ratings)
# class_counts = torch.tensor([label_counts[i] for i in range(1, 6)], dtype=torch.float)
# class_weights = 1.0 / class_counts
# class_weights = class_weights / class_weights.sum()
# class_weights = class_weights.to(device)
# loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Early-stopping state
best_val_acc = 0.0
patience = 3
early_stop_counter = 0

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Number of passes over the training data
epochs = 10

# Cosine learning-rate schedule with 10% warm-up
num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(num_training_steps * 0.1)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# Passing the full label set keeps classification_report from raising when a
# class is absent from both the labels and the predictions.
star_values = [1, 2, 3, 4, 5]
star_names = ["1星", "2星", "3星", "4星", "5星"]


def _predict_stars(eval_model, data_loader, eval_device):
    """Return (predicted, true) star ratings for every example in data_loader."""
    predicted, actual = [], []
    eval_model.eval()
    with torch.no_grad():
        for eval_batch in data_loader:
            eval_logits = eval_model(
                input_ids=eval_batch['input_ids'].to(eval_device),
                attention_mask=eval_batch['attention_mask'].to(eval_device),
            )['logits']
            predicted.extend(logits_to_star_rating(eval_logits).cpu().numpy())
            actual.extend(eval_batch['labels'].cpu().numpy() + 1)
    return predicted, actual


for epoch in range(epochs):
    model.train()  # validation below switches to eval mode, so reset each epoch
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        logits = outputs['logits']

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Predicted star rating for each example
        preds = logits_to_star_rating(logits)

        # Collect results for the epoch-level training metrics
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy()+1)

    epoch_accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")
    print(f"Weighted F1 Score: {f1:.4f}")
    print(classification_report(all_labels, all_preds, labels=star_values, digits=4,
                                target_names=star_names, zero_division=0))

    # Model selection must use held-out data: training accuracy keeps rising
    # as the model overfits, so it can neither trigger early stopping
    # meaningfully nor identify the best checkpoint.
    val_preds, val_labels = _predict_stars(model, val_loader, device)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='weighted', zero_division=0)
    print(f"Validation Accuracy: {val_accuracy:.4f}, Validation Weighted F1 Score: {val_f1:.4f}")

    # Early stopping + checkpoint
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        early_stop_counter = 0
        torch.save(model.state_dict(), '/content/drive/MyDrive/bert_base_model.pth')
        print(f"New best model saved! Accuracy: {val_accuracy:.4f}")
    else:
        early_stop_counter += 1
        print(f"No improvement. EarlyStopCounter = {early_stop_counter}/{patience}")
        if early_stop_counter >= patience:
            print("Early stopping triggered!")
            break

In [None]:
# 評估模型

# Put the model on the evaluation device in inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

all_preds = []
all_labels = []
batch_accuracies = []  # not populated in this cell

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs['logits']
        preds = logits_to_star_rating(logits)

        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy()+1)  # labels 0..4 back to 1..5

# Report once, over the whole validation set. Printing inside the loop emitted
# a partial cumulative report after every batch. `labels` keeps the report
# aligned with target_names even if a class is absent, and `zero_division=0`
# silences the undefined-metric warnings for classes that are never predicted.
print(classification_report(all_labels, all_preds, labels=[1, 2, 3, 4, 5], digits=4,
                            target_names=["1星", "2星", "3星", "4星", "5星"], zero_division=0))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

          1星     0.0000    0.0000    0.0000         3
          2星     1.0000    0.5000    0.6667         2
          3星     0.0000    0.0000    0.0000         1
          4星     0.4000    0.1667    0.2353        12
          5星     0.5652    0.9286    0.7027        14

    accuracy                         0.5000        32
   macro avg     0.3930    0.3190    0.3209        32
weighted avg     0.4598    0.5000    0.4373        32

              precision    recall  f1-score   support

          1星     1.0000    0.2000    0.3333         5
          2星     0.5000    0.5000    0.5000         2
          3星     0.0000    0.0000    0.0000         1
          4星     0.6364    0.2414    0.3500        29
          5星     0.5319    0.9259    0.6757        27

    accuracy                         0.5312        64
   macro avg     0.5337    0.3735    0.3718        64
weighted avg     0.6065    0.5312    0.4853        64

              precisio