# 文本分类

In [4]:
from util import init_torch_device


# Torch device shared by the pipelines below (they read `device.type`).
# NOTE(review): `util` looks like a project-local helper; how it chooses the device is not visible here.
device = init_torch_device()

In [27]:
# import part
import numpy as np
# dataset management
from datasets import load_dataset
# model
from sklearn.linear_model import LogisticRegression
# pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline

# evaluate
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
# 进度条
from tqdm import tqdm
# 调试
from rich import inspect

# openai （deepseek 兼容）
import openai

In [6]:
!python -m pip install datasets



In [7]:

# Load the "rotten_tomatoes" dataset; the cells below read its 'train' and 'test' splits.
data = load_dataset("rotten_tomatoes")


In [8]:
# Check the format of the loaded data

inspect(data)

## 模型选择

* 常见选择：
  * BERT base ci
  * RoBERTa base
  * DistilBERT base ci
  * DeBERTa base
  * bert-tiny
  * ALBERT base v2

`Twitter-roBERTa-base for Sentiment Analysis`



In [9]:

# Use a task-specific model
model_name = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# No task is passed explicitly; only the model/tokenizer name is given.
# NOTE(review): `return_all_scores` is reported as deprecated in recent
# transformers releases in favour of `top_k=None` — confirm before upgrading,
# because the prediction cell below indexes the returned scores by position.
pipe = pipeline(
    model=model_name,
    tokenizer=model_name,
    return_all_scores=True,
    device=device.type
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps


In [10]:
# Run the sentiment pipeline over the test reviews and reduce each result to a
# binary prediction: 0 = negative, 1 = positive.
y_pred = []
for output in tqdm(pipe(KeyDataset(data['test'], 'text')), total=len(data['test'])):
    # Look the scores up by label name instead of list position, so the
    # prediction does not depend on the order in which the pipeline returns
    # the labels (e.g. index order vs. sorted by score).
    scores = {item['label'].lower(): item['score'] for item in output}
    negative_score = scores['negative']
    positive_score = scores['positive']
    # Any other label (such as a neutral class) is ignored; a tie resolves to 0.
    assignment = np.argmax([negative_score, positive_score])
    y_pred.append(assignment)


100%|██████████| 1066/1066 [00:13<00:00, 76.22it/s]


In [11]:
# evaluation
def evaluate_performance(y_true, y_pred):
    """Print a classification report for the two review classes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.
    """
    class_names = ['Negative Review', 'Positive Review']
    report = classification_report(y_true, y_pred, target_names=class_names)
    print(report)


In [12]:
# Compare the predictions in `y_pred` with the test-split labels.
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



In [13]:
# General-purpose embedding model + a trained classifier
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the review texts of both splits; the labels are not used at this step.
train_embeddings = model.encode(data['train']['text'], show_progress_bar=True)
test_embeddings = model.encode(data['test']['text'], show_progress_bar=True)

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [14]:
# Display the dimensions of the training embedding array.
train_embeddings.shape


(8530, 768)

In [15]:

# Fit a logistic-regression classifier on the training embeddings and labels.
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data['train']['label'])



  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


In [16]:
# Predict labels for the test embeddings with the fitted classifier and report the metrics.
y_pred = clf.predict(test_embeddings)
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



  ret = a @ b
  ret = a @ b
  ret = a @ b


In [17]:
# Zero-shot classification: no classifier is fitted in this cell

# Label embeddings
label_embeddings = model.encode(['A negative review', 'A positive review'])

# Similarity between every test embedding and each of the two label embeddings.
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
inspect(sim_matrix)

# For each row, pick the index of the most similar label description.
y_pred = np.argmax(sim_matrix, axis=1)
evaluate_performance(data['test']['label'], y_pred)

  ret = a @ b
  ret = a @ b
  ret = a @ b


                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



In [18]:
# T5 text-to-text transfer transformer
# encoder-decoder model, set aside for later use
# NOTE: this rebinds `model_name` and `pipe`, replacing the sentiment pipeline created earlier.
model_type = 'text2text-generation'
model_name = 'google/flan-t5-small'
pipe = pipeline(
    model_type,
    model=model_name,
    device=device.type
)

Device set to use mps


## T5

### 训练步骤

1. 掩码语言建模预训练。特点：**词元跨度**（token span）掩码，而不是传统的单词元掩码
2. 任务转为序列到序列任务并同时训练。即，任务转为文本指令

类似于提示工程的方式

In [21]:
# text classification task
# Prefix every review with the instruction and store the result in a new 't5' column.
prompt = 'Is the following review positive or negative?'
data = data.map(lambda item: {'t5': f'{prompt} {item["text"]}'})

# NOTE(review): this bare `data` is not the last expression of the cell, so it displays nothing.
data
data['train'][0]

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1,
 't5': 'Is the following review positive or negative? the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}

In [24]:
# Collect the generated answer for every prompted test review.
text_pred = [
    output[0]['generated_text']
    for output in tqdm(pipe(KeyDataset(data['test'], 't5')), total=len(data['test']))
]

# Only the exact string 'negative' maps to label 0; everything else becomes 1.
y_pred = [int(text != 'negative') for text in text_pred]

# Count generations that are neither of the two expected answers.
not_except_text = [
    text for text in text_pred
    if text != 'negative' and text != 'positive'
]
print(len(not_except_text))

100%|██████████| 1066/1066 [01:19<00:00, 13.39it/s]

0





In [25]:
# Compare the predictions in `y_pred` with the test-split labels.
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.87      0.85       533
Positive Review       0.86      0.82      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



- [ ] 用 deepseek 接口测试

In [26]:
!python -m pip install openai



In [28]:
# deepseek
# NOTE(review): `deepseek` appears to be a local module that supplies the endpoint URL and
# API key, keeping the credentials out of the notebook — confirm it is not committed.
from deepseek import base_url, api_key

# OpenAI client pointed at the imported base URL.
client = openai.OpenAI(base_url=base_url, api_key=api_key)


In [32]:
def deepseek_chat(prompt, document, model='deepseek-chat'):
    """Send one document to the chat model and return its answer as text.

    Args:
        prompt: Instruction template; every '[DOCUMENT]' placeholder in it is
            replaced with ``document``.
        document: Text inserted into the template.
        model: Model name passed to the chat-completions endpoint.

    Returns:
        The content of the first returned choice with surrounding whitespace
        removed, or an empty string when the response carries no content.
    """
    messages = [{
        'role': 'system',
        'content': 'You are a helpful assistant.'
    }, {
        'role': 'user',
        'content': prompt.replace('[DOCUMENT]', document)
    }]
    chat_completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0
    )
    content = chat_completion.choices[0].message.content
    # Callers compare the answer with exact strings such as '0' and '1', so a
    # stray space or newline must not turn a valid answer into a mismatch.
    return (content or '').strip()


In [30]:
# Instruction template for the chat model; '[DOCUMENT]' is replaced with the
# review text before the request is sent. Typos in the instruction text
# ("move review", "positie") are corrected because this string is sent verbatim.
prompt = '''Predict whether the following review is a positive or negative movie review:

[DOCUMENT]

If it is a positive review return 1 and if it is a negative review return 0. Do not give any other answers.
'''

In [34]:
# debug: send a single hand-written review through the chat helper
document = 'unpretentious, charming, quirky, original'
deepseek_chat(prompt, document)

'1'

In [None]:
# Run the following during off-peak hours (00:30-08:30)
predications = [
    deepseek_chat(prompt, review)
    for review in tqdm(data['test']['text'], total=len(data['test']))
]

# Count answers that are neither '0' nor '1'.
not_except_pred = [
    pred for pred in predications
    if pred != '0' and pred != '1'
]
print(len(not_except_pred))

# Only the exact answer '0' maps to label 0; every other answer becomes 1.
y_pred = [int(pred != '0') for pred in predications]
evaluate_performance(data['test']['label'], y_pred)
