In [8]:
import os
import sys
import logging
import time

import pandas as pd
import torch.optim as optim

import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

train = pd.read_csv("/content/labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv("/content/testData.tsv", header=0,
                    delimiter="\t", quoting=3)


class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        if labels:
            self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


class TestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, num_samples=0):
        self.encodings = encodings
        self.num_samples = num_samples

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return self.num_samples


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    train_texts, train_labels, test_texts = [], [], []
    for i, review in enumerate(train["review"]):
        train_texts.append(review)
        train_labels.append(train['sentiment'][i])

    for review in test['review']:
        test_texts.append(review)

    train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    train_dataset = TrainDataset(train_encodings, train_labels)
    val_dataset = TrainDataset(val_encodings, val_labels)
    test_dataset = TestDataset(test_encodings, num_samples=len(test_texts))

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    model.to(device)
    model.train()

    train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=24, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=24, shuffle=False)

    optim = optim.AdamW(model.parameters(), lr=5e-5)

    for epoch in range(3):
        start = time.time()
        train_loss, val_losses = 0, 0
        train_acc, val_acc = 0, 0
        n, m = 0, 0

        with tqdm(total=len(train_loader), desc="Epoch %d" % epoch) as pbar:
            for batch in train_loader:
                n += 1
                optim.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optim.step()
                train_acc += accuracy_score(torch.argmax(outputs.logits.cpu().data, dim=1), labels.cpu())
                train_loss += loss.cpu().item()  # 修改这里：使用.item()获取标量值

                # 修复这里：删除了错误的字符串，使用正确的格式化
                pbar.set_postfix({
                    'epoch': '%d' % (epoch),
                    'train loss': '%.4f' % (train_loss / n),
                    'train acc': '%.2f' % (train_acc / n)
                })
                pbar.update(1)

            with torch.no_grad():
                for batch in val_loader:
                    m += 1
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss = outputs.loss
                    val_acc += accuracy_score(torch.argmax(outputs.logits.cpu().data, dim=1), labels.cpu())
                    val_losses += val_loss.cpu().item()  # 修改这里：使用.item()获取标量值

            end = time.time()
            runtime = end - start

            # 修复最终的进度条显示
            pbar.set_postfix({
                'epoch': '%d' % (epoch),
                'train loss': '%.4f' % (train_loss / n),
                'train acc': '%.2f' % (train_acc / n),
                'val loss': '%.4f' % (val_losses / m),
                'val acc': '%.2f' % (val_acc / m),
                'time': '%.2f' % (runtime)
            })

    test_pred = []
    with torch.no_grad():
        with tqdm(total=len(test_loader), desc='Prediction') as pbar:
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                test_pred.extend(torch.argmax(outputs.logits.cpu().data, dim=1).numpy().tolist())
                pbar.update(1)

    result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
    result_output.to_csv("./result/distilbert_native.csv", index=False, quoting=3)
    logging.info('result saved!')

INFO:colab_kernel_launcher.py:running /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py-f/root/.local/share/jupyter/runtime/kernel-cd3dbb1d-3838-4131-85e4-419c8c998804.json
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████| 1667/1667 [17:21<00:00,  1.60it/s, epoch=0, train loss=0.2770, train acc=0.89, val loss=0.2216, val acc=0.91, time=1041.03]
Epoch 1: 100%|██████████| 1667/1667 [17:26<00:00,  1.59it/s, epoch=1, train loss=0.1486, train acc=0.94, val loss=0.2486, val acc=0.91, time=1046.69]
Epoch 2: 100%|██████████| 1667/1667 [17:25<00:00,  1.59it/s, epoch=2, train loss=0.0781, train acc=0.97, val loss=0.2585, val acc=0.92, time=1045.23]
Predictio

OSError: Cannot save file into a non-existent directory: 'result'

In [6]:
! unzip /content/labeledTrainData.tsv.zip

Archive:  /content/labeledTrainData.tsv.zip
  inflating: labeledTrainData.tsv    
