In [1]:
from models.model import Model
from models.utils import Tokenizer, TextClassificationDataset, train_val_split, get_loader
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter
from utils import accuracy
from tqdm import tqdm
import os
import time

### data -> tokenizer -> encoding -> dataset -> dataloader ###

### data ###

In [36]:
# 1. Prepare the datasets
# NOTE(review): Python only registers 'ansi' as a codec alias on Windows; confirm the
# real code page of the test CSV before running this cell on another platform.
train_data_path = "../data/train_data4type(utf-8).csv"
test_data_path = "../data/test_data4type(ansi).csv"

df_train = pd.read_csv(train_data_path, encoding='utf-8')
df_test = pd.read_csv(test_data_path, encoding='ansi')

# Texts come from the 'combinedText' column, class labels from the 'type' column.
train_X = df_train['combinedText'].tolist()
train_y = df_train['type'].tolist()
test_X = df_test['combinedText'].tolist()
test_y = df_test['type'].tolist()

### tokenizer -> encoding ###

In [37]:
# One tokenizer instance is shared by all three splits.
tokenizer = Tokenizer()

# Carve a validation split out of the training data.
train_text, val_text, train_labels, val_labels = train_val_split(train_X, train_y)

# The test data goes through the same helper with test_size=0; the second partition
# of texts/labels is discarded (presumably empty -- verify against train_val_split).
test_text, _, test_labels, _ = train_val_split(test_X, test_y, test_size=0)

# Encode the splits in the order train -> val -> test.
train_encodings, val_encodings, test_encodings = map(
    tokenizer, (train_text, val_text, test_text)
)

### encoding -> dataset -> dataloader ###

In [38]:
# Pair every split's encodings with its labels and wrap each pair in a dataset,
# in the order train -> val -> test.
train_dataset, val_dataset, test_dataset = (
    TextClassificationDataset(encodings=split_encodings, labels=split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

### model -> train_loop ###

In [6]:
# Instantiate the project's Model from the local bert-base-chinese directory.
# The loading log printed below reports a BertForSequenceClassification being
# initialized from that checkpoint.
model = Model(model_path="../models/bert-base-chinese")

Some weights of the model checkpoint at ../models/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model che

In [9]:
def train(train_dataset, val_dataset, model, batch_size, num_epochs, checkpoint=False, step=100, learning_rate=1e-5,
          checkpoint_dir=r'../models/checkpoints', save_every=1):
    """Fine-tune ``model`` and log per-batch loss/accuracy to TensorBoard.

    Each epoch trains on batches from ``train_dataset`` and then evaluates on
    batches from ``val_dataset``. Both loops stop once the batch index exceeds
    ``step``, so at most ``step + 1`` batches are processed per loop.

    Args:
        train_dataset: Dataset passed to ``get_loader`` for training batches.
            Batches are indexed with 'input_ids', 'attention_mask' and 'labels'.
        val_dataset: Dataset passed to ``get_loader`` for validation batches.
        model: Model to train. It is moved to the selected device and updated
            in place. Its output must expose ``.loss`` and ``.logits``.
        batch_size: Batch size handed to ``get_loader``.
        num_epochs: Number of train + validation passes.
        checkpoint: Path of a state dict to load before training. Any falsy
            value (``False``, ``None``, ``""``) skips loading.
        step: Highest batch index processed in each loop.
        learning_rate: Learning rate for the AdamW optimizer.
        checkpoint_dir: Directory that receives the checkpoint files. Created
            if it does not exist.
        save_every: Save a checkpoint whenever the number of completed
            optimizer steps is a multiple of this value. The default of 1
            saves before every batch.

    Raises:
        ValueError: If ``save_every`` is smaller than 1.

    Side effects:
        * Writes ``model_checkpoint_utf8_{k}.pth`` into ``checkpoint_dir``,
          where ``k`` is the number of optimizer steps completed when the file
          was written. ``k`` keeps counting across epochs, so later epochs no
          longer overwrite the files of the first one.
        * Writes TensorBoard event files under ``logs/<timestamp>/``.
    """
    if save_every < 1:
        raise ValueError("save_every must be >= 1")

    # DataLoader
    train_loader = get_loader(train_dataset, batch_size)
    val_loader = get_loader(val_dataset, batch_size)

    # device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Model
    if checkpoint:
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
        state_dict = torch.load(checkpoint, map_location=device)
        model.load_state_dict(state_dict)
    model.to(device)

    # Optimizer
    optim = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # torch.save does not create missing directories.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # SummaryWriter
    datetime_str = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    logs_path = "logs/" + datetime_str
    writer1 = SummaryWriter(f"{logs_path}/train_loss")
    writer2 = SummaryWriter(f"{logs_path}/test_loss")
    writer3 = SummaryWriter(f"{logs_path}/train_acc")
    writer4 = SummaryWriter(f"{logs_path}/test_acc")

    # Progress-bar totals: a loop ends after `step + 1` batches or when the data
    # runs out, whichever comes first (ceil division covers a partial last batch).
    train_total = min(step + 1, -(-len(train_dataset) // batch_size))
    val_total = min(step + 1, -(-len(val_dataset) // batch_size))

    # These counters run across epochs so TensorBoard points and checkpoint
    # names from different epochs do not collide.
    global_step = 0  # optimizer steps completed so far
    val_step = 0     # validation batches logged so far

    try:
        # train_loop
        for _ in range(num_epochs):
            model.train()
            for i, batch in tqdm(enumerate(train_loader), total=train_total):
                # Saved before the update, so file k holds the weights after k steps.
                if global_step % save_every == 0:
                    checkpoint_name = f'model_checkpoint_utf8_{global_step}.pth'
                    torch.save(model.state_dict(), os.path.join(checkpoint_dir, checkpoint_name))
                if i > step:
                    break
                optim.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optim.step()
                with torch.no_grad():
                    train_acc = accuracy(outputs.logits, labels)
                    writer1.add_scalar("loss-step-train", loss.item(), global_step)
                    writer3.add_scalar("acc-step-train", train_acc, global_step)
                global_step += 1

            model.eval()
            with torch.no_grad():
                for i, batch in tqdm(enumerate(val_loader), total=val_total):
                    if i > step:
                        break
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                    test_acc = accuracy(outputs.logits, labels)
                    writer2.add_scalar("loss-step-test", outputs.loss.item(), val_step)
                    writer4.add_scalar("acc-step-test", test_acc, val_step)
                    val_step += 1
    finally:
        # Flush and release the event files even if training is interrupted.
        writer1.close()
        writer2.close()
        writer3.close()
        writer4.close()

In [10]:
# Short run: one epoch, batch size 16, batch indices 0..50, starting from the
# freshly initialized model (no checkpoint is loaded).
train(
    train_dataset,
    val_dataset,
    model,
    batch_size=16,
    num_epochs=1,
    checkpoint=False,
    step=50,
    learning_rate=1e-5,
)

51it [01:17,  1.53s/it]
51it [00:13,  3.89it/s]


### inference ###

In [11]:
def inference4acc(test_dataset, model, checkpoint_file_path, batch_size, shuffle=True):
    """Load a checkpoint into ``model`` and return its accuracy on ``test_dataset``.

    Args:
        test_dataset: Dataset passed to ``get_loader``. Batches are indexed
            with 'input_ids', 'attention_mask' and 'labels'.
        model: Model whose weights are overwritten in place with the
            checkpoint. It is moved to the selected device and left in eval mode.
        checkpoint_file_path: Path of a state dict written by ``torch.save``.
        batch_size: Batch size handed to ``get_loader``.
        shuffle: Forwarded to ``get_loader`` as its third positional argument.

    Returns:
        Whatever ``accuracy(predictions, targets)`` returns for the argmax
        class indices and the gold labels of the whole dataset.
    """
    # loader
    test_loader = get_loader(test_dataset, batch_size, shuffle)

    # device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # load model
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
    state_dict = torch.load(checkpoint_file_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    # inference
    predictions, targets = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            predictions.extend(predicted_labels.tolist())
            # Labels are only compared afterwards, so they never need to be on the device.
            targets.extend(batch['labels'].tolist())

    predictions = torch.tensor(predictions)
    targets = torch.tensor(targets)
    # NOTE(review): train() calls accuracy() with raw logits, while this call passes
    # class indices -- confirm that utils.accuracy accepts both forms.
    acc = accuracy(predictions, targets)

    return acc

In [39]:
# Test accuracy of the checkpoint numbered 40, evaluated with batch size 16.
inference4acc(
    test_dataset,
    model,
    checkpoint_file_path=f"../models/checkpoints/model_checkpoint_utf8_{40}.pth",
    batch_size=16,
)

100%|██████████| 290/290 [01:02<00:00,  4.61it/s]


0.8422303868597363

In [13]:
# Test accuracy of checkpoints 1..10, in order; `l1` is read by later cells.
l1 = [
    inference4acc(test_dataset, model, f"../models/checkpoints/model_checkpoint_utf8_{i}.pth", 16)
    for i in range(1, 11)
]

# A dedicated accumulator is used so the builtin `sum` is not rebound to a float
# and IPython's `_` (last output) is not used as a loop variable.
acc_total = 0
for checkpoint_acc in l1:
    acc_total += checkpoint_acc

# Mean accuracy over the evaluated checkpoints.
acc_total / len(l1)

100%|██████████| 290/290 [01:14<00:00,  3.90it/s]
100%|██████████| 290/290 [01:08<00:00,  4.21it/s]
100%|██████████| 290/290 [01:01<00:00,  4.69it/s]
100%|██████████| 290/290 [01:02<00:00,  4.68it/s]
100%|██████████| 290/290 [01:02<00:00,  4.67it/s]
100%|██████████| 290/290 [01:02<00:00,  4.67it/s]
100%|██████████| 290/290 [01:01<00:00,  4.69it/s]
100%|██████████| 290/290 [01:01<00:00,  4.69it/s]
100%|██████████| 290/290 [01:01<00:00,  4.68it/s]
100%|██████████| 290/290 [01:02<00:00,  4.67it/s]


0.6211368057056408

In [14]:
# Mean accuracy over every checkpoint except the first entry of `l1`.
# A dedicated accumulator keeps the builtin `sum` and IPython's `_` untouched.
tail_acc_total = 0
for checkpoint_acc in l1[1:]:
    tail_acc_total += checkpoint_acc
tail_acc_total / (len(l1) - 1)

0.645510650049228

In [15]:
# Display the per-checkpoint test accuracies collected in `l1`, in evaluation order.
l1

[0.4017722066133564,
 0.4640155608385563,
 0.46271882429219796,
 0.5076723578992868,
 0.58288307758807,
 0.6913767019667171,
 0.7434622865787768,
 0.7724227361141128,
 0.7910092932785823,
 0.7940350118867516]