In [2]:
#part2: bert feature-base
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as tfs
import warnings

warnings.filterwarnings('ignore')

In [3]:
train_df = pd.read_csv('train.tsv', delimiter='\t', header=None)

In [4]:
train_set = train_df[:100]

print("Train set shape:", train_set.shape)
train_set[1].value_counts()

Train set shape: (100, 2)


1    60
0    40
Name: 1, dtype: int64

In [7]:
model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
train_tokenized = train_set[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [23]:
train_max_len = 0
for i in train_tokenized.values:
    train_max_len = max(train_max_len, len(i))

train_padded = np.array([i + [0] * (train_max_len-len(i)) for i in train_tokenized.values])
print("train set shape:",train_padded.shape)

train set shape: (100, 54)


In [28]:
print(train_padded[0])
train_attention_mask = np.where(train_padded !=0, 1, 0)
print(train_attention_mask[0])

[  101  1037 18385  1010  6057  1998  2633 18276  2128 16603  1997  5053
  1998  1996  6841  1998  5687  5469  3152   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [29]:
# 训练集
input_ids = torch.tensor(train_padded).long()
attention_mask = torch.tensor(train_attention_mask).long()
with torch.no_grad():
    train_last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [38]:
train_features = train_last_hidden_states['pooler_output'].numpy()
train_labels = train_set[1]

In [40]:
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels)

In [41]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [42]:
lr_clf.score(test_features, test_labels)

0.76

## bert fine tuned

In [5]:
import torch
from torch import nn, optim
import transformers
import torch.nn.functional as F
import math

In [19]:
class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        self.tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
        self.base = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(768, 1)

    def forward(self,batch_sentences):
        text_dict = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
                                                    max_length=66, padding='max_length', return_tensors='pt')
        input_ids = text_dict['input_ids']
        attention_mask = text_dict['attention_mask']
        token_type_ids = text_dict['token_type_ids']
        output = self.base(input_ids, attention_mask, token_type_ids)
        output = self.linear(output['pooler_output'])
        return output

In [7]:
sentences = train_set[0].values
targets = train_set[1].values
train_inputs, test_inputs, train_targets, test_targets = train_test_split(sentences, targets)

batch_size = 64
batch_count = int(len(train_inputs) / batch_size)
batch_train_inputs, batch_train_targets = [], []
for i in range(batch_count):
    batch_train_inputs.append(train_inputs[i*batch_size : (i+1)*batch_size])
    batch_train_targets.append(train_targets[i*batch_size : (i+1)*batch_size])

In [22]:
#train the model
epochs = 3
lr = 0.01
print_every_batch = 1
model = BertClassificationModel()
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    print_avg_loss = 0
    for i in range(batch_count):
        inputs = batch_train_inputs[i]
        labels = torch.tensor(batch_train_targets[i])
        optimizer.zero_grad()
        output = model(inputs)
        loss = F.binary_cross_entropy_with_logits(output.view(-1), torch.tensor(labels, dtype=float))
        loss.backward()
        optimizer.step()
        
        print_avg_loss += loss.item()
        if i % print_every_batch == (print_every_batch-1):
            print("Batch: %d, Loss: %.4f" % ((i+1), print_avg_loss/print_every_batch))
            print_avg_loss = 0
        

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# eval the trained model
total = len(test_inputs)
hit = 0
with torch.no_grad():
    for i in range(total):
        outputs = bert_classifier_model([test_inputs[i]])
        _, predicted = torch.max(outputs, 1)
        if predicted == test_targets[i]:
            hit += 1

print("Accuracy: %.2f%%" % (hit / total * 100))

Accuracy: 90.53%
