# 1.导入需要的工具包

In [None]:
import numpy as np
import pandas as pd
import random
import logging
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Silence every warning for the rest of the session. NOTE: this also hides
# warnings raised by the libraries used below, so remove it while debugging.
warnings.filterwarnings('ignore')

# Seed every random number generator used in this notebook so that runs are
# repeatable.
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

# The root logger defaults to the WARNING level, which would silently drop the
# INFO message emitted below; configure logging so the device report is shown.
logging.basicConfig(level=logging.INFO)

# Select the compute device: GPU `gpu` when CUDA is available, otherwise CPU.
# Set `gpu` to a negative value to force CPU execution.
gpu = 0
use_cuda = gpu >= 0 and torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(gpu)
    device = torch.device("cuda", gpu)
else:
    device = torch.device("cpu")
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

# 2.载入数据

In [None]:
# Read the tab-separated training file. It has no header row, so columns are
# addressed by position (columns 0 and 1 are used further down).
df = pd.read_csv('./train.tsv', sep='\t', header=None)
# Keep only the first 2000 rows to keep this example small.
batch_1 = df.iloc[:2000]
# Show how many rows carry each value of column 1 (positive vs. negative).
batch_1[1].value_counts()

# 3.载入预训练模型

In [None]:
# Choose the model class, its tokenizer class and the checkpoint name.
# DistilBERT is used by default.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# To use BERT instead of DistilBERT, replace the three assignments above with:
#   model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# 4.数据预处理

## 4.1 分词

In [None]:
def _encode(sentence):
    """Return the token ids of one sentence, with special tokens added."""
    return tokenizer.encode(sentence, add_special_tokens=True)


# Tokenize column 0 row by row; the result holds one id sequence per row.
tokenized = batch_1[0].apply(_encode)

## 4.2 padding

In [None]:
# Id written into the padded positions.
pad_id = 0
# Length of every tokenized sequence, in row order.
seq_lengths = np.array([len(ids) for ids in tokenized.values], dtype=int)
# Every row is right-padded to the longest sequence (0 when there are no rows).
max_len = int(seq_lengths.max()) if seq_lengths.size else 0
padded = np.array([ids + [pad_id] * (max_len - len(ids)) for ids in tokenized.values])
padded.shape
# Masking
# The attention mask (a.k.a. input mask) is 1 on real tokens and 0 on padding.
# It is derived from the sequence lengths instead of `padded != 0`, so a real
# token whose id happens to equal the pad id is not masked out by mistake.
attention_mask = (np.arange(max_len) < seq_lengths[:, None]).astype(int)
attention_mask.shape

# 5.使用BERT

In [None]:
# Running the model can basically be seen as one more embedding step: token
# ids go in, one hidden vector per token comes out.
input_ids = torch.tensor(padded, dtype=torch.long)
attention_mask = torch.tensor(attention_mask)

# Run on the device selected during setup (previously `device` was computed
# but never used, so the model always ran on the CPU).
model = model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# No gradients are needed for feature extraction.
with torch.no_grad():
    outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))

# Keep the first model output, copied back to the CPU so that it can be turned
# into a NumPy array. It is wrapped in a one-element tuple so that
# `last_hidden_states[0]` keeps working exactly as before.
last_hidden_states = (outputs[0].cpu(),)

In [None]:
# First model output: one hidden vector per token of every row.
hidden = last_hidden_states[0]
hidden.shape
# Use only the vector at position 0 of each sequence as that row's feature
# vector, converted to a NumPy array for scikit-learn.
features = hidden[:, 0, :].numpy()
features.shape
# Column 1 holds the labels.
labels = batch_1[1]

# 6.用机器学习的方法训练一个分类器

## 6.1划分训练集和测试集

In [None]:
# Split features and labels into train and test parts (default proportions).
# The shuffle is pinned to the notebook-wide seed so that re-running this cell
# always produces the same split.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=seed
)

In [None]:
# Search 20 evenly spaced values of the regularisation parameter C between
# 0.0001 and 100 with cross-validation on the training split.
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

# Report the winning setting and its mean cross-validated score.
print('best parameters: ', grid_search.best_params_)
print('best score: ', grid_search.best_score_)

In [None]:
# Train the final classifier with the hyper-parameters found by the grid
# search instead of a value copied from an earlier run, which goes stale as
# soon as the data or the split changes.
lr_clf = LogisticRegression(**grid_search.best_params_)
lr_clf.fit(train_features, train_labels)
# Mean accuracy on the held-out test split.
lr_clf.score(test_features, test_labels)

# 7.结果评估

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: cross-validate a classifier that ignores the
# features, using the same training split as the real model.
clf = DummyClassifier()
scores = cross_val_score(clf, train_features, train_labels)

# Report the mean score together with twice its standard deviation.
mean_score = scores.mean()
spread = scores.std() * 2
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (mean_score, spread))

# 8. 进行fine-tuning