In [None]:
# Colab-only: opens a file picker for uploading kaggle.json (Kaggle API credentials).
from google.colab import files
# Bind the result so the returned dict (file name -> raw file bytes, i.e. the API key)
# is not echoed into the saved cell output.
uploaded = files.upload() #upload kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [None]:
# Download the competition files (needs the credentials installed in ~/.kaggle).
!kaggle competitions download -c litbank-ozon-2020

Downloading train_sents.csv.zip to /content
  0% 0.00/387k [00:00<?, ?B/s]
100% 387k/387k [00:00<00:00, 51.0MB/s]
Downloading test_sents_without_answers.csv to /content
  0% 0.00/437k [00:00<?, ?B/s]
100% 437k/437k [00:00<00:00, 71.6MB/s]


In [None]:
!unzip '/content/train_sents.csv.zip'

Archive:  /content/train_sents.csv.zip
  inflating: train_sents.csv         


In [None]:
import pandas as pd

In [None]:
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Parse train_sents.csv into train_list: one [tokens, tags] pair per sentence.
# After the first line, rows are read as "token,tag"; an empty line ends a sentence.
train_list = []
with open('train_sents.csv') as file:
    file.readline()  # skip the first (header) line
    cur_sent_list = []
    cur_tags_list = []
    for line in file:
        if line == '\n':
            train_list.append([cur_sent_list, cur_tags_list])
            cur_sent_list = []
            cur_tags_list = []
            continue
        # rstrip instead of line[:-1]: a final line without '\n' keeps its last character
        cur_list = line.rstrip('\n').split(',')
        if len(cur_list) == 2:
            cur_sent_list.append(cur_list[0])
            cur_tags_list.append(cur_list[1])
        else:
            # Any other field count is treated as the comma token itself, tagged 'O'
            cur_sent_list.append(',')
            cur_tags_list.append('O')
    # Flush the last sentence when the file does not end with a blank line
    if cur_sent_list:
        train_list.append([cur_sent_list, cur_tags_list])

In [None]:
# Load the same training file as a DataFrame; it is used below only to collect the tag set.
df = pd.read_csv('train_sents.csv')

In [None]:
# Distinct tag strings. np.unique returns them sorted, so the ids assigned below
# follow the alphabetical order of the tag names.
# NOTE(review): with sorted names 'O' is not guaranteed to get id 0, while My_Loss
# treats label 0 as the negative class — verify which tag ends up with id 0.
names = np.unique(df['tag'])

# Earlier hand-written list, kept for reference (7 entries; the data yields 13 tags):
# names = [
#          'O',
#          'FAC',
#          'GPE',
#          'LOC',
#          'ORG',
#          'PER',
#          'VEH'
# ]

In [None]:
# Integer id -> tag name, numbered in the order the names appear in `names`.
id_to_name = dict(enumerate(names))

In [None]:
# Inverse lookup: tag name -> integer id.
name_to_id = {tag_name: tag_id for tag_id, tag_name in id_to_name.items()}

In [None]:
# Replace tag names with integer ids, in place. Entries that are no longer strings
# (already converted) are left untouched, so re-running this cell does not raise KeyError.
for value in train_list:
    value[1] = [name_to_id[tag] if isinstance(tag, str) else tag for tag in value[1]]

In [None]:
from gensim.models import word2vec

In [None]:
# Token lists only (tags dropped); used as the word2vec training corpus.
train_tokens = [sentence_and_tags[0] for sentence_and_tags in train_list]

In [None]:
# Parse the test file into test_list: one list of tokens per sentence.
# After the first line, rows are read as "id,token"; an empty line ends a sentence.
test_list = []
with open('/content/test_sents_without_answers.csv') as file:
    file.readline()  # skip the first (header) line
    cur_sent_list = []
    for line in file:
        if line == '\n':
            test_list.append(cur_sent_list)
            cur_sent_list = []
            continue
        # rstrip instead of line[:-1]: a final line without '\n' keeps its last character
        cur_list = line.rstrip('\n').split(',')
        if len(cur_list) == 2:
            cur_sent_list.append(cur_list[1])
        else:
            # Any other field count is treated as the comma token itself
            cur_sent_list.append(',')
    # Flush the last sentence when the file does not end with a blank line
    if cur_sent_list:
        test_list.append(cur_sent_list)

# BiLSTM

In [None]:
# Train 300-dimensional word2vec embeddings on the train + test sentences.
# min_count=0 keeps every token, however rare.
# NOTE(review): `size=` is the keyword of gensim releases before 4.0 (renamed
# `vector_size` later) — confirm the installed version. No seed is passed and
# workers=4, so the vectors are not reproducible between runs.
w2v = word2vec.Word2Vec(train_tokens + test_list,
                        workers=4,
                        size=300,
                        min_count=0)

In [None]:
class NER_Train_Dataset(Dataset):
    """Groups sentences by length so that each item is a ready-made batch.

    Item ``idx`` contains every sentence of one particular length, which lets
    them be stacked into a single tensor without padding.

    Args:
        train_list: iterable of ``[tokens, tag_ids]`` pairs.
        w2v_model: word -> vector lookup. Either an object exposing ``.wv``
            (gensim Word2Vec) or anything indexable by word.
        max_sentence_lenght: accepted for interface compatibility; not used.
        num_features: embedding dimension; stored, not otherwise used here.
    """
    def __init__(self, train_list, w2v_model, max_sentence_lenght, num_features):
        #embedding dim = 300
        self.w2v_model = w2v_model
        # Index through `.wv` when present: indexing the Word2Vec model object
        # directly is not supported by newer gensim releases.
        self.word_vectors = getattr(w2v_model, 'wv', w2v_model)
        self.num_features = num_features
        lens_to_sent_dict = {}
        for x in train_list:
            lens_to_sent_dict.setdefault(len(x[0]), []).append(x)
        self.lens_to_sent_dict = lens_to_sent_dict
        # Distinct sentence lengths, in order of first appearance
        self.idxes = list(lens_to_sent_dict.keys())

    def __len__(self):
        """Number of batches, i.e. of distinct sentence lengths."""
        return len(self.idxes)

    def __getitem__(self, idx):
        """Return ``(embeddings, labels)`` for the idx-th length bucket.

        embeddings: FloatTensor (sentences, length, embedding dim)
        labels:     LongTensor  (sentences, length)
        Raises IndexError past the last bucket, which is what ends plain
        ``for`` iteration over the dataset.
        """
        curr_list = self.lens_to_sent_dict[self.idxes[idx]]
        # Build one contiguous float32 array first; converting nested lists of
        # ndarrays straight to a tensor is far slower.
        curr_tokens = np.asarray(
            [[self.word_vectors[word] for word in x[0]] for x in curr_list],
            dtype=np.float32)
        curr_labels = [x[1] for x in curr_list]
        return torch.from_numpy(curr_tokens), torch.LongTensor(curr_labels)

In [None]:
# The first 50 * 113 sentences are used for training, the remainder for validation.
split_index = 50 * 113
train_set = NER_Train_Dataset(train_list[:split_index], w2v, 307, 300)
val_set = NER_Train_Dataset(train_list[split_index:], w2v, 307, 300)

In [None]:
class My_Loss(nn.Module):
    """Token-level cross-entropy with hard-negative mining.

    Tokens labelled ``negative_label`` are negatives; every other non-negative
    label is a positive. All positives contribute to the loss, while only the
    ``k * num_pos`` negatives with the largest loss are kept. The sum is
    divided by the number of tokens kept.

    Args:
        k: negatives kept per positive token.
        negative_label: id of the negative ("outside") class; defaults to 0.
    """
    def __init__(self, k, negative_label=0):
        super(My_Loss, self).__init__()
        # reduction='none' is the current spelling of the deprecated reduce=False
        self.criterion = nn.CrossEntropyLoss(reduction='none')
        self.k = k
        self.negative_label = negative_label

    def forward(self, pred, true):
        """pred: (N, seq, classes) logits; true: (N, seq) label ids. Returns a 0-dim tensor."""
        # Follow the device of the logits instead of relying on a notebook-level global
        true = true.to(pred.device)
        losses = self.criterion(pred.reshape(-1, pred.shape[2]),
                                true.reshape(-1)).view(true.shape)
        neg_mask = true == self.negative_label
        # `true >= 0` keeps the original "label > 0" meaning for the default negative_label=0
        pos_mask = (true >= 0) & ~neg_mask
        pos_loss = losses[pos_mask].sum()
        neg_losses = losses[neg_mask]
        num_pos = int(pos_mask.sum().item())
        # Never ask topk for more negatives than exist
        num_neg = min(num_pos * self.k, neg_losses.numel())
        neg_loss = torch.topk(neg_losses, num_neg)[0].sum()
        if num_pos + num_neg == 0:
            # Nothing selected: both sums are zero; avoid dividing by zero
            return pos_loss + neg_loss
        return (pos_loss + neg_loss) / (num_pos + num_neg)

In [None]:
class Simple_Bilstm(nn.Module):
    """Stacked bidirectional LSTM followed by a per-token linear classifier."""

    def __init__(self, num_features, hidden_size, num_layers, num_entities):
        super().__init__()
        self.BiLstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, hence 2 * hidden_size inputs
        self.Output = nn.Linear(2 * hidden_size, num_entities)

    def forward(self, x):
        """Map (batch, seq, num_features) inputs to (batch, seq, num_entities) logits."""
        lstm_out, _ = self.BiLstm(x)
        return self.Output(lstm_out)

In [None]:
def accuracy(pred, true):
    """Fraction of positions whose arg-max class equals the label.

    pred: N x seq x class scores
    true: N x seq label ids
    Returns a 0-dim float tensor.
    """
    correct = torch.argmax(pred, 2) == true
    # Average as floats: the result of dividing an integer count tensor has
    # differed between PyTorch releases.
    return correct.float().mean()

In [None]:
def train(train_loader,
          model,
          criterion,
          optimizer,
          num_epoch,
          val_loader,
          gradient_clip):
    """Train `model` for `num_epoch` epochs, printing validation metrics each epoch.

    Both loaders must yield ``(inputs, labels)`` pairs and support ``len()``.
    Gradients are clipped to an L2 norm of `gradient_clip`. Returns None.

    Note: a later cell defines another function named `train` with a different
    signature; once that cell has run, this definition is replaced.
    """
    # Follow the model's device instead of relying on a notebook-level global
    device = next(model.parameters()).device
    for epoch in range(1, num_epoch + 1):
        train_loss = 0.0
        val_loss = 0.0
        val_acc = 0.0

        model.train()
        for data, true_class in tqdm(train_loader,
                        position=0,
                        leave=True,
                        mininterval=2):
            true_class = true_class.to(device)
            optimizer.zero_grad()
            pred = model(data.to(device))
            loss = criterion(pred, true_class)
            loss.backward()
            # clip_grad_norm_ is the in-place successor of the deprecated clip_grad_norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        with torch.no_grad():
            for data, true_class in tqdm(val_loader,
                                       position=0,
                                       leave=True,
                                       mininterval=2):
                true_class = true_class.to(device)
                pred = model(data.to(device))
                # .item() keeps the running totals as Python floats, not device tensors
                val_loss += criterion(pred, true_class).item()
                val_acc += accuracy(pred, true_class).item()

        # max(..., 1) avoids ZeroDivisionError for an empty loader
        print(f'Train Loss on epoch {epoch}: {train_loss / max(len(train_loader), 1)}')
        print(f'Val Loss on epoch {epoch}: {val_loss / max(len(val_loader), 1)}')
        print(f'Val Accuracy: {val_acc / max(len(val_loader), 1)}')

In [None]:
# Rebuilds the same train/validation datasets as the earlier cell:
# the first 50 * 113 sentences train, the remainder validates.
split_index = 50 * 113
train_set = NER_Train_Dataset(train_list[:split_index], w2v, 307, 300)
val_set = NER_Train_Dataset(train_list[split_index:], w2v, 307, 300)

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# BiLSTM tagger: 300-d inputs, hidden size 300, 5 stacked layers.
# The output size is taken from the tag vocabulary instead of a hard-coded 13.
model = Simple_Bilstm(300, 300, 5, len(id_to_name)).to(device)
criterion = My_Loss(1)  # keep one hard negative per positive token
gradient_clip = 5.0
optimizer = torch.optim.Adam(model.parameters(), lr=0.0015)

In [None]:
# The datasets are passed directly (no DataLoader): each item is already a full
# batch of equal-length sentences, visited in a fixed order every epoch.
train(train_set, model, criterion, optimizer, 5, val_set, gradient_clip)

100%|██████████| 119/119 [00:10<00:00, 10.95it/s]
100%|██████████| 98/98 [00:02<00:00, 36.85it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

Train Loss on epoch 1: 0.660460274478718
Val Loss on epoch 1: 0.6986675262451172
Val Accuracy: 0.8516281387027429


100%|██████████| 119/119 [00:10<00:00, 11.21it/s]
100%|██████████| 98/98 [00:02<00:00, 36.52it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

Train Loss on epoch 2: 0.6006453115915676
Val Loss on epoch 2: 0.7267576456069946
Val Accuracy: 0.8516281387027429


100%|██████████| 119/119 [00:10<00:00, 10.95it/s]
100%|██████████| 98/98 [00:02<00:00, 36.29it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

Train Loss on epoch 3: 0.5561891051523873
Val Loss on epoch 3: 0.6692062020301819
Val Accuracy: 0.8516281387027429


100%|██████████| 119/119 [00:10<00:00, 11.10it/s]
100%|██████████| 98/98 [00:02<00:00, 36.97it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

Train Loss on epoch 4: 0.529724503157586
Val Loss on epoch 4: 0.6473711729049683
Val Accuracy: 0.8516281387027429


100%|██████████| 119/119 [00:10<00:00, 11.16it/s]
100%|██████████| 98/98 [00:02<00:00, 36.63it/s]

Train Loss on epoch 5: 0.517044269411087
Val Loss on epoch 5: 0.6338331699371338
Val Accuracy: 0.8516281387027429





Без CRF результат слабый: модель размечает все токены как 'O' (точность на валидации не меняется от эпохи к эпохе — 0.8516).

# BERT

Часть кода для предобработки взята из https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

In [None]:
# Install Hugging Face transformers.
# NOTE(review): the version is not pinned — pin it for reproducibility.
!pip install -qq transformers

[K     |████████████████████████████████| 1.5MB 11.1MB/s 
[K     |████████████████████████████████| 2.9MB 35.1MB/s 
[K     |████████████████████████████████| 890kB 41.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Re-binds `device` (a plain string in the BiLSTM section) to a torch.device object.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import transformers
from transformers import  BertForTokenClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Checkpoint used for both the tokenizer and the model.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Every sequence is padded / truncated to this many sub-word tokens.
MAX_LEN = 75
# Mini-batch size for the DataLoaders.
bs = 32

In [None]:
# Tokenizer matching the chosen checkpoint (the vocabulary is downloaded on first use).
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
# Rough check of how many sub-word tokens a long passage produces, to judge MAX_LEN.
# The backslashes are written as explicit '\\' escapes: the text is unchanged at
# runtime, but sequences such as '\O' are invalid escapes and trigger warnings.
sample_text = (
    "Once upon a midnight dreary, while I pondered, weak and weary, "
    "\\Over many a quaint and curious volume of forgotten lore — "
    "\\While I nodded, nearly napping, suddenly there came a tapping, "
    "\\As of some one gently rapping, rapping at my chamber door. "
    "'Tis some visitor,' I muttered, 'tapping at my chamber door— "
    "\\Only this and nothing more."
)
len(tokenizer.tokenize(sample_text))

87

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split each word into sub-word tokens and repeat its label once per sub-word.

    Uses the notebook-level `tokenizer`. Returns ``(tokens, labels)``, two flat
    lists of equal length. A word that yields no sub-words contributes nothing
    to either list.
    """
    pieces = []
    piece_labels = []
    for token, tag in zip(sentence, text_labels):
        subwords = tokenizer.tokenize(token)
        pieces += subwords
        # Every sub-word inherits the label of the word it came from
        piece_labels += [tag] * len(subwords)
    return pieces, piece_labels

In [None]:
# Sub-word tokenize every training sentence, expanding its tag ids to match.
# Each element is a (tokens, labels) pair.
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in train_list
]

In [None]:
# Separate the (tokens, labels) pairs into two parallel lists.
tokenized_texts = [tokens for tokens, _ in tokenized_texts_and_labels]
labels = [tag_ids for _, tag_ids in tokenized_texts_and_labels]

In [None]:
# Convert tokens to vocabulary ids, then cut / right-pad every sequence to MAX_LEN
# using 0 as the padding id.
# NOTE(review): no [CLS] / [SEP] special tokens are added here — confirm that is intended.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
# Number of distinct tags; this value is also used as the padding label id below.
len(id_to_name)

13

In [None]:
# Pad / truncate the label sequences exactly like the inputs. Padding positions get
# the extra id len(id_to_name), one past the real tag ids.
tags = pad_sequences(labels,
                     maxlen=MAX_LEN, value=len(id_to_name), padding="post",
                     dtype="long", truncating="post")

In [None]:
# 1.0 for real tokens, 0.0 for padding (padding id is 0).
attention_masks = [
    [1.0 if token_id != 0.0 else 0.0 for token_id in row]
    for row in input_ids
]

In [None]:
# Split inputs, tags and masks in ONE call, so the three stay aligned by construction
# rather than through two separate calls that happen to share a random_state.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(
    input_ids, tags, attention_masks,
    random_state=42, test_size=0.1)

In [None]:
# Convert every split to a torch tensor.
tr_inputs, val_inputs = torch.tensor(tr_inputs), torch.tensor(val_inputs)
tr_tags, val_tags = torch.tensor(tr_tags), torch.tensor(val_tags)
tr_masks, val_masks = torch.tensor(tr_masks), torch.tensor(val_masks)

In [None]:
# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# Validation batches are visited in a fixed, sequential order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
# Pre-trained BERT with a token-classification head.
# num_labels is one more than the tag count because padded positions carry the
# extra label id len(id_to_name). This re-binds `model` (the BiLSTM before).
model = BertForTokenClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=len(id_to_name) + 1,
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
# Two parameter groups: weight decay for weights, none for biases and LayerNorm.
# The per-group key must be 'weight_decay'. Unrecognised keys such as
# 'weight_decay_rate' are stored but never read by the optimizer, so no decay is applied.
param_optimizer = list(model.named_parameters())
# 'gamma' / 'beta' cover legacy parameter names; 'LayerNorm.weight' is the name
# used by current BERT implementations.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0}
]

In [None]:
# AdamW over the two parameter groups. This re-binds `optimizer` (Adam before).
# NOTE(review): this AdamW is the one imported from `transformers` — confirm the
# installed release still provides it.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [None]:
# Fine-tuning length and gradient-clipping threshold.
epochs = 3
max_grad_norm = 1.0

# One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
# Move the BERT model to the selected device.
model = model.to(device)

In [None]:
def bert_accuracy(pred, true, pad_lable):
    """Accuracy over the positions whose label is not the padding label.

    pred: (N, seq, classes) scores; true: (N, seq) label ids;
    pad_lable: label id marking padded positions, which are ignored.
    Returns a 0-dim float tensor.
    """
    keep = true != pad_lable
    hits = torch.argmax(pred, dim=2) == true
    # Average as floats: the result of dividing integer count tensors has
    # differed between PyTorch releases.
    return hits[keep].float().mean()

In [None]:
def train(train_loader,
          model,
          optimizer,
          num_epoch,
          val_loader,
          scheduler,
          max_grad_norm,
          pad_lable):
    """Fine-tune a token-classification model and print per-epoch metrics.

    Each batch is indexed as ``batch[0]`` input ids, ``batch[1]`` attention
    mask, ``batch[2]`` labels. The model is called with ``labels=...`` and its
    output is read as ``[0]`` loss and ``[1]`` logits. `scheduler` is stepped
    once per batch; `pad_lable` is the label id excluded from accuracy.
    Returns None.

    Note: this replaces the earlier BiLSTM `train` defined above in the notebook.
    """
    # Follow the model's device instead of relying on a notebook-level global
    device = next(model.parameters()).device
    for epoch in range(1, num_epoch + 1):
        model.train()
        train_loss = 0.0
        val_loss = 0.0
        val_acc = 0.0
        train_acc = 0.0
        for batch in tqdm(train_loader,
                        position=0,
                        leave=True,
                        mininterval=2):
            # Move each tensor to the device once per batch
            b_ids, b_mask, b_labels = (batch[i].to(device) for i in range(3))
            optimizer.zero_grad()
            pred = model(b_ids, token_type_ids=None,
                         attention_mask=b_mask, labels=b_labels)
            loss = pred[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            train_loss += loss.item()
            scheduler.step()
            train_acc += bert_accuracy(pred[1], b_labels, pad_lable).item()
        model.eval()
        with torch.no_grad():
            for batch in tqdm(val_loader,
                                       position=0,
                                       leave=True,
                                       mininterval=2):
                b_ids, b_mask, b_labels = (batch[i].to(device) for i in range(3))
                pred = model(b_ids, token_type_ids=None,
                             attention_mask=b_mask, labels=b_labels)
                # .item() keeps the running total as a Python float, not a device tensor
                val_loss += pred[0].item()
                val_acc += bert_accuracy(pred[1], b_labels, pad_lable).item()

        print(f'Train Loss on epoch {epoch}: {train_loss / len(train_loader)}')
        print(f'Val Loss on epoch {epoch}: {val_loss / len(val_loader)}')
        print(f'Val Accuracy: {val_acc / len(val_loader)}')
        print(f'Train Accuracy: {train_acc / len(train_loader)}')

In [None]:
# Fine-tune BERT; the last argument is the padding label id (len(id_to_name)).
train(train_dataloader, model, optimizer, epochs, valid_dataloader, scheduler, max_grad_norm, len(id_to_name))

100%|██████████| 194/194 [01:25<00:00,  2.26it/s]
100%|██████████| 22/22 [00:03<00:00,  6.73it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Train Loss on epoch 1: 0.36427823250594826
Val Loss on epoch 1: 0.21760499477386475
Val Accuracy: 0.930943405086344
Train Accuracy: 0.8966127188549828


100%|██████████| 194/194 [01:25<00:00,  2.26it/s]
100%|██████████| 22/22 [00:03<00:00,  6.71it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Train Loss on epoch 2: 0.17400003155482183
Val Loss on epoch 2: 0.18087607622146606
Val Accuracy: 0.9423060986128721
Train Accuracy: 0.9459532047669912


100%|██████████| 194/194 [01:25<00:00,  2.27it/s]
100%|██████████| 22/22 [00:03<00:00,  6.72it/s]

Train Loss on epoch 3: 0.11862590832194102
Val Loss on epoch 3: 0.18809418380260468
Val Accuracy: 0.9447241208770059
Train Accuracy: 0.9636923477207262





In [None]:
# Sanity check: shape of the input-id tensor of one training batch.
next(iter(train_dataloader))[0].shape

torch.Size([32, 75])

In [None]:
# Re-parse the test file, this time keeping the row id of every token.
# test_list is re-bound: each element is now [tokens, ids] instead of a token list.
test_list = []
with open('/content/test_sents_without_answers.csv') as file:
    file.readline()  # skip the first (header) line
    cur_sent_list = []
    cur_id_list = []
    for line in file:
        if line == '\n':
            test_list.append([cur_sent_list, cur_id_list])
            cur_sent_list = []
            cur_id_list = []
            continue
        # rstrip instead of line[:-1]: a final line without '\n' keeps its last character
        cur_list = line.rstrip('\n').split(',')
        cur_id_list.append(cur_list[0])
        # Any field count other than 2 is treated as the comma token itself
        cur_sent_list.append(cur_list[1] if len(cur_list) == 2 else ',')
    # Flush the last sentence when the file does not end with a blank line
    if cur_sent_list:
        test_list.append([cur_sent_list, cur_id_list])

In [None]:
# Inspect the first parsed test sentence: [tokens, ids].
test_list[0]

[['PROLOGUE',
  'IT',
  'was',
  '2',
  'p.m.',
  'on',
  'the',
  'afternoon',
  'of',
  'May',
  '7',
  ',',
  '1915',
  '.'],
 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']]

In [None]:
# Reuse the label-preserving tokenizer with row ids in place of labels, so every
# sub-word carries the id of the word it came from.
tokenized_texts_and_ids = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in test_list
]

In [None]:
# Separate the (tokens, ids) pairs into two parallel lists.
# `tokenized_texts` is re-bound here and now refers to the test sentences.
tokenized_texts = [tokens for tokens, _ in tokenized_texts_and_ids]
ids = [row_ids for _, row_ids in tokenized_texts_and_ids]

In [None]:
# Same conversion and padding as for training; `input_ids` is re-bound to the test data.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
# Pad / truncate the per-sub-word row ids like the inputs; -1 marks padding positions.
# NOTE(review): `ids` holds strings read from the CSV; this relies on the conversion
# to dtype "long" parsing them as integers — confirm.
test_ids = pad_sequences(ids,
                     maxlen=MAX_LEN, value=-1, padding="post",
                     dtype="long", truncating="post")

In [None]:
# 1.0 for real tokens, 0.0 for padding (padding id is 0); re-bound for the test data.
attention_masks = [
    [1.0 if token_id != 0.0 else 0.0 for token_id in row]
    for row in input_ids
]

In [None]:
# Move to the same device as the model; a hard-coded .cuda() would fail on a
# CPU-only runtime even though `device` already falls back to CPU.
input_ids = torch.tensor(input_ids).to(device)
attention_masks = torch.tensor(attention_masks).to(device)

In [None]:
# Make evaluation mode explicit (disables dropout) rather than depending on the
# state the training loop happened to leave the model in.
model.eval()
with torch.no_grad():
    # No labels are passed, so output[0] holds the logits
    output = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
# Most likely label id for every sub-word position
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [None]:
# Sanity check: one predicted label id per (sentence, position).
label_indices.shape

(1669, 75)

In [None]:
# Must match label_indices.shape so that ids and predictions can be zipped together.
test_ids.shape

(1669, 75)

In [None]:
# Build the submission: one row per test token id, defaulting to 'O'.
# NOTE(review): 41365 is presumably the number of token rows in the test file — confirm.
n_answer_rows = 41365
answer_tags = ['O'] * n_answer_rows

# For every word keep the prediction of its FIRST sub-word: consecutive positions that
# share an id belong to the same word. Start from -1 (the padding id) so that the very
# first token, id 0, is not mistaken for a repeated sub-word and skipped.
pred_ind = -1
for test_ind_row, answer_row in zip(test_ids, label_indices):
    for ind, ans in zip(test_ind_row, answer_row):
        if ind > -1 and pred_ind != ind:
            # The extra padding label has no name; fall back to 'O' instead of KeyError
            answer_tags[ind] = id_to_name.get(ans, 'O')
        pred_ind = ind

# Assign the column in one go rather than through chained per-cell assignment
answer = pd.DataFrame(np.arange(n_answer_rows))
answer['tag'] = answer_tags

In [None]:
# Give the integer-named first column its submission name.
answer = answer.rename(columns={0: 'id'})

In [None]:
# Write the submission file without the DataFrame index column.
answer.to_csv('answer.csv', index=False)