In [3]:
import pandas as pd
import numpy as np

import joblib
import torch

from sklearn import preprocessing
from sklearn import model_selection

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import training_params
import torch.nn as nn
from tqdm import tqdm

In [100]:
df = pd.read_csv('../input/train.csv', encoding="utf-8")

enc_label = preprocessing.LabelEncoder()
df.loc[:, "label"] = enc_label.fit_transform(df["label"])

sentences = df.groupby("sentence")["word"].apply(list).values
labels = df.groupby("sentence")["label"].apply(list).values

In [101]:
df.head()

Unnamed: 0,sentence,word,label
0,1,म्यूरियल,2
1,1,अब,2
2,1,बीस,2
3,1,साल,2
4,1,की,2


In [102]:
sentences

array([list(['म्यूरियल', 'अब', 'बीस', 'साल', 'की', 'हो', 'गई', 'है']),
       list(['म्यूरियल', 'अब', 'बीस', 'साल', 'की', 'है']),
       list(['मैं', 'इस', 'दुनिया', 'में', 'शिक्षा', 'पर', 'बहुत', 'निराश', 'हूँ']),
       list(['वैसा', 'नही', 'होगा']),
       list(['मुझें', 'तुम्हारी', 'याद', 'आ', 'रही', 'है']),
       list(['तुम्हें', 'सोना', 'चाहिए']),
       list(['आपको', 'सोना', 'चाहिए']),
       list(['मुझे', 'जीव', 'विज्ञान', 'कभी', 'भी', 'पसंद', 'नहीं', 'था']),
       list(['मैं', 'नहीं', 'हूँ', 'तुम', 'हो'])], dtype=object)

In [103]:
labels

array([list([2, 2, 2, 2, 2, 2, 2, 4]), list([2, 2, 2, 2, 2, 4]),
       list([3, 2, 2, 2, 2, 2, 2, 2, 4]), list([3, 2, 4]),
       list([2, 2, 2, 2, 2, 4]), list([3, 2, 4]), list([3, 2, 4]),
       list([3, 2, 2, 2, 2, 2, 2, 4]), list([3, 2, 0, 2, 1])],
      dtype=object)

In [30]:
num_labels = len(list(enc_label.classes_))
num_labels

5

In [31]:
   (
        train_sentences,
        test_sentences,
        train_labels,
        test_labels,
    ) = model_selection.train_test_split(sentences, labels, random_state=42, test_size=0.1)

In [91]:
train_sentences

array([list(['म्यूरियल', 'अब', 'बीस', 'साल', 'की', 'है']),
       list(['तुम्हें', 'सोना', 'चाहिए']),
       list(['म्यूरियल', 'अब', 'बीस', 'साल', 'की', 'हो', 'गई', 'है']),
       list(['मैं', 'नहीं', 'हूँ', 'तुम', 'हो']),
       list(['मैं', 'इस', 'दुनिया', 'में', 'शिक्षा', 'पर', 'बहुत', 'निराश', 'हूँ']),
       list(['मुझें', 'तुम्हारी', 'याद', 'आ', 'रही', 'है']),
       list(['वैसा', 'नही', 'होगा']), list(['आपको', 'सोना', 'चाहिए'])],
      dtype=object)

In [33]:
test_sentences

array([list(['मुझे', 'जीव', 'विज्ञान', 'कभी', 'भी', 'पसंद', 'नहीं', 'था'])],
      dtype=object)

In [93]:
train_labels

array([list([2, 2, 2, 2, 2, 4]), list([3, 2, 4]),
       list([2, 2, 2, 2, 2, 2, 2, 4]), list([3, 2, 0, 2, 1]),
       list([3, 2, 2, 2, 2, 2, 2, 2, 4]), list([2, 2, 2, 2, 2, 4]),
       list([3, 2, 4]), list([3, 2, 4])], dtype=object)

In [35]:
test_labels

array([list([3, 2, 2, 2, 2, 2, 2, 4])], dtype=object)

In [60]:
class PunctuationDataset:
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        labels = self.labels[item]

        ids = []
        target_labels = []

        for i, s in enumerate(text):
            inputs = training_params.TOKENIZER.encode(
                s,
                add_special_tokens=False
            )

            input_len = len(inputs)
            ids.extend(inputs)
            target_labels.extend([labels[i]] * input_len)

        ids = ids[:training_params.MAX_LEN - 2]
        target_labels = target_labels[:training_params.MAX_LEN - 2]

        ids = [101] + ids + [102]
        target_labels = [0] + target_labels + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        padding_len = training_params.MAX_LEN - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_labels = target_labels + ([0] * padding_len)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag": torch.tensor(target_labels, dtype=torch.long),
        }

In [61]:
train_dataset = PunctuationDataset(texts=train_sentences, labels=train_labels)

train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=training_params.TRAIN_BATCH_SIZE,
                                                num_workers=4)

valid_dataset = PunctuationDataset(texts=test_sentences, labels=test_labels)

valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=training_params.VALID_BATCH_SIZE,
                                                num_workers=1)

In [62]:
def loss_function(output, target, mask, num_labels):
    cross_entropy_loss = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(cross_entropy_loss.ignore_index).type_as(target)
    )
    loss = cross_entropy_loss(active_logits, active_labels)
    return loss


class PunctuationModel(nn.Module):
    def __init__(self, num_tag):
        super(PunctuationModel, self).__init__()
        self.num_tag = num_tag
        self.bert = training_params.MODEL
        self.bert_drop = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)

    def forward(self, ids, mask, token_type_ids, target_tag):
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

        bo_tag = self.bert_drop(o1)

        tag = self.out_tag(bo_tag)

        loss_tag = loss_function(tag, target_tag, mask, self.num_tag)

        loss = loss_tag
        return tag, loss

In [63]:
device = 'cpu'
model = PunctuationModel(num_tag=num_labels)
model.to(device)

PunctuationModel(
  (bert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
    

In [64]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

num_train_steps = int(len(train_sentences) / training_params.TRAIN_BATCH_SIZE * training_params.EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

In [65]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)

In [66]:
eg = next(iter(train_data_loader))

In [67]:
eg.items()

dict_items([('ids', tensor([[  101, 30299,   367, 28689,   311,  1209, 26448,  1883,  4384,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,   

In [68]:
for k, v in eg.items():
    eg[k] = v.to(device)
    optimizer.zero_grad()
    _, loss = model(**eg)

In [70]:
a, b = model(**eg)

In [72]:
a.shape

torch.Size([1, 128, 5])

In [5]:
from transformers import AlbertTokenizer, AlbertForTokenClassification

In [6]:
tokenizer = AlbertTokenizer.from_pretrained('ai4bharat/indic-bert')

In [10]:
model = AlbertForTokenClassification.from_pretrained('ai4bharat/indic-bert',
                                                   num_labels=5,
                                                   output_attentions=False,
                                                   output_hidden_states=False)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForTokenClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'sop_classifier.classifier.weight', 'sop_classifier.classifier.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and a

In [12]:
model

AlbertForTokenClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, b

In [81]:
inputs = tokenizer("आपका स्वागत हैं", return_tensors="pt")
labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)

In [82]:
labels

tensor([[1, 1, 1, 1, 1, 1]])

In [8]:
tokenizer.tokenize("आपका स्वागत हैं")

['▁आपका', '▁स', 'वागत', '▁हैं']

In [83]:
outputs = model(**inputs, labels=labels)

In [84]:
outputs

TokenClassifierOutput(loss=tensor(1.5914, grad_fn=<NllLossBackward>), logits=tensor([[[ 0.0390,  0.0911, -0.0054, -0.0415,  0.0544],
         [ 0.0627, -0.0181, -0.3403,  0.0879,  0.2391],
         [-0.0307, -0.0847, -0.0485, -0.0288,  0.0319],
         [-0.1493,  0.0654,  0.0814,  0.1902, -0.0588],
         [-0.2060, -0.0092, -0.0335,  0.1145, -0.1751],
         [ 0.0390,  0.0911, -0.0054, -0.0415,  0.0544]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [85]:
loss = outputs.loss

In [86]:
logits = outputs.logits

In [87]:
logits

tensor([[[ 0.0390,  0.0911, -0.0054, -0.0415,  0.0544],
         [ 0.0627, -0.0181, -0.3403,  0.0879,  0.2391],
         [-0.0307, -0.0847, -0.0485, -0.0288,  0.0319],
         [-0.1493,  0.0654,  0.0814,  0.1902, -0.0588],
         [-0.2060, -0.0092, -0.0335,  0.1145, -0.1751],
         [ 0.0390,  0.0911, -0.0054, -0.0415,  0.0544]]],
       grad_fn=<AddBackward0>)

In [89]:
outputs[1].shape

torch.Size([1, 6, 5])