## Main

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 3.8MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 28.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█

In [4]:
import pandas as pd
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction import DictVectorizer

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
# from tqdm import tqdm_notebook as tqdm

# from transformers import AutoConfig, DistilBertTokenizerFast, DistilBertForTokenClassification
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
device

'cuda'

In [6]:
data = pd.read_csv("input/inp/main_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence #,Word,POS,Tag,Arabic
0,0,1,Thousands,NNS,O,الآلاف
1,1,1,of,IN,O,من
2,2,1,demonstrators,NNS,O,المتظاهرين
3,3,1,have,VBP,O,لديك
4,4,1,marched,VBN,O,سار


In [7]:
vocab = {}
data.fillna(method="ffill", inplace=True)
all_words = list(set(data.Arabic.values))
for i, val in enumerate(all_words):
    try:
        vocab[int(val)] = int(val)
    except:
        vocab[val] = i

len(all_words)

27942

In [8]:
data.Tag.value_counts()

O        887824
B-geo     37639
B-tim     20332
B-org     20141
I-per     17250
B-per     16987
I-org     16781
B-gpe     15869
I-geo      7410
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [9]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Arabic"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [10]:
class NerData(Dataset):
    def __init__(self, csv, sentence_ids, words, targets, maxlen=256):
        self.data = csv
        getter = SentenceGetter(self.data[[sentence_ids, words, targets]])
        self.sentences = getter.sentences
        self.maxlen = maxlen if maxlen else max(len(sen) for sen in self.sentences)
        tags = list(set(self.data["Tag"].values))
        self.tags2index = {t:i for i,t in enumerate(tags)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        X = [w[0]for w in self.sentences[idx]]
        text = []
        for i in range(self.maxlen):
            try:
                text.append(vocab[X[i]])
            except:
                text.append(27943)
        # text=[]
        # for i in self.sentences[idx]:
        #     try:
        #         text.append(vocab[i])
        #     except:
        #         text.append(27943)

        target = [[self.tags2index[w[1]] for w in self.sentences[idx]]]
        target = pad_sequences(maxlen=self.maxlen, sequences=target, padding="post", value=self.tags2index["O"])

        return {
            'text':torch.tensor(text, dtype=torch.long),
            'target':torch.tensor(target[0], dtype=torch.long),
        }

In [11]:
label_map = {'O':0, 'B-geo':1, 'B-tim':2, 'B-org':3, 'I-per':4, 'B-per':5, 'I-org':6, 'B-gpe':7, 'I-geo':8, 'I-tim':9, 'B-art':10, 'B-eve':11, 'I-art':12, 'I-eve':13, 'B-nat':14, 'I-gpe':15, 'I-nat':16}
# tokenizer = DistilBertTokenizerFast.from_pretrained()
# tokenizer = AutoTokenizer.from_pretrained("marefa-nlp/marefa-ner")
# encoder = DictVectorizer(sparse=False)

train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    shuffle=False,
    random_state=13
)
train_data, valid_data = train_test_split(
    train_data,
    test_size=0.4,
    shuffle=False,
    random_state=13
)

train_data = NerData(train_data, "Sentence #", "Arabic", "Tag", maxlen=19)
valid_data = NerData(valid_data, "Sentence #", "Arabic", "Tag", maxlen=19)
test_data = NerData(test_data, "Sentence #", "Arabic", "Tag", maxlen=19)

train_loader = DataLoader(train_data, batch_size=8, shuffle=False, num_workers=0)
valid_loader = DataLoader(valid_data, batch_size=8, shuffle=False, num_workers=0)
test_loader = DataLoader(test_data, batch_size=8, shuffle=False, num_workers=0)

In [12]:
next(iter(train_loader))

{'text': tensor([[ 3857,  5496, 14729,   883,  2911,  5256, 21549, 11215,  6084,  3373,
          18250,  2536,  6850, 11438, 22650,  3373, 23202,  5496, 20339],
         [12182,  5496, 22752, 23333,  2536,  3373, 26257,  7130,  3373, 14729,
          21744, 24023, 18103, 22461,  3910,  9142,  3910, 26144, 23636],
         [21673,  2911,  9701,  3373, 13094,  5496, 26055, 11215,  8457, 18470,
           2536, 13978,  6748, 13210, 27943, 27943, 27943, 27943, 27943],
         [ 5110, 12041,  3373, 17482,  5496, 14729,  2536, 27943, 24419,  2140,
          18696, 23617, 19759,  8722, 13210, 27943, 27943, 27943, 27943],
         [ 3373,  6084,  8834, 10903,  3373, 27449,  5496,  3373, 10239, 24066,
           5496,  3357,  3179, 20206,  6109, 20562,  2536,  3373,  1350],
         [ 3373, 20562,  4433, 16147, 10903,  3357,  3179, 17463,  2536,  3373,
           6850, 26257, 11438,  3373,   917, 20040,  5496, 27943, 20339],
         [ 3373, 21549, 16241, 20228,  9163,  5496,  4522,  3219,  3

In [13]:
# model = DistilBertForTokenClassification.from_pretrained(")
# model = AutoModelForTokenClassification.from_pretrained("marefa-nlp/marefa-ner", num_labels=len(label_map))
model = AutoModelForTokenClassification.from_pretrained("marefa-nlp/marefa-ner")
model.save_pretrained("model/")
model = AutoModelForTokenClassification.from_pretrained("model/")
model.to(device)

In [31]:
# model = AutoModelForTokenClassification.from_pretrained("model/", cache_dir="model/", local_files_only=True)

In [None]:
model = model.to(device)

In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [27]:
for epoch in range(10):
    running_loss = 0.0
    for i, data in tqdm(enumerate(train_loader, 0), total=len(train_loader)):
        inputs, labels = data['text'], data['target']
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 2 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2))
            running_loss = 0.0

print('Finished Training')

HBox(children=(FloatProgress(value=0.0, max=55045.0), HTML(value='')))




RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

In [None]:
model.save_pretrained("model/")