In [1]:
from gensim.models import FastText
import pandas as pd

data = pd.read_csv('ner_datasetreference.csv',encoding = 'unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [2]:
data.count()

Sentence #      47959
Word          1048565
POS           1048575
Tag           1048575
dtype: int64

In [3]:
print('number of tags: {}'.format(len(data.Tag.unique())))
freq = data.Tag.value_counts()
freq

number of tags: 17


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [4]:
from sklearn.preprocessing import LabelEncoder
data = data.fillna(method = 'ffill')
data['Sentence #'] = LabelEncoder().fit_transform(data['Sentence #'])
data.head()

  data = data.fillna(method = 'ffill')


Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O


In [5]:
data['Tag'] = data['Tag'].str.upper()
data['Word'] = data['Word'].str.lower()
X = data[['Sentence #','Word']]
y = data['Tag']

In [6]:
y = LabelEncoder().fit_transform(y)

In [7]:
y

array([16, 16, 16, ..., 16, 16, 16])

In [8]:
X = [[] for _ in data['Sentence #'].unique()]
for sen,word in data[['Sentence #','Word']].iloc:
    X[sen].append(word)


In [9]:
X[0]

['thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'london',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'british',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [10]:
model_ft = FastText(X)

In [11]:
model_ft.wv.most_similar(positive=['today'], topn=20)

[('34-day', 0.9737409949302673),
 ('11-day', 0.9718878269195557),
 ('10-day', 0.9690268635749817),
 ('monday', 0.9658756852149963),
 ('sunday', 0.958530604839325),
 ('wednesday', 0.956044614315033),
 ('friday', 0.953147292137146),
 ('saturday', 0.9474976658821106),
 ('thursday', 0.9452310800552368),
 ('one-day', 0.940643846988678),
 ('tuesday', 0.9387921690940857),
 ('six-day', 0.936676025390625),
 ('midday', 0.9352594017982483),
 ('two-day', 0.9267303347587585),
 ('four-day', 0.9236775040626526),
 ('day', 0.9192639589309692),
 ('five-day', 0.9171529412269592),
 ('holiday', 0.9160709977149963),
 ('yesterday', 0.8945143222808838),
 ('birthday', 0.871319055557251)]

In [12]:
adfs = {'hue': 1,'sue':12}
all_words = sorted(list(adfs.keys()))
adfs[all_words[1]]

12

In [13]:
data

Unnamed: 0,Sentence #,Word,POS,Tag
0,0,thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O
...,...,...,...,...
1048570,42177,they,PRP,O
1048571,42177,responded,VBD,O
1048572,42177,to,TO,O
1048573,42177,the,DT,O


In [14]:
data.groupby('Sentence #').agg({'Word': list,'Tag': list})

Unnamed: 0_level_0,Word,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO..."
1,"[iranian, officials, say, they, expect, to, ge...","[B-GPE, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,"[helicopter, gunships, saturday, pounded, mili...","[O, O, B-TIM, O, O, O, O, O, B-GEO, O, O, O, O..."
3,"[they, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
4,"[u.n., relief, coordinator, jan, egeland, said...","[B-GEO, O, O, B-PER, I-PER, O, B-TIM, O, B-GEO..."
...,...,...
47954,"[opposition, leader, mir, hossein, mousavi, ha...","[O, O, O, B-PER, I-PER, O, O, O, O, O, O, O, O..."
47955,"[on, thursday, ,, iranian, state, media, publi...","[O, B-TIM, O, B-GPE, O, O, O, O, O, O, O, O, B..."
47956,"[following, iran, 's, disputed, june, 12, elec...","[O, B-GEO, O, O, B-TIM, I-TIM, O, O, O, O, O, ..."
47957,"[since, then, ,, authorities, have, held, publ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [186]:
#data preprocesing
import numpy as np

class Dataset():
    def __init__(self,dataset:pd.DataFrame,model):
        self.batch = 32

        self.sentences = dataset['Sentence #'].unique()
        self.words = data.groupby('Sentence #')['Word'].agg(list)
        self.labels = data.groupby('Sentence #')['Tag'].agg(list)
        self.word_vectors = model.wv
        self.vector_size = self.word_vectors[0].shape[0] 

        self.classes = dataset['Tag'].unique()
        self.reclass = dict(zip(self.classes,range(1,self.classes.shape[0]+1)))
        self.max_words_cnt = np.max(self.words.apply(lambda x:len(x)))
        self.model = model
    def __getitem__(self,idx):
        data_train = np.zeros((self.batch,self.max_words_cnt,self.vector_size),dtype=np.float32)
        mask = np.zeros((self.batch,self.max_words_cnt),dtype=bool)
        data_label = np.zeros((self.batch,self.max_words_cnt))

        rg = range(self.batch*idx,self.batch*(idx+1))
    



        word2vec = self.words[rg].apply(lambda x:self.model.wv[x]).to_numpy()
 
        # print(len(self.words[33]))
        
        for i,word_cnt in enumerate(self.words[rg].apply(lambda x:len(x)).to_numpy()):
            data_train[i,0:word_cnt,:self.vector_size] = word2vec[i]
            


            mask[i,0:word_cnt] = True
            
            # print(word_cnt)
            # print(len(self.labels[i]),len(self.words[i]))

            data_label[i,0:word_cnt] = list(map(lambda x: self.reclass[x],self.labels[i+self.batch*idx])) 
            
            # for ii,jj in enumerate(smth):
            #     data_label[i,ii,jj] = 1.0

            
        
        return data_train,mask,data_label
    def __len__(self):
        return self.sentences.shape[0]//self.batch



        


dataset = Dataset(data,model_ft)

In [166]:
dataset.labels[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GEO',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GEO',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GPE',
 'O',
 'O',
 'O',
 'O',
 'O']

In [187]:
dataset[0][2][0,0]

1.0

In [169]:
dataset.classes.shape[0]

17

In [188]:
import torch
from torch import nn
from torch import optim

class NERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        output, _ = self.lstm(x)
        output = self.fc(output)
        return output
    

EMBEDDING_DIM = 100
HIDDEN_DIM = 100
LEARNING_RATE = 0.1
EPOCHS = 10

model = NERModel(EMBEDDING_DIM, HIDDEN_DIM, dataset.classes.shape[0]+1)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_size = int(len(dataset)*0.8)

In [19]:
train_size

1198

In [191]:
for epoch in range(EPOCHS):
    train_loss = 0.0
    val_loss = 0.0
    model.train()
    for i in range(train_size):

        x,mask,y = dataset[i]  
        
        x = torch.tensor(x,dtype=torch.float32)

        y = torch.tensor(y,dtype=torch.long)    
        optimizer.zero_grad()
        output = model(x)

        loss = criterion(output.view(-1,dataset.classes.shape[0]+1), y.view(-1))
        loss.backward()
        # print(loss.item())
        optimizer.step()
        train_loss += loss.item()
        # print(output[0])
    #     print(output[0].argmax(1))
    #     break
    # break
    train_loss /= train_size
    model.eval()

    with torch.no_grad():
        for i in range(train_size,len(dataset)):
            x,mask, y = dataset[i]
            
            x = torch.tensor(x)
            y = torch.tensor(y,dtype=torch.long)    
            output = model(x)
            loss = criterion(output.view(-1,dataset.classes.shape[0]+1), y.view(-1))
            val_loss += loss.item()
        val_loss /= len(dataset)-train_size

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Epoch 1, Train Loss: 0.0599, Val Loss: 0.0489
Epoch 2, Train Loss: 0.0471, Val Loss: 0.0433
Epoch 3, Train Loss: 0.0427, Val Loss: 0.0406
Epoch 4, Train Loss: 0.0401, Val Loss: 0.0389
Epoch 5, Train Loss: 0.0381, Val Loss: 0.0377
Epoch 6, Train Loss: 0.0367, Val Loss: 0.0369
Epoch 7, Train Loss: 0.0354, Val Loss: 0.0364
Epoch 8, Train Loss: 0.0344, Val Loss: 0.0359
Epoch 9, Train Loss: 0.0335, Val Loss: 0.0357
Epoch 10, Train Loss: 0.0327, Val Loss: 0.0355


In [194]:
acc = 0
for i in range(train_size,len(dataset)):
    x,mask, y = dataset[i]
    
    x = torch.tensor(x)
    y = torch.tensor(y,dtype=torch.long)    
    output = model(x)
    acc += torch.sum(torch.argmax(output,2) == y)/(y.shape[0]*y.shape[1])
    val_loss += loss.item()
acc / (len(dataset)-train_size)

tensor(0.9890)