In [113]:
import pandas as pd
data = pd.read_csv('ner_dataset.csv', encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [162]:
data.shape

(1048575, 6)

In [114]:
len(data['Word'].to_list())

1048575

In [115]:
len(set(data['Word'].to_list()))

35178

In [116]:
len(data['Tag'].to_list())

1048575

In [117]:
len(set(data['Tag'].to_list()))

17

In [118]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [119]:
tag2idx

{'B-gpe': 0,
 'I-per': 1,
 'I-nat': 2,
 'B-org': 3,
 'B-tim': 4,
 'I-tim': 5,
 'I-geo': 6,
 'B-eve': 7,
 'B-nat': 8,
 'O': 9,
 'B-per': 10,
 'I-gpe': 11,
 'I-eve': 12,
 'I-art': 13,
 'I-org': 14,
 'B-geo': 15,
 'B-art': 16}

In [120]:
idx2tag

{0: 'B-gpe',
 1: 'I-per',
 2: 'I-nat',
 3: 'B-org',
 4: 'B-tim',
 5: 'I-tim',
 6: 'I-geo',
 7: 'B-eve',
 8: 'B-nat',
 9: 'O',
 10: 'B-per',
 11: 'I-gpe',
 12: 'I-eve',
 13: 'I-art',
 14: 'I-org',
 15: 'B-geo',
 16: 'B-art'}

In [121]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head(3)

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,21196,9
1,,of,IN,O,32425,9
2,,demonstrators,NNS,O,7055,9


In [122]:
data_fillna = data.fillna(method='ffill', axis=0)
data_fillna.head(3)

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,21196,9
1,Sentence: 1,of,IN,O,32425,9
2,Sentence: 1,demonstrators,NNS,O,7055,9


In [123]:
data_group = data_fillna.groupby(['Sentence #']).agg(lambda x: list(x)).reset_index()
data_group.head(3)
# here we are getting 1, 10, 100 after groupby due to heavy dataset

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[21196, 32425, 7055, 7019, 8052, 14826, 7747, ...","[9, 9, 9, 9, 9, 9, 15, 9, 9, 9, 9, 9, 15, 9, 9..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[8492, 32619, 5249, 23817, 19106, 1920, 30408,...","[0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[26795, 12980, 23186, 23593, 24753, 8239, 3241...","[9, 9, 4, 9, 9, 9, 9, 9, 15, 9, 9, 9, 9, 9, 3,..."


In [124]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [125]:
n_token = len(list(set(data['Word'].to_list())))
n_tag = len(list(set(data['Tag'].to_list())))
print(n_token)
print(n_tag)

35178
17


In [126]:
idx2token[35177]

'merchant'

In [127]:
tokens = data_group['Word_idx'].to_list()
maxlen = max([len(s) for s in tokens])
# value is the value for padding default is 0
pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token-1)

In [128]:
pad_tokens

array([[21196, 32425,  7055, ..., 35177, 35177, 35177],
       [ 8492, 32619,  5249, ..., 35177, 35177, 35177],
       [26795, 12980, 23186, ..., 35177, 35177, 35177],
       ...,
       [27727, 33778,  5287, ..., 35177, 35177, 35177],
       [14646,  6232, 34615, ..., 35177, 35177, 35177],
       [ 7567, 20295, 27603, ..., 35177, 35177, 35177]])

In [129]:
pad_tokens.shape

(47959, 104)

In [130]:
tags = data_group['Tag_idx'].tolist()
pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
pad_tags

array([[ 9,  9,  9, ...,  9,  9,  9],
       [ 0,  9,  9, ...,  9,  9,  9],
       [ 9,  9,  4, ...,  9,  9,  9],
       ...,
       [ 9, 15,  9, ...,  9,  9,  9],
       [ 9,  9,  9, ...,  9,  9,  9],
       [ 9,  3, 14, ...,  9,  9,  9]])

In [131]:
pad_tags.shape

(47959, 104)

In [132]:
n_tags = len(tag2idx)
n_tags

17

In [133]:
import numpy as np
pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
np.array(pad_tags).shape

(47959, 104, 17)

In [134]:
pad_tags[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [138]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token-1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    pad_tags = [to_categorical(i, num_classes=n_tag) for i in pad_tags]
    
    #Split train, test and validation set
    X_train, X_test, y_train, y_test = train_test_split(pad_tokens, pad_tags, test_size=0.1, random_state=2020)

    return X_train, X_test, np.array(y_train), np.array(y_test)

X_train, X_test, y_train, y_test = get_pad_train_test_val(data_group, data)


In [139]:
X_train.shape

(43163, 104)

In [140]:
y_train.shape

(43163, 104, 17)

In [141]:
input_dim = len(list(set(data['Word'].to_list())))+1 # vocsize
output_dim = 64 # no of features required to represent each word
# input_length = max([len(s) for s in data_group['Word_idx'].tolist()]) or
input_length = X_train.shape[1]
# input_length is no of elements in 2D array of first 1D array
n_tags = len(tag2idx)

In [156]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
model = Sequential()

# Add Embedding layer
# output_dim is no of vectors in each word 
# input_dim is vocab_size
# in put length is X_train.shape[1]
model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

# Add bidirectional LSTM
model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2)))
# model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))

# Add LSTM
model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5))

# Add timeDistributed Layer
model.add(TimeDistributed(Dense(n_tags, activation="relu")))
# model.add(Dense(n_tags, activation="relu"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 104, 64)           2251456   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 104, 128)          66048     
_________________________________________________________________
lstm_18 (LSTM)               (None, 104, 64)           49408     
_________________________________________________________________
time_distributed_5 (TimeDist (None, 104, 17)           1105      
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________


In [157]:
y_train.shape

(43163, 104, 17)

In [143]:
model.fit(X_train, y_train, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)



<tensorflow.python.keras.callbacks.History at 0x20e15224e80>

In [161]:
model.predict(X_test).shape

(4796, 104, 17)

In [29]:
df = pd.DataFrame({"ID":["A","B","A","C","A","A","C","B"], "value":[1,2,4,3,6,7,3,4]})

In [30]:
df

Unnamed: 0,ID,value
0,A,1
1,B,2
2,A,4
3,C,3
4,A,6
5,A,7
6,C,3
7,B,4


In [33]:
df.groupby("ID").sum()

Unnamed: 0_level_0,value
ID,Unnamed: 1_level_1
A,18
B,6
C,6


In [31]:
df_group1 = df.groupby("ID").sum().reset_index()
df_group1

Unnamed: 0,ID,value
0,A,18
1,B,6
2,C,6


In [37]:
df_group2 = df.groupby("ID").sum()
df_group2

Unnamed: 0_level_0,value
ID,Unnamed: 1_level_1
A,18
B,6
C,6


In [44]:
df_group2 = df.groupby("ID", as_index=False)
for k, v in df_group2:
    print(k)
    print(v)
    print()

A
  ID  value
0  A      1
2  A      4
4  A      6
5  A      7

B
  ID  value
1  B      2
7  B      4

C
  ID  value
3  C      3
6  C      3

