In [2]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from keras.utils import to_categorical

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [4]:
df = pd.read_excel('annoted NER bangla (1) (1).xlsx')
df.head(30)

Unnamed: 0,Patient #,word,label
0,patient:1,আমার,O
1,,তিন,O
2,,দিন,O
3,,যাবত,O
4,,অনেক,O
5,,মাথা,medicine
6,,ব্যথা,O
7,patient:2,আমার,O
8,,প্রায়,O
9,,এক,O


In [5]:
df=df.dropna(subset=['label'])
df = df.fillna(method = 'ffill')
df['label'].unique()

array(['O', 'medicine', 'dermatology', 'surgery', 'dentist',
       'cardiologists', 'rheumatologist', 'ophthalmology', 'gynae'],
      dtype=object)

In [6]:
df = df.applymap(str)

In [7]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w,t) for w, t in zip(s['word'].values.tolist(),
                                                       s['label'].values.tolist())]
        self.grouped = self.df.groupby("Patient #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['patient:{}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [8]:
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[2]

'আমার  বাবার হাত এর কনুই তে অনেক ব্যথা করে আবার অনেক সময় বাবা ঠিক বুঝতে পারেন না যে হাত বা পা এর ঠিক কোথায় ব্যথা করছে'

In [9]:
sentences = getter.sentences

In [10]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    
    }
    if i > 0:
        word1 = sent[i-1][0]
        
        features.update({
            '-1:word.istitle()': word1.istitle(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
       
        features.update({
            
            '+1:word.istitle()': word1.istitle(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [11]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
z = [sent2tokens(s) for s in sentences]
print(z)

[['আমার ', 'তিন', 'দিন', 'যাবত', 'অনেক', 'মাথা', 'ব্যথা'], ['আমার ', 'হাটু', 'তে', 'অনেক', 'ব্যথা', 'করে'], ['আমার ', 'বাবার', 'হাত', 'এর', 'কনুই', 'তে', 'অনেক', 'ব্যথা', 'করে', 'আবার', 'অনেক', 'সময়', 'বাবা', 'ঠিক', 'বুঝতে', 'পারেন', 'না', 'যে', 'হাত', 'বা', 'পা', 'এর', 'ঠিক', 'কোথায়', 'ব্যথা', 'করছে'], ['আমার ', 'প্রায়', 'তিন ', 'সপ্তাহ ', 'ধরে', 'খুশখুশে ', 'কাঁশি'], ['আমার ', 'প্রায় ', 'সারা ', 'বছর', 'ই', 'সর্দি', 'লেগে', 'থাকে', 'কোনো', 'খাবার ', 'এর', 'স্বাদ ', 'আর ', 'গন্ধ ', 'পাই ', 'না'], ['আমার ', 'শরীর', 'অনেক ', 'দুর্বল ', 'লাগে', 'ঠিকমত ', 'ঘুম ', 'হয় ', 'না'], ['আমার ', 'মাথা ', 'ঘুরায় ', 'আর ', 'বমি', 'ভাব ', 'হয়'], ['আমি ', 'দূর ', 'এর', 'জিনিস ', 'দেখতে', 'পারি', 'না', 'দূর ', 'থেকে ', 'লিখা ', 'সব ', 'ঝাপ্সা ', 'দেখি'], ['আমি ', 'রাত', 'এর', 'বেলায়', 'চোখ', 'এ', 'কম ', 'দেখি', 'আমি ', 'ধুলায় ', 'গেলে ', 'আমার', 'স্কিন ', 'অনেক ', 'চুল্কায় ', 'আর ', 'ফুলে ', 'লাল ', 'হয়ে ', 'যায়'], ['আমার', 'স্কিন ', 'অনেক ', 'চুল্কায় ', 'আর', 'ধরলে', 'ব্যথা ', 'করে'], ['আমার ', 'তিন '

In [12]:
voc_size= 10000

In [13]:
onehot_repr=[one_hot(words,voc_size) for words in df['word']]
onehot_repr

[[2718],
 [1487],
 [4574],
 [3845],
 [6994],
 [5734],
 [5757],
 [2718],
 [3513],
 [6088],
 [4701],
 [5769],
 [6177],
 [7894],
 [8623],
 [5734],
 [5769],
 [3285],
 [3389],
 [6994],
 [8889],
 [901],
 [1892],
 [2718],
 [5772],
 [7894],
 [1032],
 [6308],
 [343],
 [9387],
 [791],
 [2718],
 [6994],
 [5341],
 [1204],
 [1313],
 [7894],
 [3389],
 [6994],
 [5757],
 [8112],
 [2718],
 [3513],
 [6800],
 [4574],
 [5769],
 [6994],
 [2459],
 [5757],
 [2718],
 [5239],
 [9830],
 [3389],
 [4478],
 [6308],
 [9044],
 [6282],
 [8074],
 [2718],
 [3780],
 [2998],
 [8112],
 [994],
 [200],
 [6504],
 [9462],
 [1973],
 [6129],
 [3563],
 [6994],
 [901],
 [1892],
 [2718],
 [3780],
 [7474],
 [4570],
 [2718],
 [6002],
 [6994],
 [1143],
 [4624],
 [8112],
 [7191],
 [1204],
 [6994],
 [6723],
 [9462],
 [524],
 [1825],
 [6002],
 [7894],
 [2718],
 [2704],
 [9135],
 [8644],
 [6002],
 [5757],
 [8112],
 [2718],
 [6671],
 [9811],
 [6994],
 [5757],
 [8112],
 [2718],
 [759],
 [9561],
 [6308],
 [2954],
 [9811],
 [6994],
 [5757],


In [14]:
len(onehot_repr)

1035

In [15]:
sent_length = 9
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...    0    0 2718]
 [   0    0    0 ...    0    0 1487]
 [   0    0    0 ...    0    0 4574]
 ...
 [   0    0    0 ...    0    0 5757]
 [   0    0    0 ...    0    0 2043]
 [   0    0    0 ...    0    0  791]]


In [16]:
from tensorflow.keras.layers import Attention, Masking
from keras_self_attention import SeqSelfAttention
import tensorflow as tf

In [17]:
embedding_vector_features=20
model= Sequential()
model.add(Embedding(500, embedding_vector_features, input_length=9))
model.add(Bidirectional(LSTM(50)))



model.add(Dense(9, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['categorical_accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 9, 20)             10000     
_________________________________________________________________
bidirectional (Bidirectional (None, 100)               28400     
_________________________________________________________________
dense (Dense)                (None, 9)                 909       
Total params: 39,309
Trainable params: 39,309
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['label'])


In [19]:
one_hot_label = to_categorical(y)

In [20]:
import numpy as np
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [21]:
X_final.shape,one_hot_label.shape

((1035, 9), (1035, 9))

In [22]:
from sklearn.model_selection import train_test_split
X_in, X_test, y_in, y_test = train_test_split(X_final, one_hot_label, test_size=0.15, random_state=10)

In [23]:
model.fit(X_in, y_in,epochs=100, batch_size=20)

Epoch 1/100


InvalidArgumentError:  indices[0,8] = 2459 is not in [0, 500)
	 [[node sequential/embedding/embedding_lookup (defined at <ipython-input-23-d6f6688ea3e8>:1) ]] [Op:__inference_train_function_5605]

Errors may have originated from an input operation.
Input Source operations connected to node sequential/embedding/embedding_lookup:
 sequential/embedding/embedding_lookup/3452 (defined at C:\Users\tanvi\anaconda3\lib\contextlib.py:113)

Function call stack:
train_function
