In [7]:
import codecs
import numpy as np
import pandas as pd
 

Reading the corpus

In [8]:
f = codecs.open("corpus.txt", "r", encoding="utf8") 

raw = f.readlines()

f.close()

Lines is the list that appends list of sentences with each word in the form of a tuple with its corresponding POS tag

In [9]:
lines = []  
l = []

for w in raw:
    w = w.encode('utf-8')    #Splitting list of words and decoding from UTF-8
    x = w.strip(b'\n').split(b' ')
    if x[0] == b'*' or x[0] == b'.' or len(x) < 2:
        if len(l) > 0:
            lines.append(l)
        l = []
        continue
    l.append((x[0], x[1]))


Sentences is a list of list of sentences split into words and Labels is a list of list of labels.

In [10]:
sentences = []
labels = []
num_words = 0

for l in lines:
    s = []
    lab = []
    for w in l:
        x = w[0].decode('utf-8')
        y = w[1].decode('utf-8')
        s.append(x)
        num_words += 1
        lab.append(y)
    labels.append(lab)
    sentences.append(s)

Create a Word2vec model and build vocabulary from sentences

In [11]:
from gensim.models import Word2Vec   
min_count = 1
window = 5

model = Word2Vec(window=window, min_count=min_count)
model.build_vocab(sentences)

Train the Model

In [13]:
import warnings  
warnings.filterwarnings(action='ignore',category=Warning,module='gensim')   
for i in range(1000):
    model.train(sentences, total_words=num_words, epochs=1)    
    if i%100 == 0:
        print("Iteration ",i)

Iteration  0
Iteration  100
Iteration  200
Iteration  300
Iteration  400
Iteration  500
Iteration  600
Iteration  700
Iteration  800
Iteration  900


Feature extraction

In [14]:
X = []
y = []
for idx,s in enumerate(sentences):          
    x = []
    yy = []
    for wi,w in enumerate(s):
        if w in model.wv:
            m = model.wv[w]
            if wi > 1:
                m = np.hstack([m,model.wv[s[wi-1]]])
            else:
                m = np.hstack([m,np.zeros_like(model.wv[w])])
            
            if wi > 2:
                m = np.hstack([m,model.wv[s[wi-2]]])
            else:
                m = np.hstack([m,np.zeros_like(model.wv[w])])
            
            if wi > 3:
              m = np.hstack([m,model.wv[s[wi-3]]])
            else:
                m = np.hstack([m,np.zeros_like(model.wv[w])])
            
            if wi < len(s)-1:
                m = np.hstack([m,model.wv[s[wi+1]]])
            else:
                m = np.hstack([m,np.zeros_like(model.wv[w])])
            
            if wi < len(s)-2:
                m = np.hstack([m,model.wv[s[wi+2]]])
            else:
                m = np.hstack([m,np.zeros_like(model.wv[w])])
            
            if wi < len(s)-3:
                m = np.hstack([m,model.wv[s[wi+3]]])
            else:
                m = np.hstack([m,np.zeros_like(model.wv[w])])
            x.append(m)
    offset = 100 - len(x)
    pad_x = np.zeros_like(model.wv[w]) - 1
    pad_x = np.hstack([pad_x,pad_x,pad_x,pad_x,pad_x, pad_x, pad_x])
    yy = yy + labels[idx]
    for i in range(offset):
        x.append(pad_x)
        yy.append("IRR")
    X.append(x)
    y.append(yy)

X contains an array of vector embeddings and y is an array of POS tags

In [15]:
X = np.array(X)
y = np.array(y)
np.unique(y)

array(['CC', 'DEM', 'DET', 'INJ', 'IRR', 'JJ', 'NN', 'NUM', 'PRP', 'PSP',
       'QC', 'RB', 'SYM', 'UT', 'VM', 'WQ'], dtype='<U3')

Encoding target variables(POS tags)

In [16]:
original_shape = y.shape
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()               
y = le.fit_transform(y.ravel()).reshape(original_shape)
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

Saving features

In [17]:
f = open("kannada-features.numpy", "wb")
np.save(f, X)                             
f.close()

Saving labels

In [18]:
f = open("kannada-labels.numpy", "wb")
np.save(f,y)                              
f.close()

Implementing the Conditional Random Field(CRF) Model

In [21]:
X = np.load("kannada-features.numpy").astype(np.float32)
Y = np.load("kannada-labels.numpy").astype(np.int32)

np.random.seed(546)
indices = np.arange(X.shape[0])
np.random.shuffle(indices)


X = X[indices]
Y = Y[indices]


Obtaining the count of occurences of each word

In [22]:
num_examples, num_words, num_features = X.shape
num_tags = np.unique(Y).size

sequence_lengths = np.full(num_examples, 0, dtype=np.int32)
for idx, row in enumerate(X):
    count = 0
    for word in row:
        #print(word)
        if np.all(word == -1):
            break
        count += 1
    sequence_lengths[idx] = count

Splitting the corpus for training and testing

In [23]:
split = 100
x_test = X[-split:,:,:]
y_test = Y[-split:,:]
s_test = sequence_lengths[-split:]

x = X[0:-split,:,:]
y = Y[0:-split,:]
sequence_lengths = sequence_lengths[0:-split]

In [25]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [26]:
pip install tensorflow_addons

Note: you may need to restart the kernel to use updated packages.


In [27]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow_addons as tfa

Building the model and printing the accuracies

In [28]:
with tf.Graph().as_default():
    with tf.compat.v1.Session() as session:
        x_t = tf.constant(x)
        xt_t = tf.constant(x_test)
        y_t = tf.constant(y)
        yt_t = tf.constant(y_test)
        sequence_lengths_t = tf.constant(sequence_lengths)
        st_t = tf.constant(s_test)
        
        weights = tf.compat.v1.get_variable("weights", [num_features, num_tags])
        matricized_x_t = tf.reshape(x_t, [-1, num_features])
        matricized_unary_scores = tf.matmul(matricized_x_t, weights)
        unary_scores = tf.reshape(matricized_unary_scores, [num_examples-split, num_words, num_tags])
        
        matricized_xt_t = tf.reshape(xt_t, [-1, num_features])
        matricized_ust = tf.matmul(matricized_xt_t, weights)
        ust = tf.reshape(matricized_ust, [split, num_words, num_tags])
        
        log_likelihood, transition_params = tfa.text.crf_log_likelihood(unary_scores, y_t, sequence_lengths_t)
        
        loss = tf.reduce_mean(-log_likelihood)
        train_op = tf.train.GradientDescentOptimizer(0.03).minimize(loss)
        
        session.run(tf.global_variables_initializer())
        for i in range(100):
            tf_ust, tf_unary_scores, tf_transition_params, _ = session.run([ust, unary_scores, transition_params, train_op])
            if i%5 == 0:
                correct_labels = 0
                total_labels = 0
                for tf_unary_scores_, y_, sequence_length_ in zip(tf_unary_scores, y, sequence_lengths):
                    tf_unary_scores_ = tf_unary_scores_[:sequence_length_]
                    y_ = y_[:sequence_length_]
                    
                    viterbi_sequence, _ = tfa.text.viterbi_decode(tf_unary_scores_, tf_transition_params)
                    
                    correct_labels += np.sum(np.equal(viterbi_sequence, y_))
                    total_labels += sequence_length_
                accuracy = 100.0 * correct_labels / float(total_labels)
                print("Classification Accuracy (Training set): ", accuracy)
        correct_labels = 0
        total_labels = 0
        pred_labels = []
        actual_labels = []
        for a, b, c in zip(tf_ust, y_test, s_test):
            a = a[:c]
            b = b[:c]
            
            vs, _ = tfa.text.viterbi_decode(a, tf_transition_params)
            correct_labels += np.sum(np.equal(vs, b))
            total_labels += c
            
            actual_labels = actual_labels + b.tolist()
            pred_labels = pred_labels + vs
            
        accuracy = 100.0 * correct_labels / float(total_labels)
        print("-------------------------------------------------")
        print("Classification Accuracy (Test set): ", accuracy)

Classification Accuracy (Training set):  6.248801993482845
Classification Accuracy (Training set):  51.75388154111558
Classification Accuracy (Training set):  61.74046386812344
Classification Accuracy (Training set):  65.89994249568717
Classification Accuracy (Training set):  68.50680467701744
Classification Accuracy (Training set):  70.02108491470193
Classification Accuracy (Training set):  71.32451600536707
Classification Accuracy (Training set):  72.16791259344451
Classification Accuracy (Training set):  72.68545140885567
Classification Accuracy (Training set):  73.35633505846272
Classification Accuracy (Training set):  73.96971439524631
Classification Accuracy (Training set):  74.85144719187272
Classification Accuracy (Training set):  75.21564117308799
Classification Accuracy (Training set):  75.4839946329308
Classification Accuracy (Training set):  75.90569292696952
Classification Accuracy (Training set):  76.38489553383171
Classification Accuracy (Training set):  76.5574084723020

Displaying precision, recall, f1-score and support.

In [29]:
import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import classification_report

target_names = np.array(['CC', 'DEM', 'DET', 'INJ', 'IRR', 'JJ', 'NN', 'NUM', 'PRP', 'PSP',
       'QC', 'RB', 'SYM', 'UT', 'VM', 'WQ'])

print(classification_report(actual_labels, pred_labels, target_names=target_names[np.unique(actual_labels)].tolist()))

              precision    recall  f1-score   support

          CC       0.80      0.53      0.64        15
         DEM       0.83      0.59      0.69        17
         DET       0.00      0.00      0.00        12
          JJ       0.20      0.10      0.14        49
          NN       0.63      0.77      0.69       306
         NUM       0.00      0.00      0.00         2
         PRP       0.74      0.63      0.68       110
         PSP       0.75      0.21      0.33        14
          QC       0.67      0.50      0.57        20
          RB       0.54      0.30      0.39        46
         SYM       0.94      0.99      0.96        83
          UT       1.00      0.89      0.94         9
          VM       0.65      0.74      0.69       190
          WQ       0.00      0.00      0.00         2

    accuracy                           0.67       875
   macro avg       0.55      0.45      0.48       875
weighted avg       0.65      0.67      0.65       875



Comparison between the Predicated labels and the actual Labels

In [34]:
lines = []  
l = []

for w in raw:
    w = w.encode('utf-8')    #Splitting list of words and decoding from UTF-8
    x = w.strip(b'\n').split(b' ')
    if x[0] == b'*' or x[0] == b'.' or len(x) < 2:
        if len(l) > 0:
            lines.append(l)
        l = []
        continue
    l.append((x[0], x[1]))
    
x_test = sentences[-100:]
y_test = labels[-100:]
l1=[]                                      # Actual
for i in range(len(x_test)):
    l1.extend(list(zip(x_test[i],y_test[i])))
print(l1[-100:])

[('“', 'SYM'), ('ನನ್ನ', 'PRP'), ('ತಂಟೆಗೆ', 'NN'), ('ಬರಬೇಡ', 'VM'), ('ನಾನೀಗ', 'PRP'), ('ಧರ್ಮದತ್ತನ', 'NN'), ('ಮನೆಗೆ', 'NN'), ('ಹೋಗುತ್ತಿದ್ದೇನೆ', 'VM'), ('ಇದೇ', 'PRP'), ('ದಾರಿಯಲ್ಲಿ', 'NN'), ('ಹಿಂದಕ್ಕೆ', 'RB'), ('ಬರುವೆ', 'VM'), ('ಅದುವರೆಗೆ', 'PSP'), ('ಸುಮ್ಮನಿರು', 'VM'), ('”', 'SYM'), ('ಎಂದಳು', 'VM'), ('ಅವಳ', 'PRP'), ('ಮಾತಿಗೆ', 'NN'), ('ಕಳ್ಳ', 'NN'), ('ಗಹ', 'RB'), ('ಗಹಿಸಿ', 'DET'), ('ನಕ್ಕ', 'VM'), ('“', 'SYM'), ('ಅವಳೆಲ್ಲಿ', 'PRP'), ('ಬರುವಳು', 'VM'), ('?', 'SYM'), ('“', 'SYM'), ('ಎಂದುಕೊಂಡ', 'VM'), ('ಆದರೂ', 'RB'), ('ಅವಳನ್ನು', 'PRP'), ('ತಡೆಯದೆ', 'VM'), (',', 'SYM'), ('ಹೋಗಲು', 'VM'), ('ಬಿಟ್ಟ', 'VM'), ('ಧರ್ಮದತ್ತನ', 'NN'), ('ಮನೆಗೆ', 'NN'), ('ಅವಳು', 'PRP'), ('ಹೋದಾಗ', 'VM'), ('ಅವನಿಗೆ', 'PRP'), ('ತಬ್ಬಿಬ್ಬಾಯಿತು', 'VM'), ('“', 'SYM'), ('ಮಾತುಕೊಟ್ಟು', 'VM'), ('ಹೀಗೆ', 'RB'), ('ಬಂದವರುಂಟೆ', 'VM'), ('?', 'SYM'), ('“', 'SYM'), ('ಎಂದುಕ಼ೊಂಡ', 'VM'), ('“', 'SYM'), ('ಇವಳು', 'PRP'), ('ನಡತೆ', 'NN'), ('ಕೆಟ್ಟವಳಲ್ಲ', 'VM'), ('ನನ್ನ', 'PRP'), ('ಸ್ನೇಹಿತನ', 'NN'), ('ತಂಗಿ', 'NN'), (',', 'SYM'), ('ಮದುವೆಯಾದವಳು', 'VM'), ('”', 

In [35]:

pred = le.inverse_transform(pred_labels[-100:])           # Predicted
l=[]
for i in range(len(x_test)):
    l.extend(list(zip(x_test[i],pred)))
print(l[-100:])

[('“', 'NN'), ('ನನ್ನ', 'NN'), ('ತಂಟೆಗೆ', 'NN'), ('ಬರಬೇಡ', 'NN'), ('ನಾನೀಗ', 'NN'), ('ಧರ್ಮದತ್ತನ', 'NN'), ('ಮನೆಗೆ', 'SYM'), ('ಹೋಗುತ್ತಿದ್ದೇನೆ', 'NN'), ('ಇದೇ', 'NN'), ('ದಾರಿಯಲ್ಲಿ', 'NN'), ('ಹಿಂದಕ್ಕೆ', 'SYM'), ('ಬರುವೆ', 'NN'), ('ಅದುವರೆಗೆ', 'NN'), ('ಸುಮ್ಮನಿರು', 'NN'), ('”', 'SYM'), ('ಎಂದಳು', 'NN'), ('ಅವಳ', 'NN'), ('ಮಾತಿಗೆ', 'NN'), ('ಕಳ್ಳ', 'SYM'), ('ಗಹ', 'NN'), ('ಗಹಿಸಿ', 'NN'), ('ನಕ್ಕ', 'NN'), ('“', 'NN'), ('ಅವಳೆಲ್ಲಿ', 'NN'), ('ಬರುವಳು', 'SYM'), ('?', 'NN'), ('“', 'NN'), ('ಎಂದುಕೊಂಡ', 'NN'), ('ಆದರೂ', 'NN'), ('ಅವಳನ್ನು', 'NN'), ('ತಡೆಯದೆ', 'SYM'), (',', 'NN'), ('ಹೋಗಲು', 'NN'), ('ಬಿಟ್ಟ', 'NN'), ('ಧರ್ಮದತ್ತನ', 'NN'), ('ಮನೆಗೆ', 'NN'), ('ಅವಳು', 'SYM'), ('ಹೋದಾಗ', 'NN'), ('ಅವನಿಗೆ', 'NN'), ('ತಬ್ಬಿಬ್ಬಾಯಿತು', 'NN'), ('“', 'NN'), ('ಮಾತುಕೊಟ್ಟು', 'NN'), ('ಹೀಗೆ', 'SYM'), ('ಬಂದವರುಂಟೆ', 'NN'), ('?', 'NN'), ('“', 'NN'), ('ಎಂದುಕ಼ೊಂಡ', 'NN'), ('“', 'NN'), ('ಇವಳು', 'NN'), ('ನಡತೆ', 'SYM'), ('ಕೆಟ್ಟವಳಲ್ಲ', 'NN'), ('ನನ್ನ', 'NN'), ('ಸ್ನೇಹಿತನ', 'NN'), ('ತಂಗಿ', 'SYM'), (',', 'NN'), ('ಮದುವೆಯಾದವಳು', 'NN'), ('”', 'NN'), ('ಎಂದು