In [None]:
import os
from sklearn import svm
import numpy as np
import glob
import re
import string
from sklearn.model_selection import train_test_split 
import nltk
from collections import defaultdict
%pip install sklearn_crfsuite
from sklearn_crfsuite import CRF, scorers

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 6.0MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [None]:
#mount
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#move to current working directory
work_dir = '/content/drive/My Drive/Colab Notebooks/NLP/project/'
os.chdir(work_dir)
%ls

100001.txt  100002.txt  [0m[01;34mkok[0m/  konkani.pos  Konkani_POStagger.ipynb


In [None]:
 with open('konkani.pos','r',encoding='utf-8') as f:
      for line in f:
        print(line)

In [None]:
#Class to read dataset
class Data:
  def __init__(self,fname):  
    self.pos_data = []
    self.training_sentences =[] 
    self.test_sentences=[]

    with open(fname,'r',encoding='utf-8') as f:
      for line in f:
        self.pos_data.append(line.strip('\n')[:-1])

  #Splitting the data into train and test
  def split_data(self,split_ratio):
    split_index = int(len(self.pos_data) * split_ratio)
    train_data =  self.pos_data[:split_index]
    test_data = self.pos_data[split_index:]
    return train_data,test_data

  #Separating the token and labels for the training and test dataset
  def create_train_data(self,data,test=False):
    for line in data:
      if test:
        self.test_sentences.append([(x[0],x[1]) for word in line.split(' ') for x in [word.rsplit('/', 1)]])
      else:
        self.training_sentences.append([(x[0],x[1]) for word in line.split(' ') for x in [word.rsplit('/', 1)]])


In [None]:
data = Data('konkani.pos')
train_data , test_data = data.split_data(0.9)
data.create_train_data(train_data)
data.create_train_data(test_data,True)

In [None]:
len(data.training_sentences), len(data.test_sentences)

(686, 77)

In [None]:
#Class to define features for the CRF Model
class Features:
  def __init__(self):
    self.features = dict()

  def _is_punctuation(self,word):
    if word in string.punctuation:
      return "True"
    else:
      return "False"

  def create_word_features(self,sentence,i):
    feature = defaultdict()
    sentence = sentence
    word = sentence[i][0]
    tag = sentence[i][1]
    #morphology related features
    feature= {
        "bias":1.0,
        "word_prefix1":word[:1],
        "word_prefix2":word[:2],
        "word_suffix1":word[-1:],
        "word_suffix2":word[-2:],
        "is_punct": self._is_punctuation(word),
        "is_digit": str(word.isdigit()),
        "word_length": str(len(word)),
        "tag": tag
    }
    #word -1 token & tag
    if i > 0:
      feature["w-1"] = sentence[i-1][0] 
      feature["t-1"] = sentence[i-1][1] 
    else: 
      feature["w-1"] = '_'
      feature["t-1"] = '_'
    #word -2 token & tag
    if i > 1:
      feature["w-2"] = sentence[i-2][0]
      feature["t-2"] = sentence[i-2][1] 
    else:
      feature["w-2"] = '_'
      feature["t-2"] = '_'
    
    #word +1 token & tag
    if i+1 < len(sentence):
      feature["w+1"] = sentence[i+1][0] 
      feature["t+1"] = sentence[i+1][1] 
    else:
      feature["w+1"] = '_'
      feature["t+1"] = '_' 
    #word +2 token & tag
    if i+2 < len(sentence):
      feature["w+2"] = sentence[i+2][0] 
      feature["t+2"] = sentence[i+2][1]
    else:
      feature["w+2"] = '_'
      feature["t+2"] = '_'

    return feature


In [None]:
#Creating the feature set from training data
crf_feature = Features()
feature_set = []
feature_labels = []
for sent in data.training_sentences:
  feature_set.append([crf_feature.create_word_features(sent,i) for i in range(len(sent))])
  feature_labels.append([word[1] for word in sent])

In [None]:
#Training the CRF model
x_train_crf, x_val_crf, y_train_crf, y_val_crf = train_test_split(feature_set,feature_labels,test_size=0.1)
crf_model = CRF(
    algorithm ='lbfgs',
    c1 = 0.1,
    c2 = 0.1,
    max_iterations = 10,
    all_possible_transitions = True 
)

In [None]:
print(x_train_crf[0])
print(y_train_crf[0])

[{'bias': 1.0, 'word_prefix1': 'द', 'word_prefix2': 'द', 'word_suffix1': 'द', 'word_suffix2': 'द', 'is_punct': 'False', 'is_digit': 'False', 'word_length': '1', 'tag': 'N-NNP', 'w-1': '_', 't-1': '_', 'w-2': '_', 't-2': '_', 'w+1': 'हांणी', 't+1': 'PR-PRP', 'w+2': '1899', 't+2': 'QT-QTC'}, {'bias': 1.0, 'word_prefix1': 'ह', 'word_prefix2': 'हा', 'word_suffix1': 'ी', 'word_suffix2': 'णी', 'is_punct': 'False', 'is_digit': 'False', 'word_length': '5', 'tag': 'PR-PRP', 'w-1': 'द', 't-1': 'N-NNP', 'w-2': '_', 't-2': '_', 'w+1': '1899', 't+1': 'QT-QTC', 'w+2': 'वर्सा', 't+2': 'N-NN'}, {'bias': 1.0, 'word_prefix1': '1', 'word_prefix2': '18', 'word_suffix1': '9', 'word_suffix2': '99', 'is_punct': 'False', 'is_digit': 'True', 'word_length': '4', 'tag': 'QT-QTC', 'w-1': 'हांणी', 't-1': 'PR-PRP', 'w-2': 'द', 't-2': 'N-NNP', 'w+1': 'वर्सा', 't+1': 'N-NN', 'w+2': 'आनी', 't+2': 'CC-CCD'}, {'bias': 1.0, 'word_prefix1': 'व', 'word_prefix2': 'वर', 'word_suffix1': 'ा', 'word_suffix2': 'सा', 'is_punct': 

In [None]:
crf_model.fit(x_train_crf,y_train_crf)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=10,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [None]:
#Function to calculate accuracy of predicted labels
def calc_accuracy(y_true,y_pred):
  correct = 0
  total = 0
  for sent_true, sent_pred in zip(y_true, y_pred):
      for true_label, pred_label in zip(sent_true, sent_pred):
          if true_label == pred_label:
              correct += 1
          total += 1
  return correct/total

y_pred_crf = crf_model.predict(x_val_crf)
accuracy_CRF = calc_accuracy(y_val_crf,y_pred_crf)
print("CRF Model Validation Accuracy: ", accuracy_CRF)

CRF Model Validation Accuracy:  0.9940357852882704


In [None]:
#Creating the feature set for test set
X_test_crf=[]
y_test_crf=[]
for sent in data.test_sentences:
  X_test_crf.append([crf_feature.create_word_features(sent,i) for i in range(len(sent))])
  y_test_crf.append([word[1] for word in sent])

#Testing the model for the test set
test_pred_crf = crf_model.predict(X_test_crf)
test_CRF = calc_accuracy(y_test_crf,test_pred_crf)
print("CRF Model Test Accuracy: ", test_CRF)

CRF Model Test Accuracy:  0.9832214765100671


In [None]:
sentence = 'पळेवन-पळेवन तुंकां कांय दिसचें ना'.split(' ')
test = []
test.append([crf_feature.create_word_features(sentence,i) for i in range(len(sentence))])

In [None]:
crf_model.predict(test)

[['CC-CCS', 'PR-PRL', 'N-NNP', 'N-NNP', 'RD-PUNC']]

In [None]:
from gensim.utils import tokenize
from gensim.models import word2vec

#list of files for raw text
raw_txt_files = glob.glob('./kok/*.txt')
print(len(raw_txt_files))
EMB_DIM = 100
# Create an UNK token for unknown words
UNK_INDEX = 0 
UNK_TOKEN = "UNK"

#Class to process data for the RNNs and create word vectors
class NeuralData:
  def __init__(self,files):
    self.data = []
    self.raw_data_text = []
    self.read_text(files)

  def preprocess_text(self,text,fname):
    #Remove the english alphabets from the raw text
    text = re.findall("[^\u0000-\u05C0\u2100-\u214F]+|[.,!?;()]", text)
    #remove half space characters left in the text
    text = list(filter(('\u200c').__ne__, text))
    text = list(filter(('\u200e').__ne__, text))
    return text

  def read_text(self, files):
    count = 0
    for file in files:
      with open(file,'r') as f: 
        #data = f.read().splitlines()
        self.data = f.read()
        self.data = self.preprocess_text(self.data,f.name)
        
        self.raw_data_text.append(self.data)
        count+=1
      len(self.raw_data_text)

  def create_word_vectors(self):
    tokens = word2vec.Word2Vec(self.raw_data_text,size=EMB_DIM,window =5, min_count=1)
    word_vectors = tokens.wv  # get trained embeddings - an KeyedVector instaces
    return word_vectors 

  def data2index(self,word_set,tag_set):
    #use indices from the word & tag list to create an index dictionary
    word_idx = {w: i for i, w in enumerate(word_set)} 
    tag_idx = {t: i for i, t in enumerate(tag_set)}

    # updating the indexes of words for the UNK token
    word_idx = {word: (index + 1) if index >= UNK_INDEX else index 
                for word, index in word_idx.items()}
    word_idx[UNK_TOKEN] = UNK_INDEX
    return word_idx,tag_idx


374


In [None]:
neural_data = NeuralData(raw_txt_files)

In [None]:
#Training the RNN model 
#calculate unique list of words
word_set = list(set([word[0] for sent in data.training_sentences for word in sent]))
word_set.append('ENDPAD')

#calculate unique list of tags
tag_set = list(set([word[1] for sent in data.training_sentences for word in sent]))
tag_set.append('ENDPAD')

#create word and label indices 
word_idx, tag_idx = neural_data.data2index(word_set,tag_set)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
#the max length of the sentence set
maxLength =max([len(sent) for sent in data.training_sentences])

#the following lines will create an index vector for text and labels 
#and add padding to the sentences
#padded sentences are assigned 'ENDPAD' tag
X = [[word_idx[w[0]] for w in s] for s in  data.training_sentences]
X = pad_sequences(maxlen=maxLength, sequences=X, padding="post", value=len(word_set)-1)
y = [[tag_idx[w[1]] for w in s] for s in  data.training_sentences]
y = pad_sequences(maxlen=maxLength, sequences=y, padding="post", value=tag_idx["ENDPAD"])
#create one hot vector of labels
y = [to_categorical(i, num_classes=len(tag_set)) for i in y]

#split data set into train and validation
X_train,X_val, y_train, y_val = train_test_split(X,y,test_size=0.1,random_state=42)

In [None]:
#using Keras to build an RNN
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

input = Input(shape=(maxLength,))
model = Embedding(input_dim=len(word_set), output_dim=50, input_length=maxLength)(input)  # 50-dim embedding
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.2))(model)  # variational biLSTM
out = TimeDistributed(Dense(len(tag_set), activation="softmax"))(model)  # softmax output layer


#training the model
model = Model(input, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()
history = model.fit(X_train, np.array(y_train), batch_size=5, epochs=15, validation_data=(X_val,np.array(y_val)), verbose=1)

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 130)]             0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 130, 50)           115950    
_________________________________________________________________
dropout_9 (Dropout)          (None, 130, 50)           0         
_________________________________________________________________
bidirectional_9 (Bidirection (None, 130, 128)          58880     
_________________________________________________________________
time_distributed_14 (TimeDis (None, 130, 37)           4773      
Total params: 179,603
Trainable params: 179,603
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epo

In [None]:
#create word index vectors for the test set.
X_test = [[word_idx.get(w[0],word_idx['UNK']) for w in s] for s in  data.test_sentences]
X_test_pad = pad_sequences(maxlen=maxLength, sequences=X_test, padding="post", value=len(word_set) - 1)
test_labels = [[tag_idx[w[1]] for w in s] for s in  data.test_sentences]
test_labels_pad = pad_sequences(maxlen=maxLength, sequences=test_labels, padding="post", value=tag_idx["ENDPAD"])


In [None]:
y_pred_test = model.predict(X_test_pad)
p = np.argmax(y_pred_test,axis=-1)

model_test_acc = calc_accuracy(test_labels_pad,p)
print("RNN Model Test accuracy is ", model_test_acc)

RNN Model Test accuracy is  0.9771228771228772


In [None]:
#RNN Model with word embeddings
word_vectors = neural_data.create_word_vectors()
#calculate unique list of words from the word vectors
word_set_wv = list(set([w for w in word_vectors.vocab.keys()]))

word_idx_wv = {k: v.index for k, v in word_vectors.vocab.items()}

# we add one single vector for the Unknown words
embedding_matrix = word_vectors.vectors
unk_vector = embedding_matrix.mean(0)

embedding_matrix = np.insert(embedding_matrix, [UNK_INDEX], [unk_vector], axis=0)

# updating the indexes of words that follow the new word
word_idx_wv = {word: (index + 1) if index >= UNK_INDEX else index 
            for word, index in word_idx_wv.items()}
word_idx_wv[UNK_TOKEN] = UNK_INDEX


In [None]:
#Red-doing the word index vectors for training sentences with the word vector vocabulary. 
#This isnt repeated for the test since the labels are the same
X_wts = [[word_idx_wv.get(w[0],word_idx_wv['UNK']) for w in s] for s in  data.training_sentences]
X_pad_wts = pad_sequences(maxlen=maxLength, sequences=X_wts, padding="post", value=len(word_idx_wv)-1)

#split data set into train and validation
X_train_wts,X_val_wts, y_train_wts, y_val_wts = train_test_split(X_pad_wts,y,test_size=0.1,random_state=42)

In [None]:
#RNN model using pre-trained word embeddings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Layer

vocab_length = len(embedding_matrix)
weights_model = Sequential()
#model.add(Input(shape=(maxLength,)))
weights_model.add(Embedding(input_dim=vocab_length, output_dim=EMB_DIM, weights=[embedding_matrix] ,input_length=maxLength , trainable=False))
weights_model.add(Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.2)))
weights_model.add(TimeDistributed(Dense(64, activation="tanh")))
weights_model.add(Dropout(0.2))
weights_model.add(TimeDistributed(Dense(len(tag_set), activation="softmax")))
weights_model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
print(weights_model.summary())


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 130, 100)          13852100  
_________________________________________________________________
bidirectional_8 (Bidirection (None, 130, 128)          84480     
_________________________________________________________________
time_distributed_12 (TimeDis (None, 130, 64)           8256      
_________________________________________________________________
dropout_8 (Dropout)          (None, 130, 64)           0         
_________________________________________________________________
time_distributed_13 (TimeDis (None, 130, 37)           2405      
Total params: 13,947,241
Trainable params: 95,141
Non-trainable params: 13,852,100
_________________________________________________________________
None


In [None]:
weights_model.fit(X_train_wts, np.array(y_train_wts), batch_size=10, epochs=20, validation_data=(X_val_wts, np.array(y_val_wts)), verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f612b0cb710>

In [None]:
#Creating the test set with the word vectors
X_test_wts = [[word_idx_wv.get(w[0],word_idx_wv['UNK']) for w in s] for s in  data.test_sentences]
X_testwts_pad = pad_sequences(maxlen=maxLength, sequences=X_test_wts, padding="post", value=len(word_idx_wv) - 1)
test_labels = [[tag_idx[w[1]] for w in s] for s in  data.test_sentences]
test_labels_pad = pad_sequences(maxlen=maxLength, sequences=test_labels, padding="post", value=tag_idx["ENDPAD"])


In [None]:
y_pred_wts = weights_model.predict(X_testwts_pad)
p_wts = np.argmax(y_pred_wts,axis=-1)
#Calculating model accuracy on test
model_wts_acc = calc_accuracy(test_labels_pad,p_wts)
print("CNN Model with word embeddings Test accuracy is ", model_wts_acc)

CNN Model with word embeddings Test accuracy is  0.948951048951049
