In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from keras.utils.data_utils import pad_sequences
import pickle

In [13]:
model_path = '../../models/biLSTM/normalCase/bilstmSize2.h5'
tokenizer_path = '../../models/biLSTM/normalCase/tokenizerBilstmSize2.pickle'
padding_length = 2

In [14]:
## loading trained model. A summary of the model architecture is also presented.
with tf.device('/cpu:0'):
    loaded_model = tf.keras.models.load_model(model_path)

loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 300)            28987200  
                                                                 
 dropout (Dropout)           (None, 2, 300)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              186880    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3)                 387       
                                                                 
Total params: 29,174,467
Trainable params: 187,267
Non-trainable params: 28,987,200
_________________________________________________________________


In [15]:
from keras.utils.vis_utils import plot_model
plot_model(loaded_model, to_file='bigru_attn_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [16]:
## loading tokenizer. 
with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)


In [17]:
# classes are [0] for Bilingual; [1] for Maori; [2] for English
def detectCodeSwitchingPoint(x: str):
    seq = tokenizer.texts_to_sequences([x])
    padded = pad_sequences(seq, maxlen=padding_length)
    predict = loaded_model.predict(padded)
    classes = np.argmax(predict, axis=1)
    if classes == 0:
        y = x.split()
        cw = []
        wb = []
        for i in y:
            if re.search(u'[āēīōūĀĒĪŌŪ]', i):  # adding hand crafted rules
                cw.append(1)
                wb.append(i)
                continue
            elif re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', i):
                cw.append(2)
                wb.append(i)
            else:
                seq1 = tokenizer.texts_to_sequences([i])
                padded1 = pad_sequences(seq1, maxlen=padding_length)
                predict1 = loaded_model.predict(padded1)
                classw = np.argmax(predict1, axis=1)
                classw = int(classw[0])
                cw.append(classw)
                wb.append(i)

            result = []
            flag = True
            for c in range(len(cw)-1):
                if cw[c] == cw[c+1]:
                    continue
                elif cw[c] != cw[c+1]:
                    if flag:
                        result.append([c+1])
                    else:
                        result[-1].append(c+1)
                    flag = not flag

        return result
    else:
        return []


In [18]:
# classes are [0] for Bilingual; [1] for Maori; [2] for English

def sentenceCategory(sentence: str) -> int:
  seq = tokenizer.texts_to_sequences([sentence])
  padded = pad_sequences(seq, maxlen=padding_length)
  predict = loaded_model.predict(padded) 
  classw = np.argmax(predict,axis=1)
  return int(classw[0])

def detectCodeSwitchingPointDynamicWindowVersion(x: str, w: int) -> list():
    wordsList = x.split()
    end = len(wordsList)
    if w >= end and end > 2:
        w = end - 1
    elif end == 1:
        w = 1
    elif end == 2:
        w = 2
    else:
        pass

    if end < 1:
        return []

    elif end == 1:
        if re.search(u'[āēīōūĀĒĪŌŪ]', x):
            return [1]
        elif re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', x):
            return [2]
        else:
            return [sentenceCategory(x)]

    elif end == 2:
        if not re.search(u'[āēīōūĀĒĪŌŪ]', x):
            if sentenceCategory(x) == 1 and not re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', x):
                return [1, 1]
            elif sentenceCategory(x) == 2:
                return [2, 2]
            else:
                if sentenceCategory(wordsList[0]) == 1 and not re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', wordsList[0]):
                    return [1, 2]
                else:
                    return [2, 1]
        else:
            if re.search(u'[āēīōūĀĒĪŌŪ]', wordsList[0]) and re.search(u'[āēīōūĀĒĪŌŪ]', wordsList[1]):
                return [1, 1]
            if re.search(u'[āēīōūĀĒĪŌŪ]', wordsList[0]) and not re.search(u'[āēīōūĀĒĪŌŪ]', wordsList[1]):
                return [1, 2]
            else:
                return [2, 1]
    
    else:
        result = []
        ptr = 0
        while ptr < end:
            thisWindow = wordsList[ptr:ptr+w]
            if ptr + w > end:
                w = end - ptr
            else:
                pass
            if sentenceCategory(" ".join(thisWindow)) == 1 and not re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', " ".join(thisWindow)):
                result.extend([1 for _ in range(w)])
            elif sentenceCategory(" ".join(thisWindow)) == 2 and not re.search(u'[āēīōūĀĒĪŌŪ]', " ".join(thisWindow)):
                result += [2 for _ in range(w)]
            else:
                if w >= 4 and w % 2 == 0:
                    result += detectCodeSwitchingPointDynamicWindowVersion(" ".join(thisWindow), w-2)
                else:
                    result += detectCodeSwitchingPointDynamicWindowVersion(" ".join(thisWindow), w-1)
            ptr += w
        return result


In [19]:
phrase = "Ko ngā ērā This is a trial Ko ngā ērā This is a trial Ko ngā ērā This is a trial"
print(detectCodeSwitchingPointDynamicWindowVersion(phrase, 5))

2022-06-17 15:47:09.691206: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-06-17 15:47:10.404552: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


[1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2]


In [20]:
df = pd.read_csv("../../small_data.csv")

df.drop(columns=['id', 'number', 'Labels_Final'], inplace=True)

newDf = pd.DataFrame(columns=['text', 'label'])

WINDOW_SIZE = 2

for row in df.itertuples():
    text = row.text.split(' ')
    label = row.label.split(',')
    if len(text) > len(label):
        text = text[:len(label)]
    elif len(text) < len(label):
        label = label[:len(text)]
    else:
        pass
    
    for i in range(0, len(text), WINDOW_SIZE):
        newDf.loc[len(newDf)] = [' '.join(text[i:i+WINDOW_SIZE]), ','.join(label[i:i+WINDOW_SIZE])]

def labelGen(text: str) -> str:
    """Read labels and generate the final label"""
    if "P" in text and "M" in text:
        return "B"  # Bilingual
    elif "P" in text:
        return "P"  # English or English + numbers
    elif "M" in text:
        return "M"  # Māori or Māori + numbers
    elif "N" in text:
        return "N" # Pure numbers
    else:
        return "U"  # Unknown

newDf['Labels_Final'] = newDf['label'].apply(labelGen)

df = newDf.replace({'Labels_Final': {'P': 2, 'M': 1, 'B':0}})
df['Labels_Final'] = df['Labels_Final'].astype(int)
df.head()

Unnamed: 0,text,label,Labels_Final
0,Will the,"P,P",2
1,Tertiary Education,"P,P",2
2,Commission be,"P,P",2
3,measuring the,"P,P",2
4,improvement in,"P,P",2


In [21]:
sentence_label_error = 0
word_label_error = 0

for ind, row in df.iterrows():
    x = row['text']
    l = row['Labels_Final']
    lw = row['label']
    ly = lw.split(",")
    ly = [item.replace("P", "2") for item in ly]
    ly = [item.replace("M", "1") for item in ly]

    for i, j in zip(detectCodeSwitchingPointDynamicWindowVersion(x, 2), ly):
        if i != int(j):
            word_label_error += 1
            # break
        

total_words = df['text'].apply(lambda x: len(str(x).split(' '))).sum()


print(" ")
print("------------------------------------------")
print("Total sentence label error", sentence_label_error)
print(" ")
print("Total number of words",  total_words)
print("Total word label error in bilingual sentences", word_label_error)


 
------------------------------------------
Total sentence label error 0
 
Total number of words 2872
Total word label error in bilingual sentences 37
