In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from keras.utils.data_utils import pad_sequences
import pickle

In [2]:
# Saved Keras model file; loaded below with tf.keras.models.load_model.
model_path = '../../models/biLSTM/normalCase/bilstm.h5'
# Pickled tokenizer used to convert text into integer sequences.
tokenizer_path = '../../models/biLSTM/normalCase/tokenizerBilstm.pickle'
# `maxlen` handed to pad_sequences before every prediction in this notebook.
padding_length = 250

# Disabled GPU-allocator override, kept for reference.
# NOTE(review): `os` is not imported in the import cell; import it before re-enabling this line.
# os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [3]:
## Load the trained model from `model_path` and print a summary of its architecture.
# The load call runs inside a `tf.device('/cpu:0')` scope.
with tf.device('/cpu:0'):
    loaded_model = tf.keras.models.load_model(model_path)

# Prints the layer table (layer types, output shapes, parameter counts).
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 300)          23180100  
                                                                 
 dropout (Dropout)           (None, 250, 300)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              186880    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3)                 387       
                                                                 
Total params: 23,367,367
Trainable params: 187,267
Non-trainable params: 23,180,100
_________________________________________________________________


In [5]:
# Render the loaded model's layer graph to a PNG file, showing tensor shapes and layer names.
# plot_model needs the optional `pydot` package and a graphviz installation to work.
# NOTE(review): the output file is called 'bigru_attn_plot.png' although the model is loaded
# from a biLSTM path -- presumably a leftover from another notebook; confirm before renaming.
# NOTE(review): this import sits mid-notebook; it would normally live in the first import cell.
from keras.utils.vis_utils import plot_model
plot_model(loaded_model, to_file='bigru_attn_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [4]:
## Restore the tokenizer stored at `tokenizer_path`.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open(tokenizer_path, mode='rb') as pickled_tokenizer:
    tokenizer = pickle.load(pickled_tokenizer)


In [6]:
# Sanity check: print the model's class scores for one mixed-language phrase.
phrase = "Ko ngā ērā This is a trial Ko ngā ērā This is a trial Ko ngā ērā This is a trial"
# Text -> integer token ids -> sequence padded to `padding_length`.
seq = tokenizer.texts_to_sequences([phrase])
padded = pad_sequences(seq, maxlen=padding_length)
# predict() returns one row per input; format that row's three scores as percentages.
print([f'{r*100:.2f}%' for r in loaded_model.predict(padded)[0]])

['30.87%', '68.22%', '0.91%']


In [7]:
# Hybrid-model ***MODEL 2***
# classes are [0] for Bilingual; [1] for Maori; [2] for English
#
# For every sample the model predicts as bilingual, each word is labelled
# (hand-crafted character rules first, the model as a fallback) and every
# position where two neighbouring words carry different labels is reported.

# A few random samples ### Change these sentences
use_samples = [
    'Ko ngā ērā This is a trial',
    'The winners will be chosen by their kaitiaki (tribal guardians)',
    'Running very late, been here almost 30 mins. Haere mai please Flyer.',
    'Ko ngā ngeru ērā',
    'Great workshop in Nelson today, thanks to iwi, central + regional gov, community groups, NGOs & industry who took part',
    'good morning',
]

for sample in use_samples:
    sample_input = pad_sequences(tokenizer.texts_to_sequences([sample]), maxlen=padding_length)
    sample_class = int(np.argmax(loaded_model.predict(sample_input), axis=1)[0])
    if sample_class != 0:
        # Only sentences predicted as bilingual are inspected word by word.
        continue

    print(" ")
    print("Bilingual sentence:", sample)

    words = sample.split()
    word_classes = []
    for word in words:
        if re.search(u'[āēīōūĀĒĪŌŪ]', word):  # hand-crafted rule: macron vowel => Maori
            word_class = 1
        elif re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', word):  # hand-crafted rule => English
            word_class = 2
        else:
            # No rule fired: let the model classify the single word.
            word_input = pad_sequences(tokenizer.texts_to_sequences([word]), maxlen=padding_length)
            word_class = int(np.argmax(loaded_model.predict(word_input), axis=1)[0])
        word_classes.append(word_class)

    # Report every neighbouring pair of words whose labels differ.
    for position in range(1, len(words)):
        if word_classes[position - 1] != word_classes[position]:
            print("code-switch detected after the word",
                  "{", words[position - 1], "} and {", words[position], "}")


2022-06-26 13:52:11.961795: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8101
2022-06-26 13:52:13.232222: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


 
Bilingual sentence: Ko ngā ērā This is a trial
code-switch detected after the word { Ko } and { ngā }
code-switch detected after the word { ērā } and { This }
 
Bilingual sentence: The winners will be chosen by their kaitiaki (tribal guardians)
code-switch detected after the word { their } and { kaitiaki }
code-switch detected after the word { kaitiaki } and { (tribal }
 
Bilingual sentence: Running very late, been here almost 30 mins. Haere mai please Flyer.
code-switch detected after the word { mins. } and { Haere }
code-switch detected after the word { mai } and { please }
 
Bilingual sentence: Great workshop in Nelson today, thanks to iwi, central + regional gov, community groups, NGOs & industry who took part
code-switch detected after the word { to } and { iwi, }
code-switch detected after the word { iwi, } and { central }


In [8]:
# classes are [0] for Bilingual; [1] for Maori; [2] for English
def detectCodeSwitchingPoint(x: str):
    """Locate code-switch positions in a sentence the model predicts as bilingual.

    The whole sentence is classified first. Only when the predicted class is 0
    (bilingual) is every whitespace-separated word labelled:

    * a word containing a macron vowel is labelled 1 (Maori);
    * otherwise a word containing one of the letters in the hand-crafted
      English set is labelled 2 (English) -- presumably letters that do not
      occur in Maori spelling;
    * otherwise the model classifies the single word.

    A switch point is the index of a word whose label differs from the label
    of the word before it. Switch points are grouped two at a time, in order.

    Args:
        x: the sentence to analyse.

    Returns:
        A list of groups of word indices, e.g. ``[[1, 3], [8, 10]]``. The last
        group holds a single index when the number of switch points is odd.
        An empty list is returned when the sentence is not predicted as
        bilingual, or when it contains no switch point (including a sentence
        with no words at all).
    """
    seq = tokenizer.texts_to_sequences([x])
    padded = pad_sequences(seq, maxlen=padding_length)
    predict = loaded_model.predict(padded)
    if int(np.argmax(predict, axis=1)[0]) != 0:
        return []

    word_classes = []
    for word in x.split():
        if re.search(u'[āēīōūĀĒĪŌŪ]', word):  # adding hand crafted rules
            word_classes.append(1)
        elif re.search(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]', word):
            word_classes.append(2)
        else:
            word_seq = tokenizer.texts_to_sequences([word])
            word_padded = pad_sequences(word_seq, maxlen=padding_length)
            word_scores = loaded_model.predict(word_padded)
            word_classes.append(int(np.argmax(word_scores, axis=1)[0]))

    # Computed once, after all words are labelled. The original ran this inside
    # the per-word loop and left the result undefined for a sentence without words.
    switch_points = [
        idx + 1
        for idx in range(len(word_classes) - 1)
        if word_classes[idx] != word_classes[idx + 1]
    ]
    # Group consecutive switch points two at a time.
    return [switch_points[k:k + 2] for k in range(0, len(switch_points), 2)]


In [9]:
# classes are [0] for Bilingual; [1] for Maori; [2] for English

def sentenceCategory(sentence: str) -> int:
    """Return the model's predicted class index for ``sentence``.

    The text is tokenised, padded to ``padding_length`` and passed to the
    loaded model; the index of the highest score is returned
    (0 = bilingual, 1 = Maori, 2 = English).
    """
    token_ids = tokenizer.texts_to_sequences([sentence])
    model_input = pad_sequences(token_ids, maxlen=padding_length)
    class_scores = loaded_model.predict(model_input)
    return int(np.argmax(class_scores, axis=1)[0])

# Hand-crafted character rules, compiled once instead of on every call.
# A macron vowel marks a word as Maori.
_MACRON_PATTERN = re.compile(u'[āēīōūĀĒĪŌŪ]')
# Letters treated as evidence of English -- presumably letters not used in Maori spelling.
_ENGLISH_ONLY_PATTERN = re.compile(u'[bBcCdDfFgGjJlLqQsSvVxXyYzZ]')


def detectCodeSwitchingPointDynamicWindowVersion(x: str, w: int) -> list:
    """Label every word of ``x`` by classifying windows of shrinking size.

    The sentence is split on whitespace and scanned in consecutive windows of
    ``w`` words. A window is accepted as a whole when the model's prediction
    and the character rules agree (class 1 with no English-only letter, or
    class 2 with no macron vowel). Otherwise the window is re-analysed
    recursively with a smaller window size, down to dedicated handling of
    two-word and one-word inputs.

    Args:
        x: the sentence to analyse.
        w: window size in words. It is limited to the range
           ``1 .. number_of_words - 1`` for inputs of three or more words and
           ignored for shorter inputs. Values below 1 are raised to 1; the
           original loop never advanced for such values.

    Returns:
        A list with one label per word: 1 for Maori, 2 for English. A single
        word that matches neither character rule gets whatever
        ``sentenceCategory`` returns for it, which may be 0. An input without
        words yields ``[]``.
    """
    wordsList = x.split()
    end = len(wordsList)

    if end < 1:
        return []

    if end == 1:
        if _MACRON_PATTERN.search(x):
            return [1]
        if _ENGLISH_ONLY_PATTERN.search(x):
            return [2]
        return [sentenceCategory(x)]

    if end == 2:
        first, second = wordsList
        if _MACRON_PATTERN.search(x):
            # At least one word has a macron: decide from the macrons alone.
            if _MACRON_PATTERN.search(first):
                return [1, 1] if _MACRON_PATTERN.search(second) else [1, 2]
            return [2, 1]
        # No macron anywhere: classify the pair once and reuse the result.
        pair_category = sentenceCategory(x)
        if pair_category == 1 and not _ENGLISH_ONLY_PATTERN.search(x):
            return [1, 1]
        if pair_category == 2:
            return [2, 2]
        if sentenceCategory(first) == 1 and not _ENGLISH_ONLY_PATTERN.search(first):
            return [1, 2]
        return [2, 1]

    # Three or more words: the window must be smaller than the sentence so a
    # recursive call always works on a strictly smaller problem, and at least
    # one word wide so the scan always advances.
    w = max(1, min(w, end - 1))

    result = []
    ptr = 0
    while ptr < end:
        thisWindow = wordsList[ptr:ptr + w]
        span = len(thisWindow)  # shorter than w only for the final window
        window_text = " ".join(thisWindow)
        # One inference per window; the original could run it twice.
        window_category = sentenceCategory(window_text)
        if window_category == 1 and not _ENGLISH_ONLY_PATTERN.search(window_text):
            result.extend([1] * span)
        elif window_category == 2 and not _MACRON_PATTERN.search(window_text):
            result.extend([2] * span)
        else:
            # Mixed or contradictory window: retry with a smaller window.
            # Even windows of four or more words shrink by two, others by one.
            narrower = span - 2 if (span >= 4 and span % 2 == 0) else span - 1
            result.extend(detectCodeSwitchingPointDynamicWindowVersion(window_text, narrower))
        ptr += span
    return result


In [10]:
# Run both detectors on the same mixed-language phrase for a side-by-side comparison.
phrase = "Ko ngā ērā This is a trial Ko ngā ērā This is a trial Ko ngā ērā This is a trial"
# Dynamic-window detector with a window of 5 words: prints one label per word.
print(detectCodeSwitchingPointDynamicWindowVersion(phrase, 5))
# Rule/model hybrid detector: prints groups of word indices where the label changes.
print(detectCodeSwitchingPoint(phrase))

[1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2]
[[1, 3], [8, 10], [15, 17]]


In [11]:
# Load the labelled sample and turn the sentence-level letter codes in
# `Labels_Final` into the model's class indices (B -> 0, M -> 1, P -> 2).
df = (
    pd.read_csv("../../small_data.csv")
    .replace({'Labels_Final': {'P': 2, 'M': 1, 'B': 0}})
    .astype({'Labels_Final': int})
)
df.head()

Unnamed: 0,id,number,text,label,Labels_Final
0,H20031118,36,Will the Tertiary Education Commission be meas...,"P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,M,P,P,P,P,P",0
1,H20031118,54,What progress is being made on Treaty of Waita...,"P,P,P,P,P,P,P,P,M,P",0
2,H20031118,59,We will also be shortly signing another deed o...,"P,P,P,P,P,P,P,P,P,P,P,M,M",0
3,H20031118,71,"The Office of Treaty Settlements, with Te Puni...","P,P,P,P,P,P,M,M,M,P,P,P,P,P,P,P,P,P,P,P,P,P,P,...",0
4,H20031118,74,When will the Minister undertake a comprehensi...,"P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,...",0


In [13]:
# Evaluate the dynamic-window detector against the gold labels in `df`
# for window sizes 2..9.
#
# Everything that does not depend on the window size is computed once, up front.
texts = df['text'].tolist()
gold_sentence_labels = df['Labels_Final'].tolist()

# Per-word gold tags as digit strings: "P" -> "2" (English), "M" -> "1" (Maori).
gold_word_tags = [
    [tag.replace("P", "2").replace("M", "1") for tag in tags.split(",")]
    for tags in df['label']
]

# NOTE: words are counted by splitting on a single space, whereas the detector
# splits on any whitespace; the two counts can differ for irregular spacing.
total_words = df['text'].apply(lambda x: len(str(x).split(' '))).sum()

# Sentence-level errors: the predicted sentence class differs from `Labels_Final`.
# The original printed this counter but never updated it, so it always showed 0.
sentence_label_error = sum(
    sentenceCategory(text) != gold_label
    for text, gold_label in zip(texts, gold_sentence_labels)
)

for window in range(2, 10):
    word_label_error = 0

    for text, gold_tags in zip(texts, gold_word_tags):
        predicted = detectCodeSwitchingPointDynamicWindowVersion(text, window)
        # zip stops at the shorter sequence, so words beyond it are not scored.
        word_label_error += sum(
            label != int(gold) for label, gold in zip(predicted, gold_tags)
        )

    print(" ")
    print("------------------------------------------")
    print("Window size: ", window)
    print("Total sentence label error", sentence_label_error)
    print("Total number of words",  total_words)
    print("Total word label error in bilingual sentences", word_label_error)


 
------------------------------------------
Window size:  2
Total sentence label error 0
Total number of words 2872
Total word label error in bilingual sentences 33
 
------------------------------------------
Window size:  3
Total sentence label error 0
Total number of words 2872
Total word label error in bilingual sentences 36
 
------------------------------------------
Window size:  4
Total sentence label error 0
Total number of words 2872
Total word label error in bilingual sentences 30
 
------------------------------------------
Window size:  5
Total sentence label error 0
Total number of words 2872
Total word label error in bilingual sentences 38
 
------------------------------------------
Window size:  6
Total sentence label error 0
Total number of words 2872
Total word label error in bilingual sentences 31
 
------------------------------------------
Window size:  7
Total sentence label error 0
Total number of words 2872
Total word label error in bilingual sentences 31
 
--