In [None]:
!pip install emoji

import tensorflow as tf
import pandas as pd
import re
import emoji
import numpy as np
import matplotlib.pyplot as plt
import io

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
vocab_size = 2170
embedding_dim = 300
max_length = 20
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 700
num_epochs = 128

In [None]:
#All of the sinhala chars availble in the current Sinhala keyboard.
sinhalaChars = ["අ", "ආ", "ඇ", "ඈ", "ඉ", "ඊ","උ", "ඌ", "ඍ", "ඎ", "ඏ", "ඐ","එ", "ඒ", "ඓ", "ඔ", "ඕ", "ඖ","ං", "ඃ",
"ක", "ඛ", "ග", "ඝ", "ඞ", "ඟ","ච", "ඡ", "ජ", "ඣ", "ඤ", "ඥ", "ඦ","ට", "ඨ", "ඩ", "ඪ", "ණ", "ඬ","ත", "ථ", "ද", "ධ",
"න", "ඳ","ප", "ඵ", "බ", "භ", "ම", "ඹ", "ය", "ර", "ල", "ව","ශ", "ෂ", "ස", "හ", "ළ", "ෆ","෴", "\u200d"]

#\u200d is known as the ZERO WIDTH JOINER. It is required in special cases when using Sinhala language.
sinhalaVowels = ["්", "ා", "ැ", "ෑ", "ි", "ී", "ු", "ූ", "ෘ", "ෙ", "ේ", "ෛ", "ො", "ෝ","ෞ", "ෟ", "ෲ", "ෳ", "ර්‍"]

#A dictonary created to validate the vowel errors in the texts
vowelsFixed = {"ෙ" + "්": "ේ", "්" + "ෙ": "ේ", "ෙ" + "ා": "ො", "ා" + "ෙ": "ො", "ේ" + "ා": "ෝ", "ො" + "්": "ෝ", "ෙෙ": "ෛ",
"ෘෘ": "ෲ", "ෙ" + "ෟ": "ෞ", "ෟ" + "ෙ": "ෞ", "ි" + "ී": "ී", "ී" + "ි": "ී", "ේ" + "්": "ේ", "ේ" + "ෙ": "ේ", "ො" + "ා": "ො",
"ො" + "ෙ": "ො", "ෝ" + "ා": "ෝ", "ෝ" + "්": "ෝ", "ෝ" + "ෙ": "ෝ", "ෝ" + "ේ": "ෝ", "ෝ" + "ො": "ෝ", "ෞ" + "ෟ": "ෞ",
"ෞ" + "ෙ": "ෞ", "ො" + "ෟ": "ෞ", "ෟ" + "ො": "ෞ",}

#A dictonary created to simplify special characters used in sinhala words to minimize the variety
simplifiedChars = {"ඛ": "ක", "ඝ": "ග", "ඟ": "ග", "ඡ": "ච", "ඣ": "ජ", "ඦ": "ජ", "ඤ": "ඥ", "ඨ": "ට", "ඪ": "ඩ", "ණ": "න",
"ඳ": "ද", "ඵ": "ප", "භ": "බ", "ඹ": "බ", "ශ": "ෂ", "ළ": "ල", "ආ": "අ", "ඈ": "ඇ", "ඊ": "ඉ", "ඌ": "උ", "ඒ": "එ", "ඕ": "ඔ",
"ා": "", "ෑ": "ැ", "ී": "ි", "ූ": "ු", "ේ": "ෙ", "ෝ": "ො", "ෲ": "ෘ"}

def isSinhalaLetter(char: str) -> bool:
  return char in sinhalaChars

def isSinhalaVowel(char: str) -> bool:
  return char in sinhalaVowels

def getFixedVowel(vowel: str) -> str:
  return vowelsFixed[vowel]

def getSimplifiedChar(character: str) -> str:
  if len(character) != 1:
    raise TypeError("character should be a string with length 1")
  try:
    return simplifiedChars[character]
  except KeyError:
    return character

In [None]:
def replaceUrl(text: str) -> str:
  return re.sub(r'(http://www\.|https://www\.|http://|https://)[a-z0-9]+([\-.]{1}[a-z0-9A-Z/]+)*', '', text)

def removeRetweetState(text: str) -> str:
  return re.sub(r'^RT @\w*: ', '', text)

def replaceMention(text: str) -> str:
  return re.sub(r'@\w*', '', text)

def splitTokens(text: str) -> list:
  emojis = ''.join(emj for emj in emoji.EMOJI_DATA.keys())
  return [token for token in re.split(r'[.…,‌ ¸‚\"/|—¦”‘\'“’´!@#$%^&*+\-£?˜()\[\]{\}:;–Ê  �‪‬‏0123456789' + emojis + ']', text) if token != ""]

def setSpacesAmongEmojis(text: str) -> str:
  modified_text = ""
  for c in text:
    modified_text += c
    if c in emoji.UNICODE_EMOJI:
      modified_text += " "
  return modified_text

def simplifySinhalaText(text: str) -> str:
  modified_text = ""
  for c in text:
    modified_text += getSimplifiedChar(c)
  return modified_text

def stemWord(word: str) -> str:
  if len(word) < 4:
    return word
  # remove 'ට'
  if word[-1] == 'ට':
    return word[:-1]
  # remove 'ද'
  if word[-1] == 'ද':
    return word[:-1]
  # remove 'ටත්'
  if word[-3:] == 'ටත්':
    return word[:-3]
  # remove 'එක්'
  if word[-3:] == 'ෙක්':
    return word[:-3]
  # remove 'එ'
  if word[-1:] == 'ෙ':
    return word[:-1]
  # remove 'ක්'
  if word[-2:] == 'ක්':
    return word[:-2]
  # remove 'ගෙ' (instead of ගේ because this step comes after simplifying text)
  if word[-2:] == 'ගෙ':
    return word[:-2]
  # else
  return word

def filterText(text: str) -> list:
  return [stemWord(token) for token in splitTokens(replaceUrl(replaceMention(simplifySinhalaText(removeRetweetState(text.strip('"')).lower()))))]

def simplifySinhala(rawtext: str) -> str:
    simplifiedList = filterText(rawtext)
    simplifiedStr = ""

    for word in simplifiedList:
        simplifiedStr = simplifiedStr + word + " "

    return simplifiedStr.rstrip()

In [None]:
sentences = []
labels = []

url = "https://raw.githubusercontent.com/RusiruWijethilake/DepFlow/main/dataset.csv"
df = pd.read_csv(url)

labels = df['label']

for sent in df['text']:
  sentences.append(simplifySinhala(sent))

In [None]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

vocab_size = len(word_index)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(24, activation='tanh'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
  
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(training_padded[0]))
print(training_sentences[2])
print(labels[2])

In [None]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [None]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

In [None]:
!mkdir -p saved_model
model.save('saved_model/depflow_model_trained')
model.save('depflow_trained_model.h5')

In [None]:
sentence = [simplifySinhala("මට ඇති මේ දුක දරන් හිටියා."), simplifySinhala("මට සතුටුයි හොදටම")]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))

In [56]:
def check_depressive(post: str):
  post_sequence = tokenizer.texts_to_sequences(simplifySinhala(post))
  padded_post_sequence = pad_sequences(post_sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)
  post_prediction = model.predict(padded_post_sequence)
  label = post_prediction.max().round()
  if label >= 1 :
    print(post, " : is a depressive post")
  else:
    print(post, " : is not a depressive post")

check_depressive("මට ඇති මේ දුක දරන් හිටියා.")
check_depressive("මම මැරිලම යන්නම්")
check_depressive("මම මරිලා ගියාම හැමොටම හොදයි.")
check_depressive("දුක කියන්නෙ හෙට වැඩට යන්න තිබීමයි")
check_depressive("අපි හෙට උදේට මොනවද කන්නේ?")
check_depressive('ඇයි මට මෙච්චර දුකක් දෙන්නේ දෙවියනේ')
check_depressive('ඔයාටනම් ඉතින් හිනා')
check_depressive('මම හිතන්නේ මට තනිකම දැනෙනවා වැඩී')
check_depressive('mata godak dukayi')
check_depressive('I feel like I want to cry')
check_depressive('මම මේ දවස් ටිකේම දුක දරාගන්න බැරුව හොදටම ඇඩුවා')

import subprocess

subprocess.run("pbcopy", text=True, input=str(word_index))

මට ඇති මේ දුක දරන් හිටියා.  : is a depressive post
මම මැරිලම යන්නම්  : is a depressive post
මම මරිලා ගියාම හැමොටම හොදයි.  : is a depressive post
දුක කියන්නෙ හෙට වැඩට යන්න තිබීමයි  : is a depressive post
අපි හෙට උදේට මොනවද කන්නේ?  : is a depressive post
ඇයි මට මෙච්චර දුකක් දෙන්නේ දෙවියනේ  : is a depressive post
ඔයාටනම් ඉතින් හිනා  : is not a depressive post
මම හිතන්නේ මට තනිකම දැනෙනවා වැඩී  : is a depressive post
mata godak dukayi  : is a depressive post
I feel like I want to cry  : is a depressive post
මම මේ දවස් ටිකේම දුක දරාගන්න බැරුව හොදටම ඇඩුවා  : is a depressive post


CompletedProcess(args='pbcopy', returncode=0)