In [19]:
#include libraries
import re
import numpy as np
from scipy.sparse import csc_matrix
from syllable import Encoder
import math
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
 
warnings.filterwarnings(action = 'ignore')
 
import gensim
from gensim.models import Word2Vec
import multiprocessing

In [20]:
#generate ngrams function
def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()

    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]

    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [21]:
# convert turkish letters to english letters
def turkish_to_english(string):
    choices = {"İ": "I", "ş": "s", "Ş": "S", "ı": "i", "ö": "o", "ü": "u", "ç": "c", "ğ": "g", "Ç": "C", "Ö": "O", "Ü": "U", "Ğ": "G", "â": "a", "î": "i", "û": "u", "ê": "e", "ô": "o", "Â": "A", "Î": "I", "Û": "U", "Ê": "E", "Ô": "O", "â": "a", "î": "i", "û": "u", "ê": "e", "ô": "o", "Â": "A", "Î": "I", "Û": "U", "Ê": "E", "Ô": "O", "â": "a", "î": "i", "û": "u",
               "ê": "e", "ô": "o", "Â": "A", "Î": "I", "Û": "U", "Ê": "E", "Ô": "O", "â": "a", "î": "i", "û": "u", "ê": "e", "ô": "o", "Â": "A", "Î": "I", "Û": "U", "Ê": "E", "Ô": "O", "â": "a", "î": "i", "û": "u", "ê": "e", "ô": "o", "Â": "A", "Î": "I", "Û": "U", "Ê": "E", "Ô": "O", "â": "a", "î": "i", "û": "u", "ê": "e", "ô": "o", "Â": "A", "Î": "I", "Û": "U"}
    for i in range(len(string)):
        string = string.replace(
            string[i:i+1], choices.get(string[i], string[i]))
    return string

In [22]:
# parse string into syllables
consonant = ["b", "c", "d", "g", "ğ", "j", "l", "m", "n", "r",
             "v", "y", "z", "ç", "f", "h", "k", "p", "s", "ş", "t"]
vowel = ["a", "ı", "o", "u", "e", "i", "ö", "ü"]

# params chosen for demonstration purposes
encoder = Encoder(lang="tr", limitby="vocabulary", limit=3000)

def parse_syllable(string):
    string = turkish_to_english(string)
    return encoder.tokenize(string)

In [23]:
def parse_string_two(string):
    return string.split(" ")[1]
def parse_string_three(string):
    return string.split(" ")[2]

In [24]:
def count_element_matrix(sparse_matrix, element):
    count = 0
    for i in sparse_matrix.data:
        if(i == element):
            count += 1
    return count

In [25]:
def good_turing_smooting(ngram_matrix, ngrams, unique_ngrams):
    gt_smooth = np.zeros((len(unique_ngrams), len(unique_ngrams)))
    sparse_matrix = csc_matrix(ngram_matrix)
    count_one = count_element_matrix(sparse_matrix, 1)
    # calculate good turing smoothing
    for i in range(len(ngram_matrix)):
        for j in range(len(ngram_matrix[i])):
            if(ngram_matrix[i][j] == 0):
                gt_smooth[i][j] = count_one / len(ngrams)
            else:
                gt_smooth[i][j] = (ngram_matrix[i][j]+1) * \
                    count_element_matrix(sparse_matrix, ngram_matrix[i][j]+1) / \
                    count_element_matrix(sparse_matrix, ngram_matrix[i][j])
    return gt_smooth

In [26]:
def generate_bigram_matrix(unique_bigrams, bigrams):
    bigram_matrix = np.zeros((len(unique_bigrams), len(unique_bigrams)))
    for i in range(len(bigrams)-1):
        bigram_matrix[unique_bigrams.index(
            bigrams[i])][unique_bigrams.index(bigrams[i+1])] += 1
    gt_smooth = good_turing_smooting(bigram_matrix, bigrams, unique_bigrams)
    return gt_smooth

In [27]:
def generate_towgram_matrix(unique_towgrams, towgrams, unique_bigrams):
    towgram_matrix = np.zeros((len(unique_towgrams), len(unique_bigrams)))
    for i in range(len(towgrams)-1):
        towgram_matrix[unique_towgrams.index(
            towgrams[i])][unique_bigrams.index(parse_string_two(towgrams[i+1]))] += 1

    gt_smooth = good_turing_smooting(towgram_matrix, towgrams, unique_towgrams)
    return gt_smooth

In [28]:
def generate_threegram_matrix(unique_threegrams, threegrams, unique_bigrams):
    threegram_matrix = np.zeros((len(unique_threegrams), len(unique_bigrams)))
    for i in range(len(threegrams)-1):
        threegram_matrix[unique_threegrams.index(
            threegrams[i])][unique_bigrams.index(parse_string_three(threegrams[i+1]))] += 1

    gt_smoothing = good_turing_smooting(
        threegram_matrix, threegrams, unique_threegrams)
    return gt_smoothing

In [30]:
filename = input("Enter the file name: ")

#  Reads file
sample = open(filename, mode="r", encoding="utf-8")
s = sample.read()

# Replaces escape character with space
f = s.replace("\n", " ")

In [12]:
#training text for 1-gram
uni_sentences = []
cores = multiprocessing.cpu_count()

# iterate through each sentence in the file
for i in sent_tokenize(f):
    temp = []

    # tokenize the sentence into words
    for j in word_tokenize(i):
        parsed_words = parse_syllable(j.lower())
        unigram = generate_ngrams(parsed_words, 1)
        for i in unigram:
            temp.append(i)

    uni_sentences.append(temp)
    

unigram_w2v_model = Word2Vec(min_count=20,
                 window=2,
                 sample=6e-5, 
                 alpha=0.03, 
                 min_alpha=0.0007, 
                 negative=20,
                 workers=cores-1)
unigram_w2v_model.build_vocab(uni_sentences, progress_per=10000)

unigram_w2v_model.train(uni_sentences, total_examples=unigram_w2v_model.corpus_count, epochs=30, report_delay=1)   

unigram_w2v_model.init_sims(replace=True)   
unigram_w2v_model.wv.save_word2vec_format('model1.txt', binary=False)

In [13]:
#training text for 2-gram
two_sentences = []
cores = multiprocessing.cpu_count()

# iterate through each sentence in the file
for i in sent_tokenize(f):
    temp = []

    # tokenize the sentence into words
    for j in word_tokenize(i):
        parsed_words = parse_syllable(j.lower())
        twogram = generate_ngrams(parsed_words, 2)
        for i in twogram:
            temp.append(i)

    two_sentences.append(temp)

twogram_w2v_model = Word2Vec(min_count=20,
                 window=2,
                 sample=6e-5, 
                 alpha=0.03, 
                 min_alpha=0.0007, 
                 negative=20,
                 workers=cores-1)
twogram_w2v_model.build_vocab(two_sentences, progress_per=10000)

twogram_w2v_model.train(two_sentences, total_examples=twogram_w2v_model.corpus_count, epochs=30, report_delay=1)   

twogram_w2v_model.init_sims(replace=True)   
twogram_w2v_model.wv.save_word2vec_format('model2.txt', binary=False) 

In [14]:
#training text for 3-gram
three_sentences = []
cores = multiprocessing.cpu_count()

# iterate through each sentence in the file
for i in sent_tokenize(f):
    temp = []

    # tokenize the sentence into words
    for j in word_tokenize(i):
        parsed_words = parse_syllable(j.lower())
        threegram = generate_ngrams(parsed_words, 3)
        for i in threegram:
            temp.append(i)

    three_sentences.append(temp)
    
threegram_w2v_model = Word2Vec(min_count=20,
                 window=2,
                 sample=6e-5, 
                 alpha=0.03, 
                 min_alpha=0.0007, 
                 negative=20,
                 workers=cores-1)
threegram_w2v_model.build_vocab(three_sentences, progress_per=10000)

threegram_w2v_model.train(three_sentences, total_examples=threegram_w2v_model.corpus_count, epochs=30, report_delay=1)   

threegram_w2v_model.init_sims(replace=True)
threegram_w2v_model.wv.save_word2vec_format('model3.txt', binary=False)


In [15]:
def test_most_similar_unigram():
    print("Most similar 1-gram words of 'ri': ")
    print(unigram_w2v_model.wv.most_similar(positive=["ri"]))
def  test_most_similar_twogram():
    print("Most similar 2-gram words of 'le ri': ")
    print(twogram_w2v_model.wv.most_similar(positive=["le ri"]))
def  test_most_similar_threegram():
    print("Most similar 3-gram words of 'le ri ni': ")
    print(threegram_w2v_model.wv.most_similar(positive=["le ri ni"])) 

In [16]:
def test_morf_analogy_unigram():
    print("Similarity between 'ri' and 'rin' for 1-gram words: ")
    print(unigram_w2v_model.wv.similarity("le", 'len'))
    print("Similarity between 'ni' and 'nin' for 1-gram words: ")
    print(unigram_w2v_model.wv.similarity("la", 'lan'))
def  test_morf_analogy_twogram():
    print("Similarity between 'le ri' and 'le rin' for 2-gram words: ")
    print(twogram_w2v_model.wv.similarity("le ri", 'le rin'))
    print("Similarity between 'la ri' and 'la rin' for 2-gram words: ")
    print(twogram_w2v_model.wv.similarity("la ri", 'la rin'))
def  test_morf_analogy_threegram():
    print("Similarity between 'le ri ni' and 'le ri nin' for 3-gram words: ")
    print(threegram_w2v_model.wv.similarity("le ri ni", 'le ri nin'))
    print("Similarity between 'la ri ni' and 'la ri nin' for 3-gram words: ")
    print(threegram_w2v_model.wv.similarity("la ri ni", 'la ri nin')) 

In [17]:
#call tests of most similar
test_most_similar_unigram()
print("-------------------")
test_most_similar_twogram()
print("-------------------")
test_most_similar_threegram()
print("-------------------")

Most similar 1-gram words of 'ri': 
[('ve', 0.5166339874267578), ('rin', 0.5153515338897705), ('riy', 0.4740566611289978), ('nin', 0.4531644284725189), ('bir', 0.42421960830688477), ('le', 0.40507903695106506), ('ler', 0.38981080055236816), ('ni', 0.36597421765327454), ('re', 0.3542775511741638), ('ye', 0.3479315936565399)]
-------------------
Most similar 2-gram words of 'le ri': 
[('le rin', 0.7796775698661804), ('le re', 0.7353426814079285), ('ri ni', 0.7129860520362854), ('ri ne', 0.6914221048355103), ('ri nin', 0.667730450630188), ('le riy', 0.6645482182502747), ('rin den', 0.6493914127349854), ('rin de', 0.595539927482605), ('riy le', 0.5305780172348022), ('me le', 0.502592921257019)]
-------------------
Most similar 3-gram words of 'le ri ni': 
[('le ri nin', 0.673933744430542), ('le ri ne', 0.6614874601364136), ('le ri dir', 0.564606785774231), ('le riy le', 0.5148169994354248), ('et me le', 0.4556577503681183), ('nim le ri', 0.4555213451385498), ('de yis le', 0.430831581354141

In [18]:
#call tests of morphlogy analogy
test_morf_analogy_unigram()
print("-------------------")
test_morf_analogy_twogram()
print("-------------------")
test_morf_analogy_threegram()
print("-------------------")

Similarity between 'ri' and 'rin' for 1-gram words: 
0.46499738
Similarity between 'ni' and 'nin' for 1-gram words: 
0.5558649
-------------------
Similarity between 'le ri' and 'le rin' for 2-gram words: 
0.7796776
Similarity between 'la ri' and 'la rin' for 2-gram words: 
0.79042023
-------------------
Similarity between 'le ri ni' and 'le ri nin' for 3-gram words: 
0.6739337
Similarity between 'la ri ni' and 'la ri nin' for 3-gram words: 
0.6962771
-------------------
