In [2]:
# Create five word2vec embedding spaces (with different random seeds) for each language of the quran.

from collections import Counter
from gensim.models import word2vec
import sys
import pickle
import string

# SET THESE VARIABLES

# Location where the Quran text is stored
quran_path='/Users/nehakardam/Project-CSE517/Quran/'

# Location where output embedding spaces will be stored
# Files will be stored in the format {output_path}{language}_{word2vec_seed}.pkl, where the pickle file is a pickled gensim word2vec model
output_path = quran_path

# List of word2vec seeds to use (can adjust if needed, or leave the same)
seeds = [2518,2548,2590,29,401]

# The following is a list of Quran translations that can be used to create embedding spaces for (can adjust if needed, or leave the same)
# (The next line of code will remove the .txt ending from each language, so this list assumes that each language will have the .txt ending)
languages=['hindi.txt', 'english.txt','bulgarian.txt', 'arabic.txt']
languages=[i[:-4] for i in languages] #remove .txt

# Create embeddings for each language
for language in languages:
    print(language)

    # Create embeddings for each seed for that language
    for seed in seeds:
        print(seed)

        # Read in quran text
        with open(quran_path+language+'.txt','r') as text_file:
            sentences = text_file.readlines()
            sentences = [i[:-1].split(' ') for i in sentences]

        # Create word2vec embedding space
        model = word2vec.Word2Vec(sentences,window=5,min_count=5,seed=seed)

        # Save model for future use
        with open(output_path+language+'_'+str(seed)+'.pkl', 'wb') as pickle_file:
            pickle.dump(model,pickle_file)


hindi
2518
2548
2590
29
401
english
2518
2548
2590
29
401
bulgarian
2518
2548
2590
29
401
arabic
2518
2548
2590
29
401


# Below section precalculate the five nearest neighbors

In [3]:

# Precalculate the five nearest neighbors for every word for every language in the Quran.

import faiss
import time
import tables as tb
import pickle
from sklearn.neighbors import BallTree
import numpy as np
from sklearn.preprocessing import normalize
from tqdm import tqdm,trange
import sys
import pandas as pd

# SET THESE VARIABLES

# Files should be stored in the format {quran_path}{language}_{word2vec_seed}.pkl, where the pickle file is a pickled gensim word2vec model
quran_path = '/Users/nehakardam/Project-CSE517/Quran/'

# Location where output nearest neighbors will be stored
# Files will be stored in the format {output_path}{language}_{word2vec_seed}.pkl, where the pickle file is a dictionary where the keys are words and the values are lists of ten nearest neighbors for each word.
output_path=quran_path+'nearestNeighbors/'

# List of word2vec seeds to use (can adjust if needed, or leave the same)
seeds = [2518,2548,2590,29,401]

# List of quran translations (can adjust if needed, or leave the same)
# (The next line of code will remove the .txt ending from each language, so this list assumes that each language will have the .txt ending)
languages=['english.txt', 'arabic.txt','hindi.txt', 'bulgarian.txt']
languages=[i[:-4] for i in languages] #remove .txt

# Precalculate nearest neighbors for each language
for language in languages:
    print(language)

    # Precalculate nearest neighbors for each seed for that language
    for seed in seeds:
        print(seed)

       # Read in embedding space model
        print('Load model...')
        with open(quran_path+language+'_'+str(seed)+'.pkl','rb') as pickleFile:
            model = pickle.load(pickleFile)
        embedding_words = list(model.wv.index_to_key)
        embeddings = [model.wv[word] for word in embedding_words]

        xb = np.array([[float(j) for j in i[1:]] for i in embeddings],dtype='float32') #database

        print('Normalizing vectors')
        xb = normalize(xb)
    # 		for i in trange(len(xb)):
    # 			xb[i] = normalize(xb[i].reshape(-1, 1))

        d = xb.shape[1] #dimension
        nb = xb.shape[0] #database size
        nq = len(embedding_words) #num queries
        print('d',d)
        print('nb',nb)
        print('nq',nq)

        print('Creating query matrix...')
        xq = xb[[i for i in range(len(embedding_words))],:]
        print(xq.shape)

        print('Building index...')
        faiss_index = faiss.IndexFlatL2(d)
        faiss_index.add(xb) 

        k = 11 #number of nearest neighbors

        print('Calculating nearest neighbors...')
        D, I = faiss_index.search(xq, k)

        nearestNeighbors = {}
        print('Recording nearest neighbors...')
        for i in tqdm(range(len(embedding_words))):
            word = embedding_words[i]
            nearestNeighbors[word] = [embedding_words[j] for j in I[i]][1:]

        #Save final
        print('Saving nearest neighbors...')
        with open(output_path+language+'_'+str(seed)+'.pkl','wb') as pickleFile:
            pickle.dump(nearestNeighbors,pickleFile)


english
2518
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 18596.03it/s]


Saving nearest neighbors...
2548
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|███████████████████████████████████████| 31/31 [00:00<00:00, 102300.10it/s]


Saving nearest neighbors...
2590
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 91759.65it/s]


Saving nearest neighbors...
29
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|███████████████████████████████████████| 31/31 [00:00<00:00, 111512.37it/s]


Saving nearest neighbors...
401
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 58385.01it/s]


Saving nearest neighbors...
arabic
2518
Load model...
Normalizing vectors
d 99
nb 34
nq 34
Creating query matrix...
(34, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 34/34 [00:00<00:00, 64121.55it/s]


Saving nearest neighbors...
2548
Load model...
Normalizing vectors
d 99
nb 34
nq 34
Creating query matrix...
(34, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 34/34 [00:00<00:00, 71589.53it/s]


Saving nearest neighbors...
2590
Load model...
Normalizing vectors
d 99
nb 34
nq 34
Creating query matrix...
(34, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 34/34 [00:00<00:00, 95773.23it/s]


Saving nearest neighbors...
29
Load model...
Normalizing vectors
d 99
nb 34
nq 34
Creating query matrix...
(34, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|███████████████████████████████████████| 34/34 [00:00<00:00, 109697.18it/s]


Saving nearest neighbors...
401
Load model...
Normalizing vectors
d 99
nb 34
nq 34
Creating query matrix...
(34, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 34/34 [00:00<00:00, 98349.20it/s]


Saving nearest neighbors...
hindi
2518
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 99634.81it/s]


Saving nearest neighbors...
2548
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|███████████████████████████████████████| 31/31 [00:00<00:00, 113557.58it/s]


Saving nearest neighbors...
2590
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 96313.65it/s]

Saving nearest neighbors...





29
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|███████████████████████████████████████| 31/31 [00:00<00:00, 105796.11it/s]


Saving nearest neighbors...
401
Load model...
Normalizing vectors
d 99
nb 31
nq 31
Creating query matrix...
(31, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|███████████████████████████████████████| 31/31 [00:00<00:00, 106839.30it/s]


Saving nearest neighbors...
bulgarian
2518
Load model...
Normalizing vectors
d 99
nb 23
nq 23
Creating query matrix...
(23, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 23/23 [00:00<00:00, 83019.79it/s]


Saving nearest neighbors...
2548
Load model...
Normalizing vectors
d 99
nb 23
nq 23
Creating query matrix...
(23, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 23/23 [00:00<00:00, 89822.15it/s]


Saving nearest neighbors...
2590
Load model...
Normalizing vectors
d 99
nb 23
nq 23
Creating query matrix...
(23, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 23/23 [00:00<00:00, 95137.07it/s]


Saving nearest neighbors...
29
Load model...
Normalizing vectors
d 99
nb 23
nq 23
Creating query matrix...
(23, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 23/23 [00:00<00:00, 94949.80it/s]


Saving nearest neighbors...
401
Load model...
Normalizing vectors
d 99
nb 23
nq 23
Creating query matrix...
(23, 99)
Building index...
Calculating nearest neighbors...
Recording nearest neighbors...


100%|████████████████████████████████████████| 23/23 [00:00<00:00, 77923.26it/s]

Saving nearest neighbors...





# Stability for each language in the Quran

In [5]:
# Calculate stability for each language in the Quran.

import numpy as np
from sklearn.neighbors import BallTree
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import sys
from tqdm import tqdm,trange
import pandas as pd

# SET THESE VARIABLES

# Files should be stored in the format {quran_path}{language}_{word2vec_seed}.pkl, where the pickle file is a dictionary where the keys are words and the values are lists of ten nearest neighbors for each word.
quran_path = '/Users/nehakardam/Project-CSE517/Quran/nearestNeighbors/'

# Location where output stability will be stored
# Files will be stored in the format {output_path}{language}.csv, where the csv file has columns "word" and "stability", and the stability value is recorded for each word
output_path = '/Users/nehakardam/Project-CSE517/Quran/stability/'

# List of word2vec seeds to use (can adjust if needed, or leave the same)
seeds = [2518,2548,2590,29,401]

# List of quran translations (can adjust if needed, or leave the same)
# (The next line of code will remove the .txt ending from each language, so this list assumes that each language will have the .txt ending)
    
languages=['english.txt', 'arabic.txt','hindi.txt', 'bulgarian.txt']
languages=[i[:-4] for i in languages] #remove .txt

# Calculates the stability of a word in two sets of embedding spaces
# Assumes that you've already calculated the most similar words for the word
#
# @param word
#    The word to calculate stability for
# @param similar1
#    The list of nearest neighbors to word in the first set of embedding spaces
#    len(similar1) = # of embedding spaces in the first set
#    For each i, len(similar1[i]) = # of nearest neighbors to consider (same for each i)
# @param similar2
#    The list of nearest neighbors to word in the second set of embedding spaces
# @param same
#    Are the two lists of embedding spaces the same? (default = False)
#
# @returns a float, the average stability of the word across the two sets of spaces
#
def stability(word,similar1,similar2,same=False):
    if same and len(similar1) == 1:
        return len(similar1[0])
    
    sets1 = [set(a) for a in similar1]
    if not same:
        sets2 = [set(b) for b in similar2]
    else:
        sets2 = sets1
    
    avgOverlap = 0
    for i in range(len(similar1)):
        for j in range(len(similar2)):
            if not same or (same and i!=j):
                avgOverlap += len(sets1[i] & sets2[j])

    if same:
        avgOverlap /= (len(similar1)*len(similar2)-len(similar1))
    else:
        avgOverlap /= (len(similar1)*len(similar2))
    return avgOverlap

# Calculate stability for each language
for language in languages:
    print(language)

    print('Reading ten nearest neighbors...')
    nearest_neighbors = []
    words = set()
    for seed in seeds:
        print(seed)
        with open(quran_path+language+'_'+str(seed)+'.pkl','rb') as pickleFile:
            nearest_neighbors.append(pickle.load(pickleFile))
            _words = set(nearest_neighbors[-1].keys())
            if len(words)==0:
                words = _words
            else:
                words = words.intersection(_words)
    words = list(words)

    print('Calculating stabilities...')
    stabilities = []
    for word in tqdm(words):
        most_similar = []
        for i in range(5):
            most_similar.append(nearest_neighbors[i][word])
        stabilities.append(stability(word,most_similar,most_similar,True))

    print('Writing output file...')
    df = pd.DataFrame(data={'word':words,'stability':stabilities})
    df.to_csv(output_path+language+'.csv')


english
Reading ten nearest neighbors...
2518
2548
2590
29
401
Calculating stabilities...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 26908.82it/s]


Writing output file...
arabic
Reading ten nearest neighbors...
2518
2548
2590
29
401
Calculating stabilities...


100%|████████████████████████████████████████| 34/34 [00:00<00:00, 32783.07it/s]


Writing output file...
hindi
Reading ten nearest neighbors...
2518
2548
2590
29
401
Calculating stabilities...


100%|████████████████████████████████████████| 31/31 [00:00<00:00, 26513.75it/s]


Writing output file...
bulgarian
Reading ten nearest neighbors...
2518
2548
2590
29
401
Calculating stabilities...


100%|████████████████████████████████████████| 23/23 [00:00<00:00, 26143.36it/s]

Writing output file...



