# Imports

In [None]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from gensim.models import Word2Vec

In [None]:
import numpy as np

In [None]:
import string
import sys
import io 
import nltk
import os

#remove stopwords
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords 
#fetch the NLTK stopword lists
nltk.download('stopwords')
#set of the English stopwords; it is built before the archaic forms below are added, so it does not contain them
stops = set(stopwords.words('english'))
#NOTE: from this line on the name `stopwords` is a plain list and no longer the corpus reader imported above
stopwords = nltk.corpus.stopwords.words('english')
#add to the stopwords list some archaic forms of pronouns that may be present in some translations
archaic = ['thou', 'thee', 'ye', 'thy', 'thine']
stopwords.extend(archaic)

#resource needed to divide text into sentences (splitting on full stops)
nltk.download('punkt')

#tokenize strings in words
from nltk.tokenize import word_tokenize 

#lemmatization
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
#NLTK resources for tagging and lemmatization
#(presumably the tagger model backs nltk.pos_tag and the WordNet data backs the lemmatizer - confirm)
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

#single lemmatizer instance, created once at module level
lemmatizer = WordNetLemmatizer()

#translate the part-of-speech tag of a word into the constant passed to the lemmatizer
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tag nltk assigns to `word`.

    The word is tagged on its own with nltk.pos_tag and only the upper-cased
    first letter of that tag is inspected: J, N, V and R select the
    adjective, noun, verb and adverb constants. Any other letter falls back
    to wordnet.NOUN.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped letter resolve to the noun constant
    return wordnet.NOUN

#count unique words
from collections import Counter

#explore results
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
from google.colab import drive
#mount Google Drive at /content/drive; the corpus folders read below are addressed under /content/drive/MyDrive
drive.mount('/content/drive')

Mounted at /content/drive


# Introduction
We compare the corpora obtained with and without lemmatization for each religion. We focus on the number of total and unique words obtained, we print some random sentences and we look at the 10 most frequent words. 

Additionally, we perform an indicative analysis of the stability of the models trained on the lemmatized and non lemmatized corpora:

1) we create a list x of 5 manually selected words;

2) for each religion, we train 5 word2vec models on the lemmatized corpora;

3) for each model, we obtain the 5 most similar words to each of the words in the list x and we add them to a list y;

4) we count the number of unique words in list y;

5) we perform the same operations from point 2 but with non lemmatized corpora.

For each religion, we compare the values of unique words obtained for lemmatized and non lemmatized corpora. These values are general indicators of the stability of the models because they describe how many words change from one iteration to another in the list of the 5 most similar words to a given one. Ideally they should be equal to 25, which would mean that the 5 most similar words to each of the 5 given words do not change across the 5 models.


# Functions 

In [1]:
def getsentences(file_name, suffix='_nl.txt'):
  """Load tokenized sentences from a text file holding one printed list per line.

  Each line is expected to look like ``['lord', 'say', 'king']``. The
  surrounding brackets, every single quote and every space are removed and
  the remainder is split on commas.

  Args:
    file_name: path of the file without its suffix.
    suffix: text appended to `file_name` to build the path that is opened.
      Defaults to '_nl.txt', the suffix this function has always used.

  Returns:
    A list with one list of tokens per line of the file. A line without
    tokens (blank, or ``[]``) gives an empty list.

  Raises:
    OSError: if the file cannot be opened.
  """
  # NOTE(review): some callers in this notebook already append '_nl' to the
  # name they pass in, which yields '<name>_nl_nl.txt' with the default
  # suffix - confirm against the file names on disk.
  sentences = []

  # explicit encoding: the corpora contain non-ASCII tokens
  with open(file_name + suffix, 'r', encoding='utf-8') as fp:
      for line in fp:
          # drop the line ending first so the bracket handling does not
          # depend on whether the last line ends with a newline
          body = line.strip()
          if body.startswith('['):
              body = body[1:]
          if body.endswith(']'):
              body = body[:-1]

          body = body.replace('\'', '').replace(' ', '')

          # splitting an empty string yields [''], so empty tokens are skipped
          sentences.append([token for token in body.split(',') if token])

  return sentences

In [None]:
def checkresults(sentences, n_samples=25, n_top=10):
  """Print summary statistics and a random sample of a tokenized corpus.

  Prints, in this order: the total number of words, the number of unique
  words, their ratio rounded to 2 decimals, the most common words with
  their counts, and a random selection of sentences.

  Args:
    sentences: sequence of sentences, each a sequence of word strings.
    n_samples: maximum number of random sentences to print. Fewer are
      printed when the corpus holds fewer sentences.
    n_top: number of most common words to print.

  Returns:
    None. All results are written to standard output.
  """
  #flatten the corpus into a single list of words
  words = [word for sentence in sentences for word in sentence]

  #number of words
  print('N.words: ' + str(len(words)))

  #number of unique words
  word_count = Counter(words)
  print('N.unique words: '+ str(len(word_count)))

  #a corpus without words has no unique words: report 0.0 instead of dividing by zero
  fract = round(len(words) / len(word_count), 2) if word_count else 0.0
  print('N.words / N.unique words: ' + str(fract))

  #most common words
  print('Most common words:')
  print(word_count.most_common(n_top))

  #print some sentences
  print('Random sentences:')
  #random.sample raises ValueError when asked for more items than available
  sample_size = min(n_samples, len(sentences))
  for i in random.sample(range(0, len(sentences)), sample_size):
    print(sentences[i])

In [None]:
def compare_lem(filename):
  """Print the corpus statistics of the lemmatized and non lemmatized variants.

  For each variant a banner is printed, the working directory is switched to
  the folder holding that variant, the sentences are loaded with
  getsentences and summarized with checkresults. The non lemmatized variant
  is requested under `filename` followed by '_nl'.

  The working directory of the process is changed as a side effect and is
  left pointing at the last folder.
  """
  variants = (
      ('***WITH LEMMATIZATION***',
       '/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/Sentences_W2V',
       ''),
      ('***WITHOUT LEMMATIZATION***',
       '/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/Sentences_nl',
       '_nl'),
  )

  for banner, folder, name_suffix in variants:
    print(banner)
    os.chdir(folder)
    checkresults(getsentences(filename + name_suffix))

In [None]:
def get5words(sentences):
  """Train 5 Word2Vec models and count the distinct neighbours of 5 probe words.

  Each of the 5 runs trains a fresh model on `sentences` and collects, for
  every probe word, the first 5 entries returned by most_similar. The names
  of all collected neighbours (5 runs x 5 probes x 5 neighbours) are pooled.

  Returns:
    The number of distinct words in the pooled list.
  """
  probe_words = ['woman', 'god', 'violence', 'fire', 'death']#hell for christ and isl
  neighbours = []

  for _ in range(5):
    # NOTE(review): `size` and `iter` are the keyword names used here; other
    # gensim major versions may expect different names - confirm against the
    # installed version.
    w2v = Word2Vec(
        sentences = sentences,
        min_count=10, #ignore words with absolute frequency <10
        size=300, #vector size
        sg = 1, #skipgram algorithm
        hs = 0,
        negative = 5, #negative sampling with 5 noise words
        workers = 5, #faster process
        iter = 6 #6 iterations
    )
    for probe in probe_words:
      nearest = w2v.wv.most_similar(probe)[:5]
      #each entry is a tuple whose first item is the neighbouring word
      neighbours.extend(hit[0] for hit in nearest)

  return len(np.unique(neighbours))

In [None]:
def compare_stability(filename):
  """Print the get5words count for the lemmatized and non lemmatized corpora.

  The working directory is switched to the folder of each variant before its
  sentences are loaded with getsentences; the non lemmatized variant is
  requested under `filename` followed by '_nl'. The working directory of the
  process is left pointing at the last folder.
  """
  lemmatized_dir = '/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/Sentences_W2V'
  plain_dir = '/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/Sentences_nl'

  os.chdir(lemmatized_dir)
  n_lemmatized = get5words(getsentences(filename))
  print('***WITH LEMMATIZATION***, n.unique words:', n_lemmatized)

  os.chdir(plain_dir)
  n_plain = get5words(getsentences(filename + '_nl'))
  print('***WITHOUT LEMMATIZATION***, n.unique words:', n_plain)

# Comparison 

## Bible

In [None]:
compare_lem('Christian_sentences')

***WITH LEMMATIZATION***
N.words: 998432
N.unique words: 14544
N.words / N.unique words: 68.65
Most common words:
[('say', 18547), ('shall', 17608), ('lord', 17167), ('god', 13274), ('come', 11181), ('son', 10162), ('go', 9781), ('unto', 8972), ('king', 8540), ('man', 7237)]
Random sentences:
['jehoshaphat', 'also', 'say', 'king', 'israel', 'first', 'seek', 'counsel', 'lord']
['egyptian', 'mourn', 'seventy', 'day']
['believe']
['thus', 'paul', 'go', 'among']
['binding', 'foal', 'vine', 'donkey', 'colt', 'choice', 'vine', 'wash', 'garment', 'wine', 'robe', 'blood', 'grape']
['priest', 'answer', 'unclean']
['zebah', 'zalmunna', 'karkor', 'host', 'fifteen', 'thousand', 'men', 'left', 'host', 'child', 'east', 'fell', 'hundred', 'twenty', 'thousand', 'men', 'drew', 'sword']
['son', 'merari', 'give', 'lot', 'accord', 'family', 'tribe', 'reuben', 'tribe', 'gad', 'tribe', 'zebulun', 'twelve', 'city']
['lord', 'hand', 'lift', 'see', 'shall', 'see', 'ashamed', 'envy', 'people', 'yea', 'fire', 'e

In [None]:
compare_stability('Christian_sentences')

***WITH LEMMATIZATION***
***WITH LEMMATIZATION***, n.unique words: 45
***WITHOUT LEMMATIZATION***
***WITHOUT LEMMATIZATION***, n.unique words: 45


## Quran

In [None]:
compare_lem('Islam_sentences')

***WITH LEMMATIZATION***
N.words: 784801
N.unique words: 16763
N.words / N.unique words: 46.82
Most common words:
[('allah', 18177), ('say', 14317), ('god', 11049), ('lord', 8197), ('people', 6437), ('one', 5865), ('give', 5610), ('make', 5609), ('day', 5174), ('come', 5068)]
Random sentences:
['thing', 'overflow', 'right', 'throat', 'choke']
['chosen', 'impose', 'difficulty', 'religion', 'cult', 'father', 'abraham']
['mankind']
['betray', 'allah', 'messenger', 'betray', 'knowingly', 'amdndt', 'thing', 'entrust', 'duty', 'allah', 'ordain']
['detailed', 'verse', 'people', 'remember']
['invisible', 'heaven', 'earth']
['whereof', 'use', 'doubt']
['allah', 'say', 'strengthen', 'arm', 'brother', 'give', 'power', 'shall', 'able', 'harm', 'aydt', 'proof', 'evidence', 'verse', 'lesson', 'sign', 'revelation', 'etc']
['low', 'raise', 'high']
['chosen', 'deity', 'earth']
['point']
['circular', 'motion', 'earthquake']
['god']
['preordainments', 'everything', 'bless', 'wretched', 'show', 'mankind',

In [None]:
compare_stability('Islam_sentences')

***WITH LEMMATIZATION***, n.unique words: 41
***WITHOUT LEMMATIZATION***, n.unique words: 46


## Hinduism

In [None]:
compare_lem('Hinduism_sentences')

***WITH LEMMATIZATION***
N.words: 687793
N.unique words: 15229
N.words / N.unique words: 45.16
Most common words:
[('indra', 9161), ('god', 8586), ('agni', 7747), ('may', 7321), ('one', 5446), ('sacrifice', 5307), ('come', 5256), ('soma', 5184), ('verily', 5121), ('make', 4214)]
Random sentences:
['pas', 'asvins', 'pas', 'away', 'beyond', 'tribe', 'selfish', 'men', 'wonderful', 'golden', 'path', 'gracious', 'bringers', 'flood']
['sing', 'glory', 'indra', 'say', 'solemn', 'eulogy']
['may', 'flow', 'give', 'wealth', 'thousand', 'heroic', 'power', 'godlike', 'soma', 'drop', 'effuse']
['flow', 'way', 'active']
['among', 'skilful', 'god', 'skilled', 'make', 'two', 'bring', 'prosperity', 'great', 'wisdom', 'measure', 'region', 'stablished', 'pillar', 'shall', 'decay']
['maintain', 'growth', 'wealth', 'men', 'sacrifice']
['language', 'express', 'must', 'finite', 'since', 'finite']
['three', 'veda']
['may', 'earn', 'glory', 'good', 'action']
['art', 'dear', 'friend', 'mighty', 'morning', 'shin

In [None]:
compare_stability('Hinduism_sentences')

***WITH LEMMATIZATION***, n.unique words: 48
***WITHOUT LEMMATIZATION***, n.unique words: 44


## Buddhism

In [None]:
compare_lem('Buddhism_sentences')

***WITH LEMMATIZATION***
N.words: 403186
N.unique words: 15349
N.words / N.unique words: 26.27
Most common words:
[('one', 5409), ('buddha', 3496), ('see', 3066), ('say', 2604), ('people', 2217), ('also', 1930), ('time', 1747), ('way', 1673), ('life', 1662), ('world', 1658)]
Random sentences:
['dharma', 'teacher', 'hui', 'yuan', 'response', 'great', 'way', 'profound', 'principle', 'abstruse']
['hearing', 'exhortation', 'people', 'world', 'decide', 'attain', 'fourth', 'meditative', 'state', 'absorption', 'body', 'dissolve', 'life', 'come', 'end', 'born', 'bṛhatphala', 'heaven']
['also', 'refer', 'text', 'rocana']
['forty', 'chapter', 'briefly', 'discuss', 'important', 'point']
['person', 'great', 'bright', 'wisdom', 'gradually', 'advance', 'discriminate', 'cognition', 'illuminates', 'immeasurably', 'immeasurably']
['reverence', 'also', 'well', 'train', 'reading', 'physiognomy', 'magnanimous', 'personality', 'divine', 'good', 'bad', 'omen', 'conduct', 'ceremonial', 'proceeding', 'sacriﬁc

In [None]:
compare_stability('Buddhism_sentences')

***WITH LEMMATIZATION***, n.unique words: 39
***WITHOUT LEMMATIZATION***, n.unique words: 44


# Conclusion
We have higher values of total words for the non lemmatized corpora; this could be due to non lemmatized forms of stop words that have not been eliminated.
As expected we also have higher values of unique words for the non lemmatized corpora since two non lemmatized forms of the same lemma are considered as different words. We have lower values of the ratio total words/unique words for the non lemmatized corpora.

Lemmatization increases the frequency of some commonly used verbs like 'come' or 'say', because all the forms of the verb are brought to the base form. By removing lemmatization, some words that are more interesting for our analysis than these verbs appear among the top 10 most common words. As an example, in the Bible corpus we can see that without lemmatization 'israel' and 'yahweh' are among the 10 most common words, in place of two verbs like 'come' and 'go'. A higher frequency percentile increases the reliability of a word (Hellrich & Hahn, 2016), and we are more interested in increasing the reliability of words like 'yahweh' than of common verbs.

Regarding the stability analysis: for the Bible the number of unique words is the same for the lemmatized and non lemmatized corpora; for the Quran we have 5 more unique words with the non lemmatized corpora (46 vs 41); for the Vedas and Upanishads we have 4 more unique words with the lemmatized corpora; for the Tripitaka we have 5 more unique words with the non lemmatized corpora. We can't say that one choice clearly outperforms the other.

Given these considerations, we decide to use the non lemmatized corpora for our analysis.