# Imports

In [None]:
import string
import sys
import io 
import nltk

#stop-word removal setup
# NOTE(review): `remove_stopwords` (gensim) is not referenced by any cell visible in this notebook.
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords 
nltk.download('stopwords')
# NOTE(review): `stops` is not referenced by any cell visible in this notebook.
stops = set(stopwords.words('english'))
# From here on the name `stopwords` no longer refers to the imported corpus reader:
# it is rebound to a plain list of English stop words, which the preprocessing
# functions use for their membership tests.
stopwords = nltk.corpus.stopwords.words('english')
#add to the stopwords list some archaic forms of pronouns that may be present in some translations
archaic = ['thou', 'thee', 'ye', 'thy', 'thine']
stopwords.extend(archaic)
  
#divide text in sentences based on points
nltk.download('punkt')

#tokenize strings in words
from nltk.tokenize import word_tokenize 

#lemmatization
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

#convert position tag in a form suitable for the lemmatizer
def get_wordnet_pos(word):
    """Return the WordNet POS constant that lemmatize() should use for *word*.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is considered: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun.
    return wordnet.NOUN

#count unique words
from collections import Counter

#explore results
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
#os.chdir('/content/drive/MyDrive/Magistrale/Secondo semestre/DS/Progetto/HolyText_Corpora')

# Introduction
To increase the size of the training corpora, we use several translations of each holy book considered (Saeed et al., 2020). We choose how many translations to use for each holy book based on the availability of free copies online, with the goal of obtaining similar numbers of total and unique words for each corpus.

We preprocess the collected text with the following steps:
- convert uppercase to lowercase;
- remove tokens that are not purely alphabetic (numbers, symbols, mixed tokens);
- remove punctuation;
- remove stop words.

Since we are not sure whether lemmatization is a good idea, we save two preprocessed versions of each corpus, one with lemmatization and one without. We will decide later which of the two to use.

After the preprocessing, the dimensions of the corpora are as follows:
- with lemmatization

| Religion      | Holy text | N. of translations | N.words | N.unique words |
| ----------- | ----------- | ------------------ | ------- | -------------- |
| Christianity| Bible       | 3 | 998432 | 14544
| Islam   | Quran        |   8  | 784578 | 16735
| Hinduism | Vedas and Upanishads | 2 and 2 | 687782 | 15227
| Buddhism | Tripitaka | 8 books of the Tripitaka | 403179 | 15347

- without lemmatization:

| Religion      | Holy text | N. of translations | N.words | N.unique words |
| ----------- | ----------- | ------------------ | ------- | -------------- |
| Christianity| Bible       | 3 | 1004875 | 19157
| Islam   | Quran        |   8  | 791238 | 22069
| Hinduism | Vedas and Upanishads | 2 and 2 | 697742 | 18874
| Buddhism | Tripitaka | 8 books of the Tripitaka | 404717 | 19699








# Functions 

In [None]:
def preprocessing(file_name):
    """Read a text file and return its sentences as lists of cleaned lemmas.

    Steps: drop whitespace-only lines, split the text into sentences, discard
    the first and last 100 sentences, lowercase, keep only purely alphabetic
    tokens, lemmatize each token, then drop stop words and one-character
    words. Sentences that end up empty are omitted.

    Args:
        file_name: Path of the text file, read as UTF-8 (a leading BOM is
            tolerated by the 'utf-8-sig' codec).

    Returns:
        A list of sentences, each a non-empty list of lemma strings. Because
        of the 100-sentence trim at both ends, a file with 200 sentences or
        fewer yields an empty list.
    """
    with open(file_name, encoding='utf-8-sig') as f:
        # Drop whitespace-only lines; join once instead of growing a string.
        text = ''.join(line for line in f if not line.isspace())

    # Divide the text in sentences, based on points.
    sentences = nltk.tokenize.sent_tokenize(text)

    # Remove first 100 and last 100 sentences that normally are licences,
    # greetings, ..
    sentences = sentences[100:-100]

    # `stopwords` is the module-level list; a set makes each lookup O(1).
    stop_set = set(stopwords)
    # The lemma of a word depends only on the word itself (it is POS-tagged
    # in isolation), so compute it once per distinct word.
    lemma_cache = {}

    filtered_sentences = []
    # 'Clean' every sentence, one by one.
    for sentence in sentences:
        # Lowercase, then keep only tokens made entirely of letters.
        tokens = [w for w in word_tokenize(sentence.lower()) if w.isalpha()]

        lemmas = []
        for w in tokens:
            if w not in lemma_cache:
                lemma_cache[w] = lemmatizer.lemmatize(w, get_wordnet_pos(w))
            lemmas.append(lemma_cache[w])

        # Remove stop words and words of one character that may not be
        # included in the stop-word list.
        kept = [w for w in lemmas if w not in stop_set and len(w) > 1]
        if kept:
            filtered_sentences.append(kept)

    return filtered_sentences

In [None]:
#preprocessing without lemmatization
def preprocessing_nolemm(file_name):
    """Read a text file and return its sentences as lists of cleaned words.

    Same pipeline as the lemmatizing variant but without lemmatization: drop
    whitespace-only lines, split the text into sentences, discard the first
    and last 100 sentences, lowercase, keep only purely alphabetic tokens,
    then drop stop words and one-character words. Sentences that end up empty
    are omitted.

    Args:
        file_name: Path of the text file, read as UTF-8 (a leading BOM is
            tolerated by the 'utf-8-sig' codec).

    Returns:
        A list of sentences, each a non-empty list of word strings. Because
        of the 100-sentence trim at both ends, a file with 200 sentences or
        fewer yields an empty list.
    """
    with open(file_name, encoding='utf-8-sig') as f:
        # Drop whitespace-only lines; join once instead of growing a string.
        text = ''.join(line for line in f if not line.isspace())

    # Divide the text in sentences, based on points.
    sentences = nltk.tokenize.sent_tokenize(text)

    # Remove first 100 and last 100 sentences that normally are licences,
    # greetings, ..
    sentences = sentences[100:-100]

    # `stopwords` is the module-level list; a set makes each lookup O(1).
    stop_set = set(stopwords)

    filtered_sentences = []
    # 'Clean' every sentence, one by one.
    for sentence in sentences:
        # Lowercase, keep only tokens made entirely of letters, then remove
        # stop words and words of one character.
        kept = [
            w for w in word_tokenize(sentence.lower())
            if w.isalpha() and w not in stop_set and len(w) > 1
        ]
        if kept:
            filtered_sentences.append(kept)

    return filtered_sentences

In [None]:
def checkresults(sentences, n_samples=25):
    """Print summary statistics and a random sample of tokenized sentences.

    Prints the total number of words, the number of unique words, the ten
    most common words with their counts, and up to ``n_samples`` randomly
    chosen sentences.

    Args:
        sentences: List of sentences, each a list of word strings.
        n_samples: Maximum number of random sentences to print. Fewer are
            printed when the corpus has fewer sentences, instead of raising
            ValueError as random.sample does for an oversized sample.
    """
    # Count directly from the nested lists; no flattened copy is needed.
    word_count = Counter(word for sentence in sentences for word in sentence)

    # number of words
    print('Words: ' + str(sum(word_count.values())))

    # number of unique words
    print('Unique words: ' + str(len(word_count)))

    # most common words
    print('Most common words:')
    print(word_count.most_common(10))

    # print some sentences
    print('Random sentences:')
    sample_size = min(n_samples, len(sentences))
    for i in random.sample(range(len(sentences)), sample_size):
        print(sentences[i])

In [None]:
def savesentences(filename, sentences):
    """Write the sentences to ``<filename>.txt``, one per line.

    Each sentence is written as its ``str()`` representation followed by a
    newline. The file is written as UTF-8 explicitly so that non-ASCII words
    do not depend on the platform's default encoding.

    Args:
        filename: Output path without the '.txt' extension.
        sentences: Iterable of sentences (lists of word strings).
    """
    with open(filename + '.txt', 'w', encoding='utf-8') as fp:
        fp.writelines(str(sentence) + '\n' for sentence in sentences)

In [None]:
#save sentences not lemmatized
def savesentencesnl(filename, sentences):
    """Write the sentences to ``<filename>_nl.txt``, one per line.

    Each sentence is written as its ``str()`` representation followed by a
    newline. The path is used as given, so a relative name is resolved
    against the current working directory. The file is written as UTF-8
    explicitly so that non-ASCII words do not depend on the platform's
    default encoding.

    Args:
        filename: Output path without the '_nl.txt' suffix.
        sentences: Iterable of sentences (lists of word strings).
    """
    with open(filename + '_nl.txt', 'w', encoding='utf-8') as fp:
        fp.writelines(str(sentence) + '\n' for sentence in sentences)

# Preprocessing

## Christianity

In [None]:
# Source files for the Christianity corpus.
Christian_files = [
    'KingJamesVersion.txt',
    'NIV-Bible.txt',
    'WorldEnglishBible.txt',
]
# Sentences accumulated over all files: lemmatized and non-lemmatized.
Christian_sentences = []
Christian_sentences_nl = []

for file_name in Christian_files:
    Christian_sentences.extend(preprocessing(file_name))
    Christian_sentences_nl.extend(preprocessing_nolemm(file_name))

In [None]:
checkresults(Christian_sentences)

Words: 1004875
Unique words: 19157
Most common words:
[('shall', 17608), ('lord', 17074), ('god', 12527), ('said', 11024), ('unto', 8972), ('king', 7537), ('man', 7235), ('son', 7098), ('israel', 7009), ('yahweh', 6807)]
Random sentences:
['concord', 'hath', 'christ', 'belial']
['comes', 'defy', 'israel']
['whence', 'cometh', 'wisdom']
['others', 'mocking', 'said', 'filled', 'new', 'wine']
['david', 'saith', 'book', 'psalms', 'lord', 'said', 'unto', 'lord', 'sit', 'right', 'hand', 'till', 'make', 'enemies', 'footstool']
['forgat', 'lord', 'god', 'sold', 'hand', 'sisera', 'captain', 'host', 'hazor', 'hand', 'philistines', 'hand', 'king', 'moab', 'fought']
['struck', 'souls', 'therein', 'edge', 'sword', 'utterly', 'destroying', 'none', 'left', 'breathed', 'burnt', 'hazor', 'fire']
['cut', 'olive', 'tree', 'wild', 'nature', 'contrary', 'nature', 'grafted', 'cultivated', 'olive', 'tree', 'much', 'readily', 'natural', 'branches', 'grafted', 'olive', 'tree']
['shall', 'commit', 'adultery']
[

In [None]:
checkresults(Christian_sentences_nl)

In [None]:
savesentences('Christian_sentences',Christian_sentences)

In [None]:
# The "_nl" file must hold the non-lemmatized sentences, not the lemmatized ones.
savesentencesnl('Christian_sentences', Christian_sentences_nl)

## Islam



In [None]:
# Source files for the Islam corpus.
Islam_files = [
    'clearquran.txt',
    'habib-shakir.txt',
    'marmaduke.txt',
    'quranalhilali-khan.txt',
    'sarwar.txt',
    'Quran-Saheeh.txt',
    'yaq.txt',
    'YusufAli.txt',
]
# Sentences accumulated over all files: lemmatized and non-lemmatized.
Islam_sentences = []
Islam_sentences_nl = []

for file_name in Islam_files:
    Islam_sentences.extend(preprocessing(file_name))
    Islam_sentences_nl.extend(preprocessing_nolemm(file_name))
  

In [None]:
checkresults(Islam_sentences)

Words: 791238
Unique words: 22069
Most common words:
[('allah', 18177), ('god', 10529), ('lord', 8142), ('said', 6920), ('people', 6299), ('say', 6188), ('one', 5049), ('day', 4746), ('shall', 4524), ('us', 4502)]
Random sentences:
['cf']
['see', 'footnote']
['said', 'indeed', 'sent', 'people']
['believe', 'good', 'good', 'final', 'state', 'shall', 'goodly', 'return']
['unbelievers', 'hope']
['appoint', 'henchman', 'folk', 'aaron', 'brother']
['thought', 'harm', 'would', 'come', 'willfully', 'blind', 'deaf']
['name', 'ofallah', 'gracious', 'merciful']
['never', 'compass', 'anything', 'knowledge', 'except', 'wills']
['cf']
['let', 'present', 'life', 'deceive', 'let', 'chief', 'deceiver', 'deceive', 'allah']
['may', 'either', 'time', 'noontide', 'siesta', 'business', 'suspended', 'even', 'egypt', 'time', 'night', 'people', 'usually', 'asleep']
['cf']
['lodge', 'section', 'dwell', 'means', 'harm', 'order', 'oppress', 'pregnant', 'spend', 'give', 'birth']
['decree', 'already', 'recorded', 

In [None]:
checkresults(Islam_sentences_nl)

In [None]:
savesentences('Islam_sentences',Islam_sentences)

In [None]:
# The "_nl" file must hold the non-lemmatized sentences, not the lemmatized ones.
savesentencesnl('Islam_sentences', Islam_sentences_nl)

## Hinduism

In [None]:
# Source files for the Hinduism corpus.
files_Hinduism = [
    'paramanda_upanishads.txt',
    'the-4-vedas.txt',
    '17001079-Four-Vedas-English-Translation.txt',
    '4Upanishads - Shukla Yajur Veda.txt',
]
# Sentences accumulated over all files: lemmatized and non-lemmatized.
Hinduism_sentences = []
Hinduism_sentences_nl = []

for file_name in files_Hinduism:
    Hinduism_sentences.extend(preprocessing(file_name))
    Hinduism_sentences_nl.extend(preprocessing_nolemm(file_name))

In [None]:
checkresults(Hinduism_sentences)

Words: 697742
Unique words: 18874
Most common words:
[('us', 9479), ('indra', 9161), ('agni', 7731), ('may', 7321), ('gods', 6141), ('verily', 5121), ('soma', 5080), ('one', 4683), ('sacrifice', 4405), ('come', 4361)]
Random sentences:
['pusan']
['hymn', 'cxxiii']
['armour', 'heaven', 'earth', 'armour', 'day', 'armour', 'sun']
['yajnavalkya', 'replied', 'one', 'wishing', 'go', 'long', 'distance', 'emperor', 'secure', 'chariot', 'boat', 'fully', 'equipped', 'mind', 'many', 'secret', 'names', 'brahman']
['wife', 'would', 'yield', 'husband']
['growest', 'upon', 'mountain', 'eagle', 'art', 'sprung', 'himavant', 'come', 'treasures', 'heard', 'fame']
['indra', 'associate', 'priests', 'cleared', 'stable', 'full', 'steeds', 'kine', 'giving', 'thousand', 'eightmarked', 'cars', 'gained', 'renown', 'among', 'gods']
['victory', 'universal', 'dharma']
['keep', 'us', 'safely', 'spiteful', 'curse', 'presumptuous', 'foe']
['let', 'thousands', 'slain', 'may', 'club', 'bhava', 'crush']
['mountain', 'gro

In [None]:
checkresults(Hinduism_sentences_nl)

In [None]:
savesentences('Hinduism_sentences',Hinduism_sentences)

In [None]:
# The "_nl" file must hold the non-lemmatized sentences, not the lemmatized ones.
savesentencesnl('Hinduism_sentences', Hinduism_sentences_nl)

## Buddhism

In [None]:
# Source files for the Buddhism corpus.
files_Buddhism = [
    'discorsi_budda.txt',
    'discorsi_budda_2.txt',
    'discorsi_budda_3.txt',
    'budda4.txt',
    'budda5.txt',
    'BuddhistMonasticTraditionsofSouthernAsia.txt',
    'Buddhacarita.txt',
    'Brahmas_Net_Sutra.txt',
]
# Sentences accumulated over all files: lemmatized and non-lemmatized.
Buddhism_sentences = []
Buddhism_sentences_nl = []

for file_name in files_Buddhism:
    Buddhism_sentences.extend(preprocessing(file_name))
    Buddhism_sentences_nl.extend(preprocessing_nolemm(file_name))

In [None]:
checkresults(Buddhism_sentences)

Words: 404717
Unique words: 19699
Most common words:
[('one', 5275), ('buddha', 3319), ('see', 2622), ('people', 2216), ('also', 1930), ('world', 1570), ('great', 1547), ('way', 1506), ('three', 1473), ('life', 1470)]
Random sentences:
['sutra', 'queen', 'śrīmālā', 'lion', 'roar', 'ch']
['nevertheless', 'among', 'establish', 'meritorious', 'acts', 'violate', 'precepts', 'minds', 'yet', 'live', 'place', 'hermitage', 'recite', 'many', 'buddhist', 'scriptures', 'chant', 'continually', 'yet', 'smoothly', 'explain', 'doctrines', 'already', 'aged', 'fall', 'three', 'classes', 'mentioned', 'yet', 'whose', 'essential', 'nature', 'pure', 'virtuous', 'serious', 'evil']
['without', 'one', 'good', 'qualities', 'established', 'morality']
['said', 'buddha', 'may', 'request', 'one', 'holiness', 'venerable', 'disciples', 'accept', 'tomorrow', 'meal', 'place']
['vicikitsā']
['turns', 'noble', 'humble', 'upside', 'falsely', 'makes', 'talk']
['seventh', 'sentient', 'beings', 'reside', 'sphere', 'nothingn

In [None]:
checkresults(Buddhism_sentences_nl)

In [None]:
savesentences('Buddhism_sentences', Buddhism_sentences)

In [None]:
# The "_nl" file must hold the non-lemmatized sentences, not the lemmatized ones.
savesentencesnl('Buddhism_sentences', Buddhism_sentences_nl)