## Assignment-04 Extraction of Syntactic Translation Models from Parallel Data using Syntax from Source and Target Languages
# Subramanyam Sahoo
### @22mcs107 @NITHamirpur

## Let's do some dependency parsing

In [1]:
import spacy
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc1 = nlp(u"The Sun rises in the East.")

In [4]:
displacy.render(doc1,style='dep',jupyter=True)

In [5]:
options={
    'distance':80,
    'compact':True,
    'color':'#fff',
    'bg':'#00a65a'
}

In [6]:

displacy.render(doc1,style='dep',jupyter=True,options=options)

In [7]:
doc1 = nlp("What are the major differences between metals and non-metals?")
displacy.render(doc1,style='dep',jupyter=True)

In [8]:
doc1 = nlp("That was a great match!")
displacy.render(doc1,style='dep',jupyter=True)

In [9]:
doc1 = nlp("Please pick up the notes when you come.")
displacy.render(doc1,style='dep',jupyter=True)

In [10]:
doc1 = nlp("Rahul did not complete his homework, so the teacher punished him.")
displacy.render(doc1,style='dep',jupyter=True)

In [11]:
doc1 = nlp("The children were asked to go home because it was too late.")
displacy.render(doc1,style='dep',jupyter=True)

In [12]:
doc1 = nlp("The man next to the large oak tree near the grocery store on the corner is tall.")
displacy.render(doc1,style='dep',jupyter=True)

In [13]:
doc1 = nlp("The men next to the large oak tree near the grocery store on the corner are tall.")
displacy.render(doc1,style='dep',jupyter=True)

In [14]:
doc1 = nlp("Incessant heavy rainfall causes waterlogging in parts of Delhi.")
displacy.render(doc1,style='dep',jupyter=True)

## Passive Voice Sentences

In [15]:
doc1 = nlp("The baby was kissed on the head by the lady.")
displacy.render(doc1,style='dep',jupyter=True)

In [16]:
doc1 = nlp("The milk was spilled by the boy.")
displacy.render(doc1,style='dep',jupyter=True)

In [17]:
doc1 = nlp("The fish was eaten.")
displacy.render(doc1,style='dep',jupyter=True)

## OR Sentences

In [18]:
doc1 = nlp("(Sometimes he would take care of the whole flock while the shepherd was resting  eating his dinner.")
displacy.render(doc1,style='dep',jupyter=True)

In [19]:
doc1 = nlp("The goat that the pig had bumped near the bush was smiling.")
displacy.render(doc1,style='dep',jupyter=True)

## SVO Structured

In [20]:
doc1 = nlp("The boy smiling is consoling the little girl crying.")
displacy.render(doc1,style='dep',jupyter=True)

## Idioms

In [21]:
doc1 = nlp("the spirit is willing but the flesh is weak.")
displacy.render(doc1,style='dep',jupyter=True)

## ill formed sentence

In [22]:
doc1 = nlp("The train that the knife had helped under the square was cold.")
displacy.render(doc1,style='dep',jupyter=True)

## Bigram & Trigram Extractor

In [23]:
class SNgramExtractor:
    '''
    text:input text
    meta_tag:Resultant bigram and trigram should be concatenated with part of speech tag('pos') or dependency tag('dep') or original SN-gram('original')
    trigram_flag:if we need to include trigrams derived from SN-grams as well ('yes') or not ('no'). Default is 'yes'
w
    '''
    def __init__(self,text,meta_tag,trigram_flag='yes',nlp_model=None):
        self.text=text
        self.meta_tag=meta_tag
        self.trigram_flag=trigram_flag
        if nlp_model:
            self.nlp_model=nlp_model
        else:
            self.nlp_model=nlp

    def get_trigram_element(self,trigram_element):
        return '_'.join([str(element) for element in trigram_element.split('_')[:-1]])

    def get_trigrams(self,left_right_words):
        trigrams=[]
        for i in range(len(left_right_words.values())):
            right=list(left_right_words.values())[i]
            right_value=list(left_right_words.keys())[i]
            if right in left_right_words.keys():
                #if present, find index and the actual word
                left_indx=list(left_right_words.keys()).index(right)
                #right,left key, left value
                left_key=list(left_right_words.keys())[left_indx]
                left_value=left_right_words[left_key]
                trigrams.append(str(self.get_trigram_element(right_value))+'_'+str(self.get_trigram_element(left_key))+'_'+str(self.get_trigram_element(left_value)))
        return ' '.join([str(trigram) for trigram in trigrams])
    
    def get_SNgram(self):
        bigrams=[]
        word_list=[]

        left_right_words={}
        unique_pos={}
        unique_dep={}

        nlp_obj=self.nlp_model(self.text)

        for spacy_element in nlp_obj:
            #no same head and body
            if str(spacy_element.head)+str(spacy_element.head.idx)!=str(spacy_element)+str(spacy_element.idx):
                ##check type of concatenation between head and body with meta attributes
                if self.meta_tag=='dep':
                    bigrams.append(str(spacy_element.head)+'_'+spacy_element.head.dep_+'_'+str(spacy_element)+'_'+spacy_element.dep_)
                    left_right_words[str(spacy_element.head)+'_'+str(spacy_element.head.dep_)+'_'+str(spacy_element.head.idx)]=str(spacy_element)+'_'+str(spacy_element.dep_)+'_'+str(spacy_element.idx)
                elif self.meta_tag=='pos':
                    bigrams.append(str(spacy_element.head)+'_'+spacy_element.head.pos_+'_'+str(spacy_element)+'_'+spacy_element.pos_)
                    left_right_words[str(spacy_element.head)+'_'+str(spacy_element.head.pos_)+'_'+str(spacy_element.head.idx)]=str(spacy_element)+'_'+str(spacy_element.pos_)+'_'+str(spacy_element.idx)
                elif self.meta_tag=='original' or self.meta_tag=='':
                    bigrams.append(str(spacy_element.head)+'_'+str(spacy_element))
                    left_right_words[str(spacy_element.head)+'_'+str(spacy_element.head.idx)]=str(spacy_element)+'_'+str(spacy_element.idx)

        flat_bigrams=' '.join([str(bigram) for bigram in bigrams])
        
        result_dict={}
        result_dict['SNBigram']=flat_bigrams

        if self.trigram_flag=='yes':
            result_dict['SNTrigram']=self.get_trigrams(left_right_words)
            return result_dict
        else:
            return result_dict
        
if __name__=="__main__":
    text='Economic news have little effect on financial markets.every cloud has a silver lining'  
    'What are the major differences between metals and non-metals?'  
    SNgram_obj=SNgramExtractor(text,meta_tag='original',trigram_flag='yes',nlp_model=None)
    output=SNgram_obj.get_SNgram()
    print(text)
    print('SNGram bigram:',output['SNBigram'])
    print('SNGram trigram:',output['SNTrigram'])
    
    print('-----------------------------------')
    text= '''The estate is called Carfax, no doubt a corruption of the old Quatre Face, as
the house is four-sided, agreeing with the cardinal points of the compass. It
contains in all some twenty acres, quite surrounded by the solid stone wall above
mentioned. There are many trees on it, which make it in places gloomy, and
there is a deep, dark-looking pond or small lake, evidently fed by some springs,
as the water is clear and flows away in a fair-sized stream. The house is very
large and of all periods back, I should say, to mediæval times, for one part is of
stone immensely thick, with only a few windows high up and heavily barred
with iron. It looks like part of a keep, and is close to an old chapel or church. I
could not enter it, as I had not the key of the door leading to it from the house,
but I have taken with my kodak views of it from various points. The house has
been added to, but in a very straggling way, and I can only guess at the amount
of ground it covers, which must be very great. There are but few houses close at
hand, one being a very large house only recently added to and formed into a
private lunatic asylum. It is not, however, visible from the grounds.'''
    SNgram_obj=SNgramExtractor(text,meta_tag='original',trigram_flag='yes',nlp_model=None)
    output=SNgram_obj.get_SNgram()
    print(text)
    print('SNGram bigram:',output['SNBigram'])
    print('SNGram trigram:',output['SNTrigram'])

Economic news have little effect on financial markets.every cloud has a silver lining
SNGram bigram: news_Economic have_news effect_little have_effect effect_on markets.every_financial cloud_markets.every on_cloud have_has lining_a lining_silver has_lining
SNGram trigram: have_has_lining effect_on_cloud cloud_markets.every_financial on_cloud_markets.every has_lining_silver
-----------------------------------
The estate is called Carfax, no doubt a corruption of the old Quatre Face, as
the house is four-sided, agreeing with the cardinal points of the compass. It
contains in all some twenty acres, quite surrounded by the solid stone wall above
mentioned. There are many trees on it, which make it in places gloomy, and
there is a deep, dark-looking pond or small lake, evidently fed by some springs,
as the water is clear and flows away in a fair-sized stream. The house is very
large and of all periods back, I should say, to mediæval times, for one part is of
stone immensely thick, with only

In [24]:
import nltk

paragraph =  """The estate is called Carfax, no doubt a corruption of the old Quatre Face, as
the house is four-sided, agreeing with the cardinal points of the compass. It
contains in all some twenty acres, quite surrounded by the solid stone wall above
mentioned. There are many trees on it, which make it in places gloomy, and
there is a deep, dark-looking pond or small lake, evidently fed by some springs,
as the water is clear and flows away in a fair-sized stream. The house is very
large and of all periods back, I should say, to mediæval times, for one part is of
stone immensely thick, with only a few windows high up and heavily barred
with iron. It looks like part of a keep, and is close to an old chapel or church. I
could not enter it, as I had not the key of the door leading to it from the house,
but I have taken with my kodak views of it from various points. The house has
been added to, but in a very straggling way, and I can only guess at the amount
of ground it covers, which must be very great. There are but few houses close at
hand, one being a very large house only recently added to and formed into a
private lunatic asylum. It is not, however, visible from the grounds."""         

In [25]:
# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
    
# Creating the Bag of Words model i.e document matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

In [26]:
print (X)

[[0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
  1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0
  0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

## convert text to lower case

In [27]:
dataset = nltk.sent_tokenize(paragraph)
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W', ' ', dataset[i])
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])

In [28]:
print(dataset)

['the estate is called carfax no doubt a corruption of the old quatre face as the house is four sided agreeing with the cardinal points of the compass ', 'it contains in all some twenty acres quite surrounded by the solid stone wall above mentioned ', 'there are many trees on it which make it in places gloomy and there is a deep dark looking pond or small lake evidently fed by some springs as the water is clear and flows away in a fair sized stream ', 'the house is very large and of all periods back i should say to mediæval times for one part is of stone immensely thick with only a few windows high up and heavily barred with iron ', 'it looks like part of a keep and is close to an old chapel or church ', 'i could not enter it as i had not the key of the door leading to it from the house but i have taken with my kodak views of it from various points ', 'the house has been added to but in a very straggling way and i can only guess at the amount of ground it covers which must be very grea

## Creating the Bag of Words model

In [29]:

word2count = {}
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [30]:
print(word2count)

{'the': 14, 'estate': 1, 'is': 8, 'called': 1, 'carfax': 1, 'no': 1, 'doubt': 1, 'a': 8, 'corruption': 1, 'of': 8, 'old': 2, 'quatre': 1, 'face': 1, 'as': 3, 'house': 5, 'four': 1, 'sided': 1, 'agreeing': 1, 'with': 4, 'cardinal': 1, 'points': 2, 'compass': 1, 'it': 9, 'contains': 1, 'in': 4, 'all': 2, 'some': 2, 'twenty': 1, 'acres': 1, 'quite': 1, 'surrounded': 1, 'by': 2, 'solid': 1, 'stone': 2, 'wall': 1, 'above': 1, 'mentioned': 1, 'there': 3, 'are': 2, 'many': 1, 'trees': 1, 'on': 1, 'which': 2, 'make': 1, 'places': 1, 'gloomy': 1, 'and': 7, 'deep': 1, 'dark': 1, 'looking': 1, 'pond': 1, 'or': 2, 'small': 1, 'lake': 1, 'evidently': 1, 'fed': 1, 'springs': 1, 'water': 1, 'clear': 1, 'flows': 1, 'away': 1, 'fair': 1, 'sized': 1, 'stream': 1, 'very': 4, 'large': 2, 'periods': 1, 'back': 1, 'i': 5, 'should': 1, 'say': 1, 'to': 5, 'mediæval': 1, 'times': 1, 'for': 1, 'one': 2, 'part': 2, 'immensely': 1, 'thick': 1, 'only': 3, 'few': 2, 'windows': 1, 'high': 1, 'up': 1, 'heavily': 1, '

## show frequent words

In [31]:
import heapq
freq_words = heapq.nlargest(1000, word2count, key=word2count.get)

In [32]:
print(freq_words)

['the', 'it', 'is', 'a', 'of', 'and', 'house', 'i', 'to', 'with', 'in', 'very', 'as', 'there', 'only', 'not', 'from', 'but', 'old', 'points', 'all', 'some', 'by', 'stone', 'are', 'which', 'or', 'large', 'one', 'part', 'few', 'close', 'added', 'at', 'estate', 'called', 'carfax', 'no', 'doubt', 'corruption', 'quatre', 'face', 'four', 'sided', 'agreeing', 'cardinal', 'compass', 'contains', 'twenty', 'acres', 'quite', 'surrounded', 'solid', 'wall', 'above', 'mentioned', 'many', 'trees', 'on', 'make', 'places', 'gloomy', 'deep', 'dark', 'looking', 'pond', 'small', 'lake', 'evidently', 'fed', 'springs', 'water', 'clear', 'flows', 'away', 'fair', 'sized', 'stream', 'periods', 'back', 'should', 'say', 'mediæval', 'times', 'for', 'immensely', 'thick', 'windows', 'high', 'up', 'heavily', 'barred', 'iron', 'looks', 'like', 'keep', 'an', 'chapel', 'church', 'could', 'enter', 'had', 'key', 'door', 'leading', 'have', 'taken', 'my', 'kodak', 'views', 'various', 'has', 'been', 'straggling', 'way', 'ca

In [33]:
import numpy as np
X = []
for data in dataset:
    vector = []
    for word in freq_words:
        if word in nltk.word_tokenize(data):
            vector.append(1)
        else:
            vector.append(0)
    X.append(vector)
X = np.asarray(X)

In [34]:
print (X)

[[1 0 1 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 1 1 1]]


In [35]:
pip install textblob


Note: you may need to restart the kernel to use updated packages.


In [36]:
import textblob
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

blob = TextBlob('TextBlob is a great tool for developers')
print(blob.translate(to='hi'))

In [37]:
my_sentence = TextBlob(paragraph)

## Finding the Tagset 

In [38]:
my_sentence.tags

[('The', 'DT'),
 ('estate', 'NN'),
 ('is', 'VBZ'),
 ('called', 'VBN'),
 ('Carfax', 'NNP'),
 ('no', 'DT'),
 ('doubt', 'NN'),
 ('a', 'DT'),
 ('corruption', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('old', 'JJ'),
 ('Quatre', 'NNP'),
 ('Face', 'NNP'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('house', 'NN'),
 ('is', 'VBZ'),
 ('four-sided', 'JJ'),
 ('agreeing', 'VBG'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('cardinal', 'JJ'),
 ('points', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('compass', 'NN'),
 ('It', 'PRP'),
 ('contains', 'VBZ'),
 ('in', 'IN'),
 ('all', 'DT'),
 ('some', 'DT'),
 ('twenty', 'CD'),
 ('acres', 'NNS'),
 ('quite', 'RB'),
 ('surrounded', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('solid', 'JJ'),
 ('stone', 'NN'),
 ('wall', 'NN'),
 ('above', 'IN'),
 ('mentioned', 'VBN'),
 ('There', 'EX'),
 ('are', 'VBP'),
 ('many', 'JJ'),
 ('trees', 'NNS'),
 ('on', 'IN'),
 ('it', 'PRP'),
 ('which', 'WDT'),
 ('make', 'VBP'),
 ('it', 'PRP'),
 ('in', 'IN'),
 ('places', 'NNS'),
 ('gloomy', 'JJ'),
 ('and', 'CC'),
 ('

## calculating total number of words

In [39]:
my_sentence.words

WordList(['The', 'estate', 'is', 'called', 'Carfax', 'no', 'doubt', 'a', 'corruption', 'of', 'the', 'old', 'Quatre', 'Face', 'as', 'the', 'house', 'is', 'four-sided', 'agreeing', 'with', 'the', 'cardinal', 'points', 'of', 'the', 'compass', 'It', 'contains', 'in', 'all', 'some', 'twenty', 'acres', 'quite', 'surrounded', 'by', 'the', 'solid', 'stone', 'wall', 'above', 'mentioned', 'There', 'are', 'many', 'trees', 'on', 'it', 'which', 'make', 'it', 'in', 'places', 'gloomy', 'and', 'there', 'is', 'a', 'deep', 'dark-looking', 'pond', 'or', 'small', 'lake', 'evidently', 'fed', 'by', 'some', 'springs', 'as', 'the', 'water', 'is', 'clear', 'and', 'flows', 'away', 'in', 'a', 'fair-sized', 'stream', 'The', 'house', 'is', 'very', 'large', 'and', 'of', 'all', 'periods', 'back', 'I', 'should', 'say', 'to', 'mediæval', 'times', 'for', 'one', 'part', 'is', 'of', 'stone', 'immensely', 'thick', 'with', 'only', 'a', 'few', 'windows', 'high', 'up', 'and', 'heavily', 'barred', 'with', 'iron', 'It', 'looks

## Calculating total number of sentences

In [40]:
my_sentence.sentences

[Sentence("The estate is called Carfax, no doubt a corruption of the old Quatre Face, as
 the house is four-sided, agreeing with the cardinal points of the compass."),
 Sentence("It
 contains in all some twenty acres, quite surrounded by the solid stone wall above
 mentioned."),
 Sentence("There are many trees on it, which make it in places gloomy, and
 there is a deep, dark-looking pond or small lake, evidently fed by some springs,
 as the water is clear and flows away in a fair-sized stream."),
 Sentence("The house is very
 large and of all periods back, I should say, to mediæval times, for one part is of
 stone immensely thick, with only a few windows high up and heavily barred
 with iron."),
 Sentence("It looks like part of a keep, and is close to an old chapel or church."),
 Sentence("I
 could not enter it, as I had not the key of the door leading to it from the house,
 but I have taken with my kodak views of it from various points."),
 Sentence("The house has
 been added to, but 

## Lemmatization

In [41]:
from textblob import Word
w = Word(paragraph)
w.lemmatize()

'The estate is called Carfax, no doubt a corruption of the old Quatre Face, as\nthe house is four-sided, agreeing with the cardinal points of the compass. It\ncontains in all some twenty acres, quite surrounded by the solid stone wall above\nmentioned. There are many trees on it, which make it in places gloomy, and\nthere is a deep, dark-looking pond or small lake, evidently fed by some springs,\nas the water is clear and flows away in a fair-sized stream. The house is very\nlarge and of all periods back, I should say, to mediæval times, for one part is of\nstone immensely thick, with only a few windows high up and heavily barred\nwith iron. It looks like part of a keep, and is close to an old chapel or church. I\ncould not enter it, as I had not the key of the door leading to it from the house,\nbut I have taken with my kodak views of it from various points. The house has\nbeen added to, but in a very straggling way, and I can only guess at the amount\nof ground it covers, which must 

## parsing is done using TextBlob package in Anaconda env. @jupyter notebook

In [42]:
my_sentence.parse()

'The/DT/B-NP/O estate/NN/I-NP/O is/VBZ/B-VP/O called/VBN/I-VP/O Carfax/NNP/B-NP/O ,/,/O/O no/DT/B-NP/O doubt/NN/I-NP/O a/DT/B-NP/O corruption/NN/I-NP/O of/IN/B-PP/B-PNP the/DT/B-NP/I-PNP old/JJ/I-NP/I-PNP Quatre/NNP/I-NP/I-PNP Face/NNP/I-NP/I-PNP ,/,/O/O as/IN/B-PP/B-PNP the/DT/B-NP/I-PNP house/NN/I-NP/I-PNP is/VBZ/B-VP/O four-sided/JJ/B-ADJP/O ,/,/O/O agreeing/VBG/B-VP/O with/IN/B-PP/B-PNP the/DT/B-NP/I-PNP cardinal/JJ/I-NP/I-PNP points/NNS/I-NP/I-PNP of/IN/B-PP/B-PNP the/DT/B-NP/I-PNP compass/NN/I-NP/I-PNP ././O/O\nIt/PRP/B-NP/O contains/VBZ/B-VP/O in/IN/B-PP/B-PNP all/DT/B-NP/I-PNP some/DT/I-NP/I-PNP twenty/CD/I-NP/I-PNP acres/NNS/I-NP/I-PNP ,/,/O/O quite/RB/B-VP/O surrounded/VBN/I-VP/O by/IN/B-PP/B-PNP the/DT/B-NP/I-PNP solid/JJ/I-NP/I-PNP stone/NN/I-NP/I-PNP wall/NN/I-NP/I-PNP above/IN/B-PP/B-PNP mentioned/VBN/B-VP/I-PNP ././O/O\nThere/EX/O/O are/VBP/B-VP/O many/JJ/B-NP/O trees/NNS/I-NP/O on/IN/B-PP/B-PNP it/PRP/B-NP/I-PNP ,/,/O/O which/WDT/O/O make/VB/B-VP/O it/PRP/B-NP/O in/IN/B

In [43]:
pip install English-to-Hindi

Note: you may need to restart the kernel to use updated packages.


In [44]:
pip install corpora

Note: you may need to restart the kernel to use updated packages.


In [45]:
from nltk.corpus import indian

nltk.corpus.indian.words('hindi.pos')

['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', ...]

# CODING IS DONE OVER MY PARALLEL CORPORA

In [46]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import math

In [47]:
textenglish=open('englishtest.txt')
english=textenglish.read()
print(english)

Fresh breath and shining teeth enhance your personality .
Your self-confidence also increases with teeth .
Bacteria stay between our gums and teeth .
They make teeth dirty and breath stinky .
You may keep your teeth clean and breath fresh by the help of some easy tips given here .
Clean your teeth properly .
It takes two to three minutes to clean your teeth properly .
But most of the people give less than one minute for this .
Drink plenty of water .
Bacteria attack fast if the mouth dries up .
With this stink comes from breath .
By drinking plenty of water not only the left-over pieces of food gets cleaned but saliva also gets formed .
Saliva has important role in keeping the mouth clean .
Saliva destroys those bacteria which create stink in breath .
Chew the sugar-free chewing gum .
Saliva is formed by chewing the chewing gum .
Chewing gum helps in keeping the teeth clean .
Sugared chewing gum is not supposed to be good for health .
That is why dentists do not suggest chewing sugared

In [48]:
hinditext=open('hinditest.txt' , encoding = "utf-8")
hindi=hinditext.read()
print(hindi)

ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।
दाँतों से आपका आत्मविश्‍वास भी बढ़ता है ।
हमारे मसूढ़ों और दाँतों के बीच बैक्टीरिया मौजूद होते हैं ।
ये दाँतों को गंदा और साँसों को बदबूदार बना देते हैं ।
यहाँ दिए कुछ आसान नुस्खों की मदद से आप अपने दाँतों को स्वच्छ और साँसों को ताजा रख सकते हैं ।
दाँतों को ठीक से साफ करें ।
दाँतों को ठीक से साफ करने में दो से तीन मिनट का समय लगता है ।
लेकिन ज्यादातर लोग इसके लिए एक मिनट से भी कम समय देते हैं ।
खूब पानी पीएँ ।
मुँह सूखने पर बैक्टीरिया हमला तेज कर देते हैं ।
इससे साँसों से बदबू आने लगती है ।
खूब पानी पीने से न केवल खाने के बचे - खुचे टुकड़े साफ हो जाते हैं , बल्कि लार भी बनती है ।
मुँह साफ रखने में लार की खास भूमिका होती है ।
लार उन बैक्टीरिया को नष्ट करती है जो साँसों में बदबू पैदा करते हैं ।
चबाएँ शुगर रहित चुइंग गम ।
चुइंग गम चबाने से लार बनती है ।
चुइंग गम से दाँतों को साफ रखने में मदद मिलती है ।
शुगर युक्त गम को सेहत के लिए अच्छा नहीं माना जाता ।
इसलिए डेंटिस्ट शुगर युक्त गम को खाने की सलाह नहीं देते ।
नियमित रूप से कराएँ

### depedency parsing of 1st sentence in Hindi Corpora

In [49]:
doc1 = nlp("ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।.")
displacy.render(doc1,style='dep',jupyter=True)

## Creating Frequency Table
### Now, we’ll create a frequency table which is a dictionary having words as keys and their frequency or number of times that word have appeared in tboth Hindi and English Corpora as value

In [50]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

In [51]:
print(stop_words) #english stopwords

{'weren', 'have', 'wouldn', 't', 'they', 'same', 'until', 'mustn', "shouldn't", "hadn't", 'most', 'itself', 'at', 'off', 'doesn', "you'd", 'with', 'you', 'of', 'in', 'few', 'had', 'should', 'ain', 'having', 'as', 'isn', 'mightn', 'other', 's', 'just', 'no', 'o', 'my', "wouldn't", 'when', 'am', 'above', "doesn't", 'm', 'by', 'further', 'very', 'because', 'ours', "isn't", 'yourself', 'her', 'who', 'each', 'if', "weren't", "it's", 'which', 'was', 'under', 'me', 'be', "you're", 'how', 'your', "needn't", 'any', 'an', "aren't", 'ma', 'this', 'theirs', 'into', "don't", 'won', 'are', "should've", 'from', 'then', 'the', "you'll", 'been', 'yourselves', 'shan', 'and', "you've", "she's", 'd', 'after', "wasn't", 'will', 'while', 'again', 'nor', "hasn't", 'myself', 'is', 'haven', "haven't", 'do', 'has', 'there', 'yours', 'it', 'their', 'a', "didn't", 'aren', 'only', "mustn't", 'about', 'between', 'these', 'were', 'did', "that'll", 'such', 'where', 'not', 'so', 'hasn', 'whom', 'don', 'now', 've', 'hi

In [52]:
def createfrequencytable(text_string) -> dict:
    stopWords = set(stop_words)
    words = word_tokenize(text_string)
    ps = PorterStemmer()
   
    freqTable = dict()
    for word in words:
        word=str(word)
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1
    return freqTable
ft=createfrequencytable(english)
print(ft)

{'fresh': 2, 'breath': 6, 'shine': 2, 'teeth': 14, 'enhanc': 1, 'person': 3, '.': 213, 'self-confid': 1, 'also': 7, 'increas': 4, 'bacteria': 3, 'stay': 1, 'gum': 6, 'make': 4, 'dirti': 1, 'stinki': 1, 'may': 4, 'keep': 11, 'clean': 11, 'help': 3, 'easi': 2, 'tip': 1, 'given': 3, 'properli': 2, 'take': 12, 'two': 10, 'three': 5, 'minut': 2, 'peopl': 3, 'give': 2, 'less': 7, 'one': 11, 'thi': 33, 'drink': 4, 'plenti': 3, 'water': 16, 'attack': 2, 'fast': 2, 'mouth': 4, 'dri': 1, 'stink': 3, 'come': 6, 'onli': 14, 'left-ov': 2, 'piec': 2, 'food': 10, 'get': 14, 'saliva': 4, 'form': 3, 'ha': 3, 'import': 3, 'role': 1, 'destroy': 1, 'creat': 1, 'chew': 8, 'sugar-fre': 1, 'sugar': 4, 'suppos': 1, 'good': 1, 'health': 2, 'whi': 3, 'dentist': 2, 'suggest': 2, 'checked-up': 3, 'regularli': 3, 'solv': 1, 'small': 4, 'problem': 1, 'easili': 1, 'meal': 1, 'everi': 5, 'time': 7, 'eat': 6, 'mixtur': 4, 'lemon': 2, 'salt': 6, 'spoon': 2, 'pour': 1, 'four': 3, 'drop': 1, 'juic': 1, 'week': 1, 'start'

In [53]:
stop=open('Hindistopwords.txt' , encoding = "utf-8")
stopwords=[]
for x in stop:
   stopwords.append(x)

print(stop)
print(stopwords)

<_io.TextIOWrapper name='Hindistopwords.txt' mode='r' encoding='utf-8'>
['अत\n', 'अपना\n', 'अपनी\n', 'अपने\n', 'अभी\n', 'अंदर\n', 'आदि\n', 'आप\n', 'इत्यादि\n', 'इन \n', 'इनका\n', 'इन्हीं\n', 'इन्हें\n', 'इन्हों\n', 'इस\n', 'इसका\n', 'इसकी\n', 'इसके\n', 'इसमें\n', 'इसी\n', 'इसे\n', 'उन\n', 'उनका\n', 'उनकी\n', 'उनके\n', 'उनको\n', 'उन्हीं\n', 'उन्हें\n', 'उन्हों\n', 'उस\n', 'उसके\n', 'उसी\n', 'उसे\n', 'एक\n', 'एवं\n', 'एस\n', 'ऐसे\n', 'और\n', 'कई\n', 'कर\n', 'करता\n', 'करते\n', 'करना\n', 'करने\n', 'करें\n', 'कहते\n', 'कहा\n', 'का\n', 'काफ़ी\n', 'कि\n', 'कितना\n', 'किन्हें\n', 'किन्हों\n', 'किया\n', 'किर\n', 'किस\n', 'किसी\n', 'किसे\n', 'की\n', 'कुछ\n', 'कुल\n', 'के\n', 'को\n', 'कोई\n', 'कौन\n', 'कौनसा\n', 'गया\n', 'घर\n', 'जब\n', 'जहाँ\n', 'जा\n', 'जितना\n', 'जिन\n', 'जिन्हें\n', 'जिन्हों\n', 'जिस\n', 'जिसे\n', 'जीधर\n', 'जैसा\n', 'जैसे\n', 'जो\n', 'तक\n', 'तब\n', 'तरह\n', 'तिन\n', 'तिन्हें\n', 'तिन्हों\n', 'तिस\n', 'तिसे\n', 'तो\n', 'था\n', 'थी\n', 'थे\n', 'दबारा\n', 'दिया\n', 'दुसरा\n', 

In [54]:
def createfrequencytable(text_string) -> dict:
    stopWords = set(stopwords)
    words = word_tokenize(text_string)
    ps = PorterStemmer()
   
    freqTable = dict()
    for word in words:
        word=str(word)
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1
    return freqTable
ft=createfrequencytable(hindi)
print(ft)

{'ताजा': 2, 'साँसें': 2, 'और': 29, 'चमचमाते': 1, 'दाँत': 4, 'आपके': 1, 'व्यक्तित्व': 1, 'को': 37, 'निखारते': 1, 'हैं': 33, '।': 186, 'दाँतों': 11, 'से': 87, 'आपका': 2, 'आत्मविश्\u200dवास': 1, 'भी': 15, 'बढ़ता': 1, 'है': 62, 'हमारे': 1, 'मसूढ़ों': 1, 'के': 76, 'बीच': 5, 'बैक्टीरिया': 3, 'मौजूद': 1, 'होते': 1, 'ये': 1, 'गंदा': 1, 'साँसों': 5, 'बदबूदार': 1, 'बना': 3, 'देते': 6, 'यहाँ': 5, 'दिए': 2, 'कुछ': 9, 'आसान': 1, 'नुस्खों': 1, 'की': 57, 'मदद': 3, 'आप': 13, 'अपने': 9, 'स्वच्छ': 1, 'रख': 2, 'सकते': 7, 'ठीक': 2, 'साफ': 14, 'करें': 17, 'करने': 7, 'में': 69, 'दो': 9, 'तीन': 5, 'मिनट': 2, 'का': 36, 'समय': 3, 'लगता': 2, 'लेकिन': 12, 'ज्यादातर': 1, 'लोग': 3, 'इसके': 3, 'लिए': 14, 'एक': 19, 'कम': 13, 'खूब': 2, 'पानी': 16, 'पीएँ': 1, 'मुँह': 4, 'सूखने': 1, 'पर': 23, 'हमला': 1, 'तेज': 1, 'कर': 9, 'इससे': 6, 'बदबू': 2, 'आने': 1, 'लगती': 2, 'पीने': 1, 'न': 9, 'केवल': 2, 'खाने': 8, 'बचे': 2, '-': 13, 'खुचे': 1, 'टुकड़े': 2, 'हो': 22, 'जाते': 4, ',': 43, 'बल्कि': 2, 'लार': 4, 'बनती': 2, 'रखने': 2,




## tokenization of HINDI sentences

In [55]:

text=open('hinditest.txt' , encoding = "utf-8")
sentences = sent_tokenize(text.read()) # NLTK function
total_documents = len(sentences)
print(sentences)
print(total_documents)

['ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।\nदाँतों से आपका आत्मविश्\u200dवास भी बढ़ता है ।\nहमारे मसूढ़ों और दाँतों के बीच बैक्टीरिया मौजूद होते हैं ।\nये दाँतों को गंदा और साँसों को बदबूदार बना देते हैं ।\nयहाँ दिए कुछ आसान नुस्खों की मदद से आप अपने दाँतों को स्वच्छ और साँसों को ताजा रख सकते हैं ।\nदाँतों को ठीक से साफ करें ।\nदाँतों को ठीक से साफ करने में दो से तीन मिनट का समय लगता है ।\nलेकिन ज्यादातर लोग इसके लिए एक मिनट से भी कम समय देते हैं ।\nखूब पानी पीएँ ।\nमुँह सूखने पर बैक्टीरिया हमला तेज कर देते हैं ।\nइससे साँसों से बदबू आने लगती है ।\nखूब पानी पीने से न केवल खाने के बचे - खुचे टुकड़े साफ हो जाते हैं , बल्कि लार भी बनती है ।\nमुँह साफ रखने में लार की खास भूमिका होती है ।\nलार उन बैक्टीरिया को नष्ट करती है जो साँसों में बदबू पैदा करते हैं ।\nचबाएँ शुगर रहित चुइंग गम ।\nचुइंग गम चबाने से लार बनती है ।\nचुइंग गम से दाँतों को साफ रखने में मदद मिलती है ।\nशुगर युक्त गम को सेहत के लिए अच्छा नहीं माना जाता ।\nइसलिए डेंटिस्ट शुगर युक्त गम को खाने की सलाह नहीं द

In [56]:
pip install giza

Note: you may need to restart the kernel to use updated packages.


# BLEU score @ Sentence Level
## using Pytorch moule

In [57]:
pip install torchmetrics

Note: you may need to restart the kernel to use updated packages.


In [58]:
>>> from torchmetrics.functional import bleu_score
>>> preds = ['the cat is on the mat']
>>> target = [['there is a cat on the mat', 'a cat is on the mat']]
>>> bleu_score(preds, target)

tensor(0.7598)

# METEOR Score @Sentrence Level

In [59]:
import sys
from nltk.translate.meteor_score import single_meteor_score

In [60]:
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']

In [61]:
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']

In [62]:
>>> print(single_meteor_score) ## user_readability is not smooth.

<function single_meteor_score at 0x000002313719BCA0>


In [63]:
single_meteor_score(reference1,hypothesis1) #Actual Score

0.6944444444444445

In [64]:
def compute_perplexity(log_likelihoods, seq_lens):
    """Computes a MC estimate of perplexity per word based on given likelihoods/ELBO.

    Args:
        log_likelihoods(list of float): likelihood or ELBO from N runs over the same data.
        seq_lens(list of int): the length of sequences in the data, for computing an average.

    Returns:
        perplexity(float): perplexity per word of the data.
        variance(float): variance of the log_likelihoods/ELBO that were used to compute the estimate.
    """
    # Compute perplexity per word and variance of perplexities in the samples
    perplexity = np.exp(np.array(log_likelihoods).mean() / np.array(seq_lens).mean())
    if len(log_likelihoods) > 1:
        variance = np.array(log_likelihoods).mean(axis=1).std(ddof=1)
    else:
        variance = 0.0

    return perplexity, variance 

In [65]:
pip install modules

Note: you may need to restart the kernel to use updated packages.


In [66]:
import modules

In [67]:
pip install pke-tool

Note: you may need to restart the kernel to use updated packages.


# Word Alignment Using HIDDEN MARKOV MODEL

## 1st sentence of both English - Hindi monolingual test corpora is being used

In [68]:
import random
src_words=["Fresh", "breath", "and" ,"shining", "teeth" , "enhance" , "your" , "personality" ]
trg_words=["ताजा", "साँसें" "और"  "चमचमाते", "दाँत " ,"आपके" , "व्यक्तित्व" "को " , " निखारते " , "हैं" ]
def match_indexes(word1,word2):
    return random.random() #adjust this to get the actual correlation value

all_pairs_vals=[] #list for all the source (src) and taget (trg) indexes and the corresponding correlation values
for i in range(len(src_words)): #iterate over src  indexes
    src_word=src_words[i] #identify the correponding src word
    for j in range(len(trg_words)): #iterate over trg indexes
        trg_word=trg_words[j] #identify the correponding trg word
        val=match_indexes(src_word,trg_word) #get the matching value from the inverted indexes of     each word (or from the data provided in the speadsheet)
        all_pairs_vals.append((i,j,val)) #add the sentence indexes for scr and trg, and the corresponding val

all_pairs_vals.sort(key=lambda x:-x[-1])  #sort the list in descending order, to get the pairs with the highest correlation first
selected_alignments=[]
used_i,used_j=[],[] #exclude the used rows and column indexes
for i0,j0,val0 in all_pairs_vals:
    if i0 in used_i: continue #if the current column index i0 has been used before, exclude current pair-value
    if j0 in used_j: continue #same if the current row was used before
    selected_alignments.append((i0,j0)) #otherwise, add the current pair to the final alignment point selection
    used_i.append(i0) #and include it in the used row and column indexes so that it will not be used again
    used_j.append(j0)

for a in all_pairs_vals: #list all pairs and indicate which ones were selected
    i0,j0,val0=a
    if (i0,j0) in selected_alignments: print(a, "<<<<")
    else: print(a)

(1, 0, 0.9962125727903038) <<<<
(7, 6, 0.9648596385013456) <<<<
(6, 6, 0.9522626140491213)
(4, 4, 0.8680160580389263) <<<<
(0, 0, 0.8677092558734509)
(0, 4, 0.8663006569524475)
(4, 1, 0.8509730846009201)
(6, 5, 0.8306270432051962) <<<<
(5, 0, 0.8041771265201703)
(3, 4, 0.7956470893500623)
(5, 4, 0.752378564790138)
(7, 4, 0.7481401748811771)
(6, 2, 0.7450512706576652)
(2, 0, 0.7414942196620594)
(7, 3, 0.7124178321428365)
(7, 2, 0.7079657751672261)
(6, 4, 0.6739798507241247)
(5, 6, 0.6523935558812793)
(1, 6, 0.6462660690119326)
(0, 3, 0.6320038347501294) <<<<
(0, 2, 0.604255523592303)
(5, 1, 0.5830849455568048) <<<<
(3, 6, 0.5740617101046116)
(2, 5, 0.5213839761670896)
(3, 3, 0.5164023958670594)
(3, 1, 0.49909581923621493)
(7, 1, 0.47734692872326345)
(2, 4, 0.4739435009247295)
(1, 3, 0.44521635125657866)
(2, 6, 0.42782603736814584)
(2, 1, 0.3920595587556249)
(6, 1, 0.38692400106912084)
(4, 5, 0.3842624629781046)
(1, 1, 0.3751778075420965)
(0, 5, 0.3574259135928455)
(6, 3, 0.3454520493261

## Indic lang. Corpora

In [69]:
pip install inltk




In [70]:
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()



In [71]:
pip install indic-nlp-library

Note: you may need to restart the kernel to use updated packages.


In [72]:
from indicnlp.tokenize import sentence_tokenize

indic_string="""ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।
दाँतों से आपका आत्मविश्‍वास भी बढ़ता है ।
हमारे मसूढ़ों और दाँतों के बीच बैक्टीरिया मौजूद होते हैं ।
ये दाँतों को गंदा और साँसों को बदबूदार बना देते हैं ।
यहाँ दिए कुछ आसान नुस्खों की मदद से आप अपने दाँतों को स्वच्छ और साँसों को ताजा रख सकते हैं ।
"""

# Split the sentence, language code "hi" is passed for hingi
sentences=sentence_tokenize.sentence_split(indic_string, lang='hi')

# print the sentences
for t in sentences:
    print(t)

ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।
दाँतों से आपका आत्मविश्‍वास भी बढ़ता है ।
हमारे मसूढ़ों और दाँतों के बीच बैक्टीरिया मौजूद होते हैं ।
ये दाँतों को गंदा और साँसों को बदबूदार बना देते हैं ।
यहाँ दिए कुछ आसान नुस्खों की मदद से आप अपने दाँतों को स्वच्छ और साँसों को ताजा रख सकते हैं ।


#  ^_^ Thank You
# Author: Subramanyam Sahoo