# Vector Representation and Rule-Based Matching

## Task 1

In [12]:
#!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     ------------------------------------ 777.4/777.4 MB 613.8 kB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [13]:
import spacy
from spacy.lang.en import English
nlp=spacy.load('en_core_web_lg')
import collections
from typing import Dict,List,Tuple

In [14]:
def text2bow(words:List[str],dictionary:Dict[str,int])->List[Tuple[int,int]]:
    """Convert a tokenised text into a sparse bag-of-words vector.

    Args:
        words: Tokens of one document, in reading order.
        dictionary: Mapping from word to integer id. It is updated in place:
            each word not yet present is assigned ``len(dictionary)`` as its
            id at the moment it is first seen.

    Returns:
        A list of ``(word_id, count)`` pairs, one per distinct word in
        ``words``, ordered by first occurrence. Empty input gives ``[]``.
    """
    word_frequencies=collections.Counter()
    for word in words:
        # len(dictionary) is evaluated before the insert, so a new word
        # receives the current size of the dictionary as its id.
        word_id=dictionary.setdefault(word,len(dictionary))
        word_frequencies[word_id]+=1
    return list(word_frequencies.items())

Task 1a)

In [15]:
# Three toy reviews. Each one is vectorised against its own, freshly created
# dictionary, so the word ids are only meaningful within a single review.
sample_text=[
    'Review 1: This movie is very scary and long',
    'Review 2: This movie is not scary and is slow',
    'Review 3: This movie is spooky and good',
]
for sample in sample_text:
    dictionary={}
    tokens=sample.split()  # whitespace tokenisation: "1:" stays one token
    bow_vector=text2bow(tokens,dictionary)
    print(sample,'\n',bow_vector)
    print('\n')

Review 1: This movie is very scary and long 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


Review 2: This movie is not scary and is slow 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)]


Review 3: This movie is spooky and good 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]




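Each pair is `(word id, count)` in that review's own dictionary:

- Statement 1: `Review` = (0, 1), `scary` = (6, 1)
- Statement 2: `Review` = (0, 1), `scary` = (6, 1)
- Statement 3: `Review` = (0, 1) (the word `scary` does not occur)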

Task 1b)

In [16]:
# Same three reviews as in Task 1a; here the word -> id dictionary that
# text2bow builds is printed instead of the bag-of-words vector.
sample_text=[
    'Review 1: This movie is very scary and long',
    'Review 2: This movie is not scary and is slow',
    'Review 3: This movie is spooky and good',
]
for sample in sample_text:
    dictionary=dict()
    # Called only for its side effect of filling `dictionary`;
    # the returned vector is not needed here.
    text2bow(sample.split(),dictionary)
    print(sample,'\n',dictionary)
    print('\n')

Review 1: This movie is very scary and long 
 {'Review': 0, '1:': 1, 'This': 2, 'movie': 3, 'is': 4, 'very': 5, 'scary': 6, 'and': 7, 'long': 8}


Review 2: This movie is not scary and is slow 
 {'Review': 0, '2:': 1, 'This': 2, 'movie': 3, 'is': 4, 'not': 5, 'scary': 6, 'and': 7, 'slow': 8}


Review 3: This movie is spooky and good 
 {'Review': 0, '3:': 1, 'This': 2, 'movie': 3, 'is': 4, 'spooky': 5, 'and': 6, 'good': 7}




## Task 2

In [17]:
from spacy.matcher import Matcher
# Rule-based matcher for the token "hey" immediately followed by the token
# "siri"; LOWER compares against the lower-cased token text.
matcher=Matcher(nlp.vocab)
pattern=[{'LOWER':'hey'},{'LOWER':'siri'}]
matcher.add('HeySiri',[pattern])
doc=nlp("Hey, Siri! Hey Siri!")
matches= matcher(doc)
# Each match is (match_id, start, end); start/end are token indices into doc.
# "Hey, Siri" has a comma token between the two words, so only the second
# occurrence fits this two-token pattern.
for match_id,start,end in matches:
    span=doc[start:end]
    print(span.text)

Hey Siri


In [18]:
# Same idea as the previous matcher, but the pattern now requires exactly one
# punctuation token between "hey" and "siri".
matcher=Matcher(nlp.vocab)
pattern=[{'LOWER':'hey'},{'IS_PUNCT':True},{'LOWER':'siri'}]
matcher.add('HeySiri',[pattern])
doc=nlp("Hey, Siri! Hey Siri!")
matches= matcher(doc)
# Each match is (match_id, start, end); start/end are token indices into doc.
# "Hey Siri" without punctuation in between no longer matches.
for match_id,start,end in matches:
    span=doc[start:end]
    print(span.text)

Hey, Siri


## Task 3

In [19]:
# Reuses the `nlp` pipeline loaded in the imports cell; loading
# en_core_web_lg a second time only costs time and memory.
doc=nlp('apple orange pikkstn german')
# For every token, report whether it has a word vector and whether spaCy
# flags it as out-of-vocabulary (OOV).
for token in doc:
    print('Text=',token.text,',Vector=',token.has_vector,',OOV=',token.is_oov)

Text= apple ,Vector= True ,OOV= False
Text= orange ,Vector= True ,OOV= False
Text= pikkstn ,Vector= False ,OOV= True
Text= german ,Vector= True ,OOV= False


All tokens except `pikkstn` are part of the vocabulary: `pikkstn` has no vector and is flagged as out-of-vocabulary (OOV).

## Task 4

In [20]:
from spacy.matcher import PhraseMatcher
# attr="LOWER" makes the phrase comparison case-insensitive, which is why the
# pattern "ROTTEN mangoes" still matches "rotten mangoes" in the text.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
phrases = ["ROTTEN mangoes", "sweet oranges"]
# make_doc only tokenises; the rest of the pipeline is not run on the patterns.
patterns = list(map(nlp.make_doc, phrases))
matcher.add("Fruit", patterns)
doc = nlp("Do not put rotten mangoes and sweet oranges together.")
for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print(span)

rotten mangoes
sweet oranges


## Task 5

In [21]:
doc = nlp('I prefer the morning flight through Denmark')
# Full n x n table: every token is compared with every token, itself included,
# in row-major order (outer token first).
token_pairs = [(left, right) for left in doc for right in doc]
for left, right in token_pairs:
    score = left.similarity(right)
    print(left.text, right.text, score)

I I 1.0
I prefer 0.41702595353126526
I the 0.3116089701652527
I morning 0.3523589074611664
I flight 0.19887085258960724
I through 0.28335073590278625
I Denmark 0.29523220658302307
prefer I 0.41702595353126526
prefer prefer 1.0
prefer the 0.3199503719806671
prefer morning 0.22616012394428253
prefer flight 0.15512840449810028
prefer through 0.23834623396396637
prefer Denmark 0.06000184267759323
the I 0.3116089701652527
the prefer 0.3199503719806671
the the 1.0
the morning 0.3890417218208313
the flight 0.25840193033218384
the through 0.6090923547744751
the Denmark -0.028872722759842873
morning I 0.3523589074611664
morning prefer 0.22616012394428253
morning the 0.3890417218208313
morning morning 1.0
morning flight 0.36576882004737854
morning through 0.3871423304080963
morning Denmark 0.002022282686084509
flight I 0.19887085258960724
flight prefer 0.15512840449810028
flight the 0.25840193033218384
flight morning 0.36576882004737854
flight flight 1.0
flight through 0.2913546860218048
flight 

## Task 6

In [22]:
doc = nlp('Do not put rotten mangoes and sweet oranges together.')
# Print the similarity of every ordered token pair in the sentence
# (punctuation and identical pairs included).
for row_token in doc:
    for col_token in doc:
        similarity = row_token.similarity(col_token)
        print(row_token.text, col_token.text, similarity)

Do Do 1.0
Do not 0.7205888032913208
Do put 0.6295328140258789
Do rotten 0.23457235097885132
Do mangoes 0.07004779577255249
Do and 0.40728652477264404
Do sweet 0.27546998858451843
Do oranges 0.184742733836174
Do together 0.450131356716156
Do . 0.3056337237358093
not Do 0.7205888032913208
not not 1.0
not put 0.6083958745002747
not rotten 0.2806089520454407
not mangoes 0.10161229223012924
not and 0.5304263234138489
not sweet 0.3388059437274933
not oranges 0.17835381627082825
not together 0.4486272633075714
not . 0.4248487055301666
put Do 0.6295328140258789
put not 0.6083958745002747
put put 1.0
put rotten 0.31637996435165405
put mangoes 0.12202073633670807
put and 0.49793902039527893
put sweet 0.3838925361633301
put oranges 0.22029347717761993
put together 0.6148674488067627
put . 0.390465646982193
rotten Do 0.23457235097885132
rotten not 0.2806089520454407
rotten put 0.31637996435165405
rotten rotten 1.0
rotten mangoes 0.3282413184642792
rotten and 0.20274914801120758
rotten sweet 0.3582

Task 6 a)

In [24]:
# Report, for each token of the current `doc`, whether it has a word vector
# and whether spaCy flags it as out-of-vocabulary (OOV).
for token in doc:
    has_vector = token.has_vector
    is_oov = token.is_oov
    print('Text=',token.text,',Vector=',has_vector,',OOV=',is_oov)

Text= Do ,Vector= True ,OOV= False
Text= not ,Vector= True ,OOV= False
Text= put ,Vector= True ,OOV= False
Text= rotten ,Vector= True ,OOV= False
Text= mangoes ,Vector= True ,OOV= False
Text= and ,Vector= True ,OOV= False
Text= sweet ,Vector= True ,OOV= False
Text= oranges ,Vector= True ,OOV= False
Text= together ,Vector= True ,OOV= False
Text= . ,Vector= True ,OOV= False


Task 6 b)

In [25]:
# Similarity between every "mangoes" token and every "oranges" token in `doc`.
mango_tokens = [tok for tok in doc if tok.text=='mangoes']
orange_tokens = [tok for tok in doc if tok.text=='oranges']
for token1 in mango_tokens:
    for token2 in orange_tokens:
        print(token1.text, token2.text, token1.similarity(token2))

mangoes oranges 0.7255765795707703


Task 6 c)

In [26]:
# Similarity between every "sweet" token and every "oranges" token in `doc`.
sweet_tokens = [tok for tok in doc if tok.text=='sweet']
orange_tokens = [tok for tok in doc if tok.text=='oranges']
for token1 in sweet_tokens:
    for token2 in orange_tokens:
        print(token1.text, token2.text, token1.similarity(token2))

sweet oranges 0.4652591049671173
