## VECTOR REPRESENTATION AND RULE-BASED MATCHING

In [18]:
#importing libraries
import spacy                                                 
from spacy.lang.en import English                            
nlp = spacy.load("en_core_web_sm")  
nlp1 = spacy.load("en_core_web_lg") 
import collections
from typing import Dict, List, Tuple     
from spacy.matcher import Matcher 

### TASK 1

In [2]:
def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    """Convert a token list into a sparse bag-of-words vector.

    Every token not yet present in ``dictionary`` is registered in place,
    using the dictionary's size at that moment as its id, so the caller's
    mapping grows as new words are seen.

    Args:
        words: Tokens of the text, in order.
        dictionary: Token -> integer id mapping; extended in place.

    Returns:
        ``(token_id, count)`` pairs, ordered by the first occurrence of each
        id in ``words``.
    """
    counts: Dict[int, int] = {}
    for token in words:
        # len(dictionary) is evaluated before setdefault inserts, so a new
        # token receives the current size as its id.
        token_id = dictionary.setdefault(token, len(dictionary))
        counts[token_id] = counts.get(token_id, 0) + 1
    return [(token_id, count) for token_id, count in counts.items()]

In [25]:
# Sample corpus: three short movie reviews concatenated into one string.
sample_text = "Review 1:This movie is very scary Review 2:This movie is not scary and is slow Review 3:This movie is spooky and good"
dictionary = {}  # starts empty; text2bow fills it with token -> id entries
print('Input Text:\n', sample_text)
# Whitespace tokenisation, so labels such as "1:This" stay single tokens.
bow_vector = text2bow(sample_text.split(), dictionary)
print('\nBOW Vector Representation: \n', bow_vector)
print('\nDictionary Values: \n', dictionary)

Input Text:
 Review 1:This movie is very scary Review 2:This movie is not scary and is slow Review 3:This movie is spooky and good

BOW Vector Representation: 
 [(0, 3), (1, 1), (2, 3), (3, 4), (4, 1), (5, 2), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1)]

Dictionary Values: 
 {'Review': 0, '1:This': 1, 'movie': 2, 'is': 3, 'very': 4, 'scary': 5, '2:This': 6, 'not': 7, 'and': 8, 'slow': 9, '3:This': 10, 'spooky': 11, 'good': 12}


In [32]:
# Two example entries read off the Task 1 output: each BOW pair is
# (token id, count) and the dictionary maps the token to that id.
print("Review:\nVector reprsentation:(0,3)\nDictionary value:   'Review':0")
# Id 5 belongs to the token 'scary' in the dictionary, not to 'Review'.
print("scary:\nVector reprsentation:(5,2)\nDictionary value:    'scary':5")

Review:
Vector reprsentation:(0,3)
Dictionary value:   'Review':0
scary:
Vector reprsentation:(5,2)
Dictionary value:    'Review':5


### TASK 2

In [6]:
# A) Token-based matching: a token whose lower-cased text is "hey",
#    immediately followed by one whose lower-cased text is "siri".
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hey"}, {"LOWER": "siri"}]
matcher.add("HeySiri", [pattern])
doc = nlp("Hey,Siri! Hey Siri!")
matches = matcher(doc)
# Each match is (match_id, start, end). Only the matched span is printed, so
# the unused match_id -> string lookup has been dropped.
for _match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Hey Siri


In [7]:
# B) Token-based matching: "hey", then exactly one punctuation token, then
#    "siri" (both words compared on their lower-cased text).
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hey"}, {"IS_PUNCT": True}, {"LOWER": "siri"}]
matcher.add("HeySiri", [pattern])
doc = nlp("Hey,Siri! Hey Siri!")
matches = matcher(doc)
# Each match is (match_id, start, end). Only the matched span is printed, so
# the unused match_id -> string lookup has been dropped.
for _match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Hey,Siri


### TASK 3

In [9]:
# For each token, report whether the large model (nlp1) has a vector for it
# and whether it is flagged as out-of-vocabulary.
doc = nlp1("apple orange pikkstn German")
for token in doc:
    print(f"Text= {token.text} , Vector= {token.has_vector} , OOV= {token.is_oov}")

Text= apple , Vector= True , OOV= False
Text= orange , Vector= True , OOV= False
Text= pikkstn , Vector= False , OOV= True
Text= German , Vector= True , OOV= False


In [10]:
# Hard-coded conclusion; matches the OOV flags printed for Task 3.
print("pikkstn is out of vocabulary")

pikkstn is out of vocabulary


### TASK 4

In [27]:
# Efficient phrase matching with case-insensitive patterns.
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
# attr="LOWER" compares lower-cased token text, so the pattern
# "ROTTEN mangoes" also matches "rotten mangoes".
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["ROTTEN mangoes", "sweet oranges"]
# make_doc only tokenises the term text into a Doc.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)
doc1 = nlp("Do not put rotten mangoes and sweet oranges together")
matches = matcher(doc1)
# Each match is (match_id, start, end); print the matched text only.
for _match_id, start, end in matches:
    print(doc1[start:end].text)

rotten mangoes
sweet oranges


### TASK 5

In [30]:
# Print the shape and values of every token's vector for a sample sentence.
# NOTE(review): `nlp` is en_core_web_sm here, which matches the recorded
# (96,) shape; confirm whether the large model (`nlp1`) was intended.
doc = nlp("I prefer the morning flight through Denmark")
for token in doc:
    vector = token.vector
    print('Total length of output vectors:', vector.shape)
    print('Word Vector Representation:\n', vector)

Total length of output vectors: (96,)
Word Vector Representation:
 [-0.06573781  0.34356946 -0.33065927  0.0828037   1.3537043   2.028841
 -0.45235258  0.19368112  1.2412268  -1.6775186   0.01488355  0.4813134
 -1.0602243  -1.1236367   1.2962728  -1.2600684   1.3997769   1.7463315
 -0.36026517  0.6132093   0.2727727  -0.8195218   0.13840258  0.4460678
 -0.77039903 -0.6711768   0.8262315  -0.4665807  -0.51960653 -0.6145005
  0.0505133  -0.51428145 -0.5847853  -0.6880101  -0.28734738 -0.9871149
 -1.1688107   0.5092508  -1.306536   -0.81702244 -0.08091132  0.42338413
 -0.11912183 -1.101825   -1.1702521  -0.61183274  0.9429907  -1.2650319
  0.60604    -1.3219539   1.0244133   1.7270942   1.3285427  -0.500371
  0.03123438  0.62746775  0.4628928  -0.49359262  1.1412562  -1.0653315
  0.13612919  0.440745    1.5435817  -0.45034686  0.24856904 -0.7066307
  1.0885928  -0.6219971   0.9163412  -0.677055    0.44630724  1.6822906
  1.4854448  -0.2115066  -0.38538712 -1.5729198  -0.85097414  0.136670

### TASK 6

In [17]:
# A: for each token of the sentence, report whether the large model (nlp1)
#    has a vector for it and whether it is out-of-vocabulary.
doc1 = nlp1("Do not put rotten mangoes and sweet oranges together")
for token in doc1:
    print(f"Text= {token.text} , Vector= {token.has_vector} , OOV= {token.is_oov}")

Text= Do , Vector= True , OOV= False
Text= not , Vector= True , OOV= False
Text= put , Vector= True , OOV= False
Text= rotten , Vector= True , OOV= False
Text= mangoes , Vector= True , OOV= False
Text= and , Vector= True , OOV= False
Text= sweet , Vector= True , OOV= False
Text= oranges , Vector= True , OOV= False
Text= together , Vector= True , OOV= False


In [20]:
# B and C: similarity for every ordered pair of tokens in doc1, including
# each token paired with itself.
token_pairs = ((left, right) for left in doc1 for right in doc1)
for token1, token2 in token_pairs:
    print(token1.text, token2.text, token1.similarity(token2))

Do Do 1.0
Do not 0.7205888032913208
Do put 0.6295328140258789
Do rotten 0.23457235097885132
Do mangoes 0.07004779577255249
Do and 0.40728652477264404
Do sweet 0.27546998858451843
Do oranges 0.184742733836174
Do together 0.450131356716156
not Do 0.7205888032913208
not not 1.0
not put 0.6083958745002747
not rotten 0.2806089520454407
not mangoes 0.10161229223012924
not and 0.5304263234138489
not sweet 0.3388059437274933
not oranges 0.17835381627082825
not together 0.4486272633075714
put Do 0.6295328140258789
put not 0.6083958745002747
put put 1.0
put rotten 0.31637996435165405
put mangoes 0.12202073633670807
put and 0.49793902039527893
put sweet 0.3838925361633301
put oranges 0.22029347717761993
put together 0.6148674488067627
rotten Do 0.23457235097885132
rotten not 0.2806089520454407
rotten put 0.31637996435165405
rotten rotten 1.0
rotten mangoes 0.3282413184642792
rotten and 0.20274914801120758
rotten sweet 0.35820987820625305
rotten oranges 0.3876388669013977
rotten together 0.2033028

In [28]:
# Hard-coded answers for parts A-C; presumably read off the Task 6 outputs.
print(
    'A. The words rotten and sweet are out of vocabulary:False',
    'B. Similarity values between mangoes and oranges is 0.7255',
    'C. Similarity values between sweet and oranges is 0.4652',
    sep='\n',
)

A. The words rotten and sweet are out of vocabulary:False
B. Similarity values between mangoes and oranges is 0.7255
C. Similarity values between sweet and oranges is 0.4652


In [21]:
#END