In [4]:
import spacy                                                  # spaCy NLP library
from spacy.lang.en import English                             # English language class (not referenced in the visible cells)
nlp = spacy.load("en_core_web_sm")                            # Load the small English pipeline; later cells rebind `nlp`
import collections                                            # Provides defaultdict, used by text2bow
from typing import Dict, List, Tuple                          # Generic type hints used in the text2bow signature

### TASK - 1

In [18]:
#REVIEW 1

def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    """Convert a token list into a sparse bag-of-words representation.

    Args:
        words: Tokens to count, in document order.
        dictionary: Token -> integer id mapping. It is updated in place:
            every token not yet present is assigned ``len(dictionary)``
            (the size at the moment it is first seen) as its id.

    Returns:
        A list of ``(token_id, count)`` pairs, ordered by the first
        occurrence of each id in ``words``.
    """
    # setdefault evaluates len(dictionary) before inserting, so an unseen
    # token receives the next free id while a known token keeps its id.
    token_ids = (dictionary.setdefault(word, len(dictionary)) for word in words)
    # Counter preserves first-seen insertion order, like a manual tally.
    return list(collections.Counter(token_ids).items())
# Review text to encode. The "Review 1 :" label is split on whitespace and
# counted together with the words of the sentence.
sample_text = 'Review 1 : This movie is very scary and long'
# Start from an empty vocabulary; text2bow fills it in place.
# NOTE(review): every review cell starts from its own empty dictionary, so the
# token ids printed here are not comparable with those of the other reviews.
dictionary = dict()
bow = text2bow(sample_text.split(), dictionary)
print('\nBOW Representation: \n', bow)              # (token_id, count) pairs
print('Input Text:\n', sample_text)                 # the raw review
print('\nDictionary: \n', dictionary)               # token -> id mapping built above


BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Input Text:
 Review 1 : This movie is very scary and long

Dictionary: 
 {'Review': 0, '1': 1, ':': 2, 'This': 3, 'movie': 4, 'is': 5, 'very': 6, 'scary': 7, 'and': 8, 'long': 9}


In [17]:
#REVIEW 2

def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    """Count tokens and return them as ``(token_id, count)`` pairs.

    Args:
        words: Tokens to count, in document order.
        dictionary: Token -> integer id mapping, extended in place. A token
            that is not yet a key gets the current ``len(dictionary)`` as id.

    Returns:
        ``(token_id, count)`` pairs in order of each id's first appearance.
    """
    counts_by_id: Dict[int, int] = {}
    for token in words:
        if token in dictionary:
            token_id = dictionary[token]
        else:
            # The right-hand side is evaluated first, so the new id is the
            # vocabulary size before this token is added.
            token_id = dictionary[token] = len(dictionary)
        counts_by_id[token_id] = counts_by_id.get(token_id, 0) + 1
    return list(counts_by_id.items())
# Review text to encode; "is" occurs twice, so its id gets a count of 2.
sample_text = 'Review 2 : This movie is not scary and is slow'
# Start from an empty vocabulary; text2bow fills it in place.
# NOTE(review): every review cell starts from its own empty dictionary, so the
# token ids printed here are not comparable with those of the other reviews.
dictionary = dict()
bow = text2bow(sample_text.split(), dictionary)
print('\nBOW Representation: \n', bow)              # (token_id, count) pairs
print('Input Text:\n', sample_text)                 # the raw review
print('\nDictionary: \n', dictionary)               # token -> id mapping built above


BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1)]
Input Text:
 Review 2 : This movie is not scary and is slow

Dictionary: 
 {'Review': 0, '2': 1, ':': 2, 'This': 3, 'movie': 4, 'is': 5, 'not': 6, 'scary': 7, 'and': 8, 'slow': 9}


In [15]:
#REVIEW 3

def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    """Encode tokens as ids, then tally how often each id occurs.

    Args:
        words: Tokens to count, in document order.
        dictionary: Token -> integer id mapping; mutated in place so that
            each previously unseen token is given ``len(dictionary)`` as id.

    Returns:
        A list of ``(token_id, count)`` tuples, in first-seen order.
    """
    # Pass 1: translate every token to its id, growing the vocabulary as needed.
    encoded: List[int] = []
    for word in words:
        if word not in dictionary:
            dictionary[word] = len(dictionary)
        encoded.append(dictionary[word])

    # Pass 2: count the ids; dict insertion order keeps first-seen ordering.
    tally = collections.defaultdict(int)
    for token_id in encoded:
        tally[token_id] += 1
    return list(tally.items())
# Review text to encode. The "Review 3 :" label is split on whitespace and
# counted together with the words of the sentence.
sample_text = 'Review 3 : This movie is spooky and good'
# Start from an empty vocabulary; text2bow fills it in place.
# NOTE(review): every review cell starts from its own empty dictionary, so the
# token ids printed here are not comparable with those of the other reviews.
dictionary = dict()
bow = text2bow(sample_text.split(), dictionary)
print('\nBOW Representation: \n', bow)              # (token_id, count) pairs
print('Input Text:\n', sample_text)                 # the raw review
print('\nDictionary: \n', dictionary)               # token -> id mapping built above


BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
Input Text:
 Review 3 : This movie is spooky and good

Dictionary: 
 {'Review': 0, '3': 1, ':': 2, 'This': 3, 'movie': 4, 'is': 5, 'spooky': 6, 'and': 7, 'good': 8}


In [27]:
# Run the currently loaded pipeline on a single word.
doc = nlp('Review')
# NOTE(review): the recorded output shows a 96-dimensional vector, so the small
# pipeline was presumably active when this ran - confirm against the spaCy
# docs whether this is a static word vector or a context tensor.
word_vector = doc.vector
print('Vector Length:\n', word_vector.shape)                 # shape tuple, i.e. the dimensionality
print('Word Vector Representation:\n', word_vector)          # the raw vector values

Vector Length:
 (96,)
Word Vector Representation:
 [ 0.08444607  0.12996519  0.25090355 -0.04837775  1.1570647   1.0232297
  0.51766723  0.63749    -0.21312581 -0.2526669  -0.72337085 -0.11484593
  2.056591   -0.43503147  0.0512511  -0.7099442  -0.35425502 -0.8625312
  0.9220847   0.08002502 -0.22490364 -0.0218206  -0.45728493 -0.85805416
  0.17037234 -0.01779819 -0.18357205 -1.2258139  -0.32962835 -0.8643147
 -0.42144     0.5384248   0.04062448  0.50345683 -0.17407241 -2.0512395
 -0.48872298  1.2492242   0.53873146  1.6154748  -0.6652746  -0.76472753
  0.5348292   0.09983801 -1.4636991  -0.14339338 -0.15525445  0.29389852
  1.1805356  -0.11594096  0.19914103 -0.20053369 -0.03914784 -0.51784486
 -0.06851092 -0.40133905 -0.3348826  -0.21894021 -0.59389377  0.11161892
  0.02002376  2.1144261   1.060673   -1.3011045   0.5552813  -1.0625291
 -0.6888257  -0.40453744  0.22568427 -0.35341474 -0.9439845   0.19931321
 -1.2395508  -0.34215322  0.66172147 -0.6072136   0.27366182  1.2309372
  0.23

In [29]:
# Run the currently loaded pipeline on a single word.
doc = nlp('scary')
# NOTE(review): the recorded output shows a 96-dimensional vector, so the small
# pipeline was presumably active when this ran - confirm against the spaCy
# docs whether this is a static word vector or a context tensor.
word_vector = doc.vector
print('Vector Length:\n', word_vector.shape)                 # shape tuple, i.e. the dimensionality
print('Word Vector Representation:\n', word_vector)          # the raw vector values

Vector Length:
 (96,)
Word Vector Representation:
 [-1.230675    1.3622421   0.41508293  0.23442823 -0.39748314 -0.49457446
 -0.13032854  0.75998557 -0.21790811 -0.05159    -0.6336853  -0.17683047
 -0.03390747  1.0648445   0.45245054 -1.7207558   0.64101887 -0.70688057
  1.443763    0.18873107  0.20001148 -0.6011127  -0.9655781   0.00813657
 -0.6132463  -1.4529294  -0.1304617   1.1534377  -0.5185296  -0.23840518
  0.41837204 -1.3561993   0.13022666  1.0625572  -0.16988535 -0.8372294
  0.2245235   0.81401145  0.29627255  1.2650975  -0.93742204 -0.9593301
  0.531508   -0.04786337 -0.28503162 -0.44319257 -0.14379951 -0.1968005
  1.2327617   1.5501475   0.048411    0.02398767 -1.095253    0.9863709
  0.9576199   0.27519786 -0.2514766  -0.7966854   0.0528297   0.388033
 -0.791499   -0.21451102  0.2082287  -0.02687192 -0.19569847 -0.73481524
 -0.2052333  -0.30613923 -0.03286523 -0.5631656   0.1688798  -0.4884945
 -0.06226408  0.0807623   1.3848456   0.14010814 -0.26180255  1.5808759
  0.0857

### TASK - 2

In [46]:
# Pattern 1

from spacy.matcher import Matcher                                             # Rule-based token matcher
nlp = spacy.load("en_core_web_sm")                                            # Load model
matcher = Matcher(nlp.vocab)
# Two directly adjacent tokens whose lowercase forms are "hey" and "siri".
# No token may sit in between, so "Hey, Siri" (comma token in the middle)
# is not matched; only "Hey siri" is.
pattern = [{"LOWER": "hey"}, {"LOWER": "siri"}]
matcher.add("HeySiri", [pattern])                                             # Register the pattern under the key "HeySiri"
doc = nlp("Hey, Siri! Hey siri!")                                             # Input text
matches = matcher(doc)
# Each match is a (match_id, start, end) triple; start/end are token indices.
for match_id, start, end in matches:
    span = doc[start:end]                                                     # The matched span
    print(span.text)                                                          # print output

Hey siri


In [50]:
# Pattern 2

from spacy.matcher import Matcher                                             # Rule-based token matcher
nlp = spacy.load("en_core_web_sm")                                            # Load model
matcher = Matcher(nlp.vocab)
# "hey", then exactly one punctuation token, then "siri" (case-insensitive).
# The punctuation token is mandatory, so "Hey, Siri" matches but "Hey Siri"
# does not. Per the spaCy Matcher docs an {"OP": "?"} entry would make it
# optional if both forms should match.
pattern = [{"LOWER": "hey"}, {"IS_PUNCT": True}, {"LOWER": "siri"}]
matcher.add("Heysiri", [pattern])                                             # Register the pattern under the key "Heysiri"
doc = nlp("Hey, Siri! Hey Siri!")                                             # Input text
matches = matcher(doc)
# Each match is a (match_id, start, end) triple; start/end are token indices.
for match_id, start, end in matches:
    span = doc[start:end]                                                     # The matched span
    print(span.text)                                                          # print output

Hey, Siri


In [11]:
### Download the larger English spaCy models.
### en_core_web_lg is loaded by later cells; en_core_web_md is not loaded in the visible cells.
### NOTE(review): the recorded output shows the first lg download aborting part-way
### (221.3/777.4 MB) with a traceback, followed by a successful retry.
### NOTE(review): `!python` presumably resolves to the kernel's interpreter - confirm.
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_md

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     ----------                           221.3/777.4 MB 327.3 kB/s eta 0:28:19


ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\Manish\anaconda3\lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\Manish\anaconda3\lib\site-packages\pip\_vendor\urllib3\response.py", line 519, in read
    data = self._fp.read(amt) if not fp_closed else b""
  File "C:\Users\Manish\anaconda3\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
    data = self.__fp.read(amt)
  File "C:\Users\Manish\anaconda3\lib\http\client.py", line 462, in read
    n = self.readinto(b)
  File "C:\Users\Manish\anaconda3\lib\http\client.py", line 506, in readinto
    n = self.fp.readinto(b)
  File "C:\Users\Manish\anaconda3\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\Manish\anaconda3\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Users\Manish\anaconda3\lib\ssl.py", line 1099, in read
    return self._sslobj.read(len, buf

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     ------------------------------------ 777.4/777.4 MB 480.1 kB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
     ---------------------------------------- 45.7/45.7 MB 1.7 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


### TASK - 3

In [16]:

import spacy
# Switch to the large English pipeline for the vocabulary check.
nlp = spacy.load("en_core_web_lg")
# Three ordinary words plus the made-up token "pikkstn".
doc = nlp("apple orange pikkstn German")
# Report, per token, whether it has a vector and whether it is out of vocabulary.
for token in doc:
    print(
        'Text=', token.text,
        ', Vector=', token.has_vector,
        ', OOV=', token.is_oov,
    )

Text= apple , Vector= True , OOV= False
Text= orange , Vector= True , OOV= False
Text= pikkstn , Vector= False , OOV= True
Text= German , Vector= True , OOV= False


### TASK - 4

In [15]:
from spacy.matcher import PhraseMatcher
# attr="LOWER" compares lowercase token text, so the pattern "ROTTEN mangoes"
# also matches "rotten mangoes" in the input.
# Uses whichever `nlp` pipeline an earlier cell loaded.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# make_doc only tokenises the phrase.
patterns = list(map(nlp.make_doc, ["ROTTEN mangoes", "sweet oranges"]))
matcher.add("FRUIT", patterns)
doc = nlp("Do not put rotten mangoes and sweet oranges together.")
matches = matcher(doc)
for match_id, start, end in matches:
    print("Matched based on lowercase token text:", doc[start:end])

Matched based on lowercase token text: rotten mangoes
Matched based on lowercase token text: sweet oranges


### TASK - 5

In [12]:
# Run the currently loaded pipeline on a whole sentence (not a single word).
doc1 = nlp('I prefer the morning flight through Denmark')
# NOTE(review): per the spaCy docs, Doc.vector defaults to the average of the
# token vectors - confirm. The recorded output shows shape (300,).
sentence_vector = doc1.vector
print('Vector Length:\n', sentence_vector.shape)                    # shape tuple, i.e. the dimensionality
print('Word Vector Representation:\n', sentence_vector)             # the raw vector values

Vector Length:
 (300,)
Word Vector Representation:
 [ 2.24614009e-01  6.87758625e-02 -1.73480585e-01  1.65805761e-02
  1.34624004e-01  5.45591712e-02  2.19285907e-03 -3.95957045e-02
  6.82792813e-02  1.97639692e+00 -4.61678565e-01 -2.92714275e-02
  1.66712284e-01  3.05328630e-02 -1.20274566e-01 -7.90856779e-03
 -2.90945709e-01  1.11722434e+00  6.90842792e-02 -3.84949967e-02
  1.04422286e-01  1.60053283e-01 -8.62417221e-02 -1.08257428e-01
 -1.23409428e-01 -8.73568282e-02 -2.03284860e-01 -2.21845388e-01
  1.08609453e-01  1.16122566e-01 -1.37289673e-01  3.65837142e-02
 -5.93178980e-02  2.00193852e-01 -5.70568405e-02  2.11971134e-01
  3.52973565e-02  3.17453407e-02 -1.05713382e-01 -1.46389633e-01
  1.32286716e-02 -7.17225596e-02 -8.99554342e-02  3.06991637e-02
  1.77117258e-01  1.83081269e-01 -1.73389629e-01 -4.74941507e-02
  1.44464582e-01  4.47510071e-02 -1.53409272e-01 -9.95019898e-02
 -1.11263432e-01  9.11899880e-02 -4.40797508e-02  3.18064317e-02
 -9.49835777e-02 -7.99477175e-02  6.82

Total vector length: 300

### TASK - 6

In [13]:
# Load the large English pipeline for the vocabulary check.
nlp = spacy.load("en_core_web_lg")
# Sentence whose tokens are checked (and compared pairwise in the next cell).
doc = nlp("Do not put rotten mangoes and sweet oranges together.")
# Report, per token, whether it has a vector and whether it is out of vocabulary.
for token in doc:
    print(
        'Text=', token.text,
        ', Vector=', token.has_vector,
        ', OOV=', token.is_oov,
    )

Text= Do , Vector= True , OOV= False
Text= not , Vector= True , OOV= False
Text= put , Vector= True , OOV= False
Text= rotten , Vector= True , OOV= False
Text= mangoes , Vector= True , OOV= False
Text= and , Vector= True , OOV= False
Text= sweet , Vector= True , OOV= False
Text= oranges , Vector= True , OOV= False
Text= together , Vector= True , OOV= False
Text= . , Vector= True , OOV= False


#### A) False, because the words "rotten" and "sweet" are not out of vocabulary (`is_oov` is False for both).

In [14]:
# Compare every token of `doc` with every token of `doc`. Self-pairs and both
# orderings of each pair are included, so n tokens produce n*n output lines.
for token1 in doc:
    for token2 in doc:
        similarity = token1.similarity(token2)
        print(token1.text, token2.text, similarity)

Do Do 1.0
Do not 0.7205888032913208
Do put 0.6295328140258789
Do rotten 0.23457235097885132
Do mangoes 0.07004779577255249
Do and 0.40728649497032166
Do sweet 0.27546998858451843
Do oranges 0.184742733836174
Do together 0.4501313865184784
Do . 0.3056337535381317
not Do 0.7205888032913208
not not 1.0
not put 0.6083958148956299
not rotten 0.2806089222431183
not mangoes 0.10161228477954865
not and 0.5304263830184937
not sweet 0.3388059437274933
not oranges 0.17835381627082825
not together 0.448627233505249
not . 0.4248487055301666
put Do 0.6295328140258789
put not 0.6083958148956299
put put 1.0
put rotten 0.3163799047470093
put mangoes 0.12202072143554688
put and 0.49793902039527893
put sweet 0.3838925063610077
put oranges 0.22029344737529755
put together 0.6148674488067627
put . 0.390465646982193
rotten Do 0.23457235097885132
rotten not 0.2806089222431183
rotten put 0.3163799047470093
rotten rotten 1.0
rotten mangoes 0.3282413184642792
rotten and 0.2027491182088852
rotten sweet 0.3582098

#### B) Similarity values between: mangoes oranges 0.7255765795707703

#### C) Similarity values between: sweet oranges 0.4652591049671173