In [1]:
string = '''A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points. Paragraphs can contain many different kinds of information. A paragraph could contain a series of brief examples or a single long illustration of a general point. It might describe a place, character, or process; narrate a series of events; compare or contrast two or more things; classify items into categories; or describe causes and effects. Regardless of the kind of information they contain, all paragraphs share certain characteristics. One of the most important of these is a topic sentence.'''

In [2]:
string

'A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points. Paragraphs can contain many different kinds of information. A paragraph could contain a series of brief examples or a single long illustration of a general point. It might describe a place, character, or process; narrate a series of events; compare or contrast two or more things; classify items into categories; or describe causes and effects. Regardless of the kind of information they contain, all paragraphs share certain characteristics. One of the most important of these is a topic sentence.'

In [3]:
import numpy as np
import nltk
import torch
import transformers
import spacy
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
from spacy import displacy
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

In [4]:
# Convert paragraph into list of sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(string.strip())
sentences = [sentence for sentence in raw_sentences if len(sentence) > 0]
print('Sentences :')
print(sentences, end='\n\n\n')

Sentences :
['A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic.', 'Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs.', 'This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points.', 'Paragraphs can contain many different kinds of information.', 'A paragraph could contain a series of brief examples or a single long illustration of a general point.', 'It might describe a place, character, or process; narrate a series of events; compare or contrast two or more things; classify items into categories; or describe causes and effects.', 'Regardless of the kind of information they contain, all paragraphs share certain characteristics.', 'One of the most important of these is a topic sentence.']




In [5]:
# 1.i) Word2Vec

wordvecs = [nltk.word_tokenize(sentence) for sentence in sentences]
stop_words = list(set(stopwords.words("english")))

for sentence in wordvecs:
    for word in sentence:
        if word in stop_words:
            sentence.remove(word)

model = Word2Vec(wordvecs, min_count=1)

# Vector for word 'sentence'
print('Vector for word "sentence" : ')
print(model.wv['sentence'])

Vector for word "sentence" : 
[ 5.1207771e-03 -4.4107656e-03 -9.2051858e-03 -9.0583870e-03
  6.2181242e-03 -5.2418937e-03  5.2688741e-03 -1.5554887e-03
  9.6765831e-03  7.1525103e-03  9.8813595e-03  8.3572073e-03
 -7.8599807e-03 -9.5791658e-03 -5.5657760e-03 -8.1351801e-04
  5.0154519e-03  3.6702869e-03 -2.0397545e-03  5.0118039e-03
 -3.3978079e-03 -6.7980345e-03  4.7297091e-03 -9.8678144e-03
  8.8420842e-04 -5.0069601e-03  7.5918878e-03 -3.7404767e-03
  2.8073478e-03 -6.2389341e-03  9.6416324e-03 -5.5988207e-03
  3.6009781e-03  3.2136077e-03  8.9124087e-03 -5.5417670e-03
 -4.3727458e-03 -6.2086955e-03  6.1387308e-03 -4.5998693e-03
  4.8208917e-03 -8.9610508e-04  7.2694193e-03 -6.0103405e-03
 -7.9307887e-05 -1.0296506e-04 -8.5515967e-03  8.7202508e-03
 -2.0959489e-03  6.4399657e-03  9.4515606e-05  5.0101718e-03
  2.2122220e-03  8.8246176e-03 -8.3458591e-03  8.4297704e-03
  8.6530289e-03  1.0469728e-03  4.4118296e-03  3.8222342e-03
  4.0765288e-03 -2.9526122e-03  2.8707823e-03  7.756452

In [6]:
# 1.ii) USE

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(sentences)
print(embeddings, '\n\n')
for i in range(2):
    print('Sentence :')
    print(sentences[i])
    print('Converted to :')
    print(embeddings[i], '\n\n')

tf.Tensor(
[[ 0.01168494 -0.03060572  0.06116334 ... -0.08641756  0.00025049
   0.05482749]
 [ 0.02972509 -0.03655469  0.08002593 ... -0.07038907 -0.02832131
   0.04804677]
 [ 0.07221662 -0.04182237  0.05336685 ... -0.06942353  0.01795928
   0.06641504]
 ...
 [ 0.01586944 -0.05243037  0.06065089 ... -0.0643559   0.04215747
   0.06304203]
 [ 0.04141244  0.02588909 -0.00625631 ... -0.02162989  0.00910817
   0.03623574]
 [ 0.01578411 -0.02142679  0.00402448 ... -0.09605587 -0.06707881
   0.07970382]], shape=(8, 512), dtype=float32) 


Sentence :
A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic.
Converted to :
tf.Tensor(
[ 1.16849439e-02 -3.06057241e-02  6.11633360e-02  8.47723708e-02
 -5.83404116e-03  2.84160231e-03  2.59479377e-02  3.90260434e-03
 -5.55586144e-02  5.68111017e-02 -8.95013753e-03 -4.50469833e-03
 -6.06310330e-02  3.18566412e-02 -6.86047673e-02 -9.39451456e-02
 -4.23613563e-02  3.93057056e-02 -9.02280435e-02 -5.5366

In [None]:
# 1. iii) ELMO

# elmo=hub.Module("https://tfhub.dev/google/elmo/3",trainable=True)
# embeddings=elmo(sentences, signature="default", as_dict=True)["elmo"]

# init=tf.initialize_all_variables()
# sess=tf.Session()
# sess.run(init)

# print(sess.run(embeddings[0]))


# Execution Issues

In [7]:
# 1.iv) GPT2

gp2tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2-large')
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2-large')
res_vectors = gp2tokenizer.encode(string, add_special_tokens=False, return_tensors="pt")
print("shape=", res_vectors.shape)
print(res_vectors)

shape= torch.Size([1, 171])
tensor([[   32,  7322,   318,   257,  2168,   286, 13439,   326,   389,  8389,
           290, 24870,    11,   290,   389,   477,  3519,   284,   257,  2060,
          7243,    13, 16699,   790,  3704,   286,  3597,   345,   466,   326,
           318,  2392,   621,   257,  1178, 13439,   815,   307,  8389,   656,
         23549,    13,   770,   318,   780, 23549,   905,   257,  9173,   810,
           262, 45944,  3279,   286,   281, 14268,  2221,   290,   886,    11,
           290,  4145,  1037,   262,  9173,   766,   262,  4009,   286,   262,
         14268,   290, 13180,   663,  1388,  2173,    13,  2547,  6111,    82,
           460,  3994,   867,  1180,  6982,   286,  1321,    13,   317,  7322,
           714,  3994,   257,  2168,   286,  4506,  6096,   393,   257,  2060,
           890, 20936,   286,   257,  2276,   966,    13,   632,  1244,  6901,
           257,  1295,    11,  2095,    11,   393,  1429,    26,  6664,   378,
           257,  2168,  

In [8]:
# 1.v) Sentence-BERT

bert = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim128/2")
embeddings = bert(sentences)
print(embeddings)
print("shape=",embeddings[0].shape)
print("The sentence in the paragraph:\n",sentences[0],"\nis converted into vector as :\n", embeddings[0])





tf.Tensor(
[[ 0.58439595  0.03570241  0.07089429 ...  0.16468506  0.01362591
  -0.17049454]
 [ 0.5674903  -0.03022472  0.14544438 ...  0.08465072  0.04284173
   0.03154207]
 [ 0.83034104  0.16388969 -0.03244966 ... -0.20251456  0.12385168
   0.08940062]
 ...
 [ 0.4393374   0.02327457  0.12449443 ...  0.05474888 -0.09519409
   0.00303834]
 [ 0.2909918   0.06627773  0.08403497 ... -0.10017543 -0.11646989
   0.00867226]
 [ 0.5081561  -0.04534546  0.09999924 ...  0.06375446 -0.00892024
  -0.03128232]], shape=(8, 128), dtype=float32)
shape= (128,)
The sentence in the paragraph:
 A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
is converted into vector as :
 tf.Tensor(
[ 0.58439595  0.03570241  0.07089429  0.07733776 -0.01214658 -0.12435579
 -0.07824828 -0.00274544 -0.17964575  0.21627969  0.03844824 -0.19277166
 -0.12646586  0.02667335 -0.13363229 -0.00374018 -0.06618838  0.00353754
 -0.21084203  0.18731229  0.06417363  0.03025784

In [9]:
# 2) Named Entity Recognition

nlp = spacy.load("en_core_web_sm")
res = nlp(string)

for word in res.ents:
    print(word.text,word.label_)
    
print(spacy.explain('GPE'))

print(displacy.render(res,style="ent",jupyter=True))

two CARDINAL
One CARDINAL
Countries, cities, states


None


In [10]:
# 3) Find similar sentences (repeated sentences) from the above paragraph? (Cosine Similarity, use BERT to encode)

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

se_embeddings = sbert_model.encode(sentences)
q1_vec= sbert_model.encode(sentences[0])

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

for sent in sentences:
    sim = cosine(q1_vec, sbert_model.encode([sent])[0])
    # similarity == 1 - repeated sentence
    # similarity > 0.5 - similar sentence
    if sim>0.5:
        print("Sentence1 =",sentences[0],"\n \nSentence2=", sent, "\n\nsimilarity = ", sim,end="\n ----------------------------- \n")

Sentence1 = A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
 
Sentence2= A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 

similarity =  1.0000001
 ----------------------------- 
Sentence1 = A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
 
Sentence2= Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. 

similarity =  0.6477538
 ----------------------------- 
Sentence1 = A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
 
Sentence2= This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points. 

similarity =  0.5238008
 ----------------------------- 
Sentence1 = A paragraph i

In [11]:
# 4) POS Tagging for Above Given Paragraph
    
tokenized = sent_tokenize(string)
for i in tokenized:

    wordList = nltk.word_tokenize(i)
    wordList = [word for word in wordList if word not in stop_words]

    tagged = nltk.pos_tag(wordList)
 
    print(tagged)

[('A', 'DT'), ('paragraph', 'NN'), ('series', 'NN'), ('sentences', 'NNS'), ('organized', 'VBN'), ('coherent', 'NN'), (',', ','), ('related', 'VBN'), ('single', 'JJ'), ('topic', 'NN'), ('.', '.')]
[('Almost', 'RB'), ('every', 'DT'), ('piece', 'NN'), ('writing', 'VBG'), ('longer', 'JJR'), ('sentences', 'NNS'), ('organized', 'VBN'), ('paragraphs', 'NN'), ('.', '.')]
[('This', 'DT'), ('paragraphs', 'NN'), ('show', 'NN'), ('reader', 'NN'), ('subdivisions', 'NNS'), ('essay', 'VBP'), ('begin', 'JJ'), ('end', 'NN'), (',', ','), ('thus', 'RB'), ('help', 'NN'), ('reader', 'VB'), ('see', 'VB'), ('organization', 'NN'), ('essay', 'VB'), ('grasp', 'NN'), ('main', 'JJ'), ('points', 'NNS'), ('.', '.')]
[('Paragraphs', 'NNP'), ('contain', 'VBP'), ('many', 'JJ'), ('different', 'JJ'), ('kinds', 'NNS'), ('information', 'NN'), ('.', '.')]
[('A', 'DT'), ('paragraph', 'NN'), ('could', 'MD'), ('contain', 'VB'), ('series', 'NN'), ('brief', 'NN'), ('examples', 'VBZ'), ('single', 'JJ'), ('long', 'JJ'), ('illustr