In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import spacy
import pytextrank

C:\Users\hp\AppData\Local\Programs\Python\Python312\Lib\site-packages


In [36]:
# Load the large English pipeline; every `doc` below is produced by this object.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Sample passage analysed by every cell below.
# NOTE(review): this looks like the fog passage from Dickens's "Bleak House"; the
# published text reads "rolls defiled", not "rolls deified" — confirm before fixing,
# because the recorded lemmatization output depends on the current wording.
text = '''Fog everywhere. Fog up the river, where it flows among green aits and meadows; fog down the river, where it rolls deified among the tiers of shipping and the waterside pollutions of a great (and dirty) city. Fog on the Essex marshes, fog on the Kentish heights. Fog creeping into the cabooses of collier-brigs; fog lying out on the yards and hovering in the rigging of great ships; fog drooping on the gunwales of barges and small boats. Fog in the eyes and throats of ancient Greenwich pensioners, wheezing by the firesides of their wards; fog in the stem and bowl of the afternoon pipe of the wrathful skipper, down in his close cabin; fog cruelly pinching the toes and fingers of his shivering little apprentice boy on deck. Chance people on the bridges peeping over the parapets into a nether sky of fog, with fog all round them, as if they were up in a balloon and hanging in the misty clouds.'''

In [37]:
# add_pipe raises if a component named "textrank" is already in the pipeline,
# so add it only once; this keeps the cell safe to re-run on the same `nlp`.
if "textrank" not in nlp.pipe_names:
    nlp.add_pipe("textrank")
# Last expression: display the component, as the bare add_pipe call did.
nlp.get_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x251d2c91520>

In [38]:
# Run the pipeline on the sample passage; `doc` is reused by all the sections below.
doc = nlp(text)

#### Sentences

In [39]:
# Materialise the sentence spans so they can be counted and iterated more than once.
sentences = [sent for sent in doc.sents]

In [40]:
# Report how many sentences were found, then preview the first four tokens of each.
print(len(sentences))
for sent in sentences:
    preview = sent[:4]
    print(f"{preview}...")

6
Fog everywhere....
Fog up the river...
Fog on the Essex...
Fog creeping into the...
Fog in the eyes...
Chance people on the...


##### Splitting sentences using `;` as an additional delimiter

In [41]:
from spacy.language import Language

In [42]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token that follows each ";" as the start of a new sentence.

    The last token is never inspected because nothing follows it.
    Returns the same ``doc`` object so it can be used as a pipeline component.
    """
    for idx in range(len(doc) - 1):
        if doc[idx].text == ";":
            doc[idx + 1].is_sent_start = True
    return doc

In [43]:
# Separate pipeline instance so the default `nlp` keeps its original sentence splitting.
custom_nlp = spacy.load("en_core_web_lg")
# The boundary component is inserted ahead of the parser.
custom_nlp.add_pipe(factory_name="set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [44]:
# Re-process the passage with the ";"-aware pipeline and preview each sentence.
custom_doc = custom_nlp(text)
custom_sentences = [span for span in custom_doc.sents]
for span in custom_sentences:
    preview = span[:4]
    print(f"{preview}...")

Fog everywhere....
Fog up the river...
fog down the river...
Fog on the Essex...
Fog creeping into the...
fog lying out on...
fog drooping on the...
Fog in the eyes...
fog in the stem...
fog cruelly pinching the...
Chance people on the...


#### Tokens

In [45]:
# Show each of the first 15 tokens next to its `idx` attribute.
for tok in doc[:15]:
    print(f"{tok} {tok.idx}")

Fog 0
everywhere 4
. 14
Fog 16
up 20
the 23
river 27
, 32
where 34
it 40
flows 43
among 49
green 55
aits 61
and 66


In [46]:
# Fixed-width table of lexical flags for the first 15 tokens.
# Single quotes are used inside the replacement fields: reusing the enclosing
# quote type inside an f-string only parses on Python 3.12+ (PEP 701).
# NOTE(review): the second column prints `token.is_alpha`; the "Alphanumeric"
# label is kept so the recorded output still matches, but "Alphabetic" may be
# the accurate name — confirm against the spaCy Token docs.
print(f"{'Text with Whitespace':22}"
      f"{'Is Alphanumeric?':17}"
      f"{'Is Punctuation?':18}"
      f"{'Is Stop Word?'}")
for token in doc[:15]:
    # `!s` converts the booleans to "True"/"False" before the width is applied;
    # without it a bool would be formatted as an integer.
    print(f"{token.text_with_ws:22}"
          f"{token.is_alpha!s:17}"
          f"{token.is_punct!s:18}"
          f"{token.is_stop!s}")

Text with Whitespace  Is Alphanumeric? Is Punctuation?   Is Stop Word?
Fog                   True             False             False
everywhere            True             False             True
.                     False            True              False
Fog                   True             False             False
up                    True             False             True
the                   True             False             True
river                 True             False             False
,                     False            True              False
where                 True             False             True
it                    True             False             True
flows                 True             False             False
among                 True             False             True
green                 True             False             False
aits                  True             False             False
and                   True             False             True

#### Stopwords

In [47]:
# Import the stop-word set explicitly. Reaching it through attribute access on
# `spacy` only works if something else has already imported `spacy.lang.en`
# (for example, loading an English model), which is an import side effect.
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = STOP_WORDS

In [48]:
# Rebind `stopwords` to a list copy of the collection.
stopwords = [*stopwords]

In [49]:
# Manual alternative to the `is_stop` filter in the next cell, using the `stopwords` list:
# [word for word in doc[:20] if not str(word).lower() in stopwords]

In [50]:
# Keep only those of the first 20 tokens that are not flagged as stop words.
list(filter(lambda tok: not tok.is_stop, doc[:20]))

[Fog, ., Fog, river, ,, flows, green, aits, meadows, ;, fog]

#### Lemmatization

In [51]:
# Slice of the first 44 tokens of `doc`, used for the lemmatization demo below.
custom_text = doc[:44]

In [52]:
# List only the tokens whose lower-cased text differs from their lemma.
for tok in custom_text:
    surface = str(tok)
    lemma = str(tok.lemma_)
    if surface.lower() == lemma:
        continue
    print(f"{surface:>20} : {lemma}")

               flows : flow
                aits : ait
             meadows : meadow
               rolls : roll
             deified : deify
               tiers : tier
          pollutions : pollution


#### Word Frequency

In [53]:
from collections import Counter

In [54]:
# Lower-cased text of every token that is neither a stop word nor punctuation.
words = [
    tok.text.lower()
    for tok in doc
    if not (tok.is_stop or tok.is_punct)
]

In [55]:
# Tally the filtered words and display the five most frequent ones.
word_counts = Counter(words)
word_counts.most_common(5)

[('fog', 13), ('river', 2), ('great', 2), ('flows', 1), ('green', 1)]

#### Parts of Speech

In [56]:
# For the first five tokens, print the fine-grained tag, the coarse POS and
# spaCy's explanation of the tag.
for tok in doc[:5]:
    tag, pos = tok.tag_, tok.pos_
    print(f"""Token:{tok.text}
    Tag:{tag:10} Pos:{pos}
    Explanation: {spacy.explain(tag)}""")

Token:Fog
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass
Token:everywhere
    Tag:RB         Pos:ADV
    Explanation: adverb
Token:.
    Tag:.          Pos:PUNCT
    Explanation: punctuation mark, sentence closer
Token:Fog
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass
Token:up
    Tag:RP         Pos:ADP
    Explanation: adverb, particle


#### Visualization using displacy

In [57]:
from spacy import displacy

In [58]:
# Draw the dependency parse ("dep" style) of the first 10 tokens inline in the notebook.
displacy.render(doc[:10], style="dep", jupyter=True)

In [59]:
# For tokens 3..13, print the token text followed by its tag, the text of its
# syntactic head and its dependency label.
# The `=` specifier echoes the expression text itself, so the printed labels
# ("token.tag_ = ...") depend on the loop variable being named `token`.
for token in doc[3:14]:
    print(f""" {token.text}
    {token.tag_ = }
    {token.head.text = }
    {token.dep_ = }
    """)

 Fog
    token.tag_ = 'NN'
    token.head.text = 'fog'
    token.dep_ = 'advcl'
    
 up
    token.tag_ = 'RP'
    token.head.text = 'Fog'
    token.dep_ = 'prt'
    
 the
    token.tag_ = 'DT'
    token.head.text = 'river'
    token.dep_ = 'det'
    
 river
    token.tag_ = 'NN'
    token.head.text = 'Fog'
    token.dep_ = 'dobj'
    
 ,
    token.tag_ = ','
    token.head.text = 'river'
    token.dep_ = 'punct'
    
 where
    token.tag_ = 'WRB'
    token.head.text = 'flows'
    token.dep_ = 'advmod'
    
 it
    token.tag_ = 'PRP'
    token.head.text = 'flows'
    token.dep_ = 'nsubj'
    
 flows
    token.tag_ = 'VBZ'
    token.head.text = 'river'
    token.dep_ = 'relcl'
    
 among
    token.tag_ = 'IN'
    token.head.text = 'flows'
    token.dep_ = 'prep'
    
 green
    token.tag_ = 'NNP'
    token.head.text = 'aits'
    token.dep_ = 'compound'
    
 aits
    token.tag_ = 'NNS'
    token.head.text = 'among'
    token.dep_ = 'pobj'
    


In [60]:
# Draw the dependency parse for the same token slice (3..13) that was printed above.
displacy.render(doc[3:14], style="dep", jupyter=True)

#### Noun Chunk Detection

In [61]:
# Print each noun chunk found within the first 10 tokens.
first_tokens = doc[:10]
for noun_chunk in first_tokens.noun_chunks:
    print(noun_chunk)

Fog
the river
it


#### Named Entity Recognition

In [62]:
# For every named entity in `doc`, print its text, label, start/end positions
# and spaCy's explanation of the label.
# The `=` specifier echoes the expression text, so the printed labels
# ("ent.label_=...") depend on the loop variable being named `ent`.
for ent in doc.ents:
    print(f""" {ent.text =}
    {ent.label_=}
    {ent.start=}
    {ent.end=}
    {ent.label_}={spacy.explain(ent.label_)}
    """)

 ent.text ='Essex'
    ent.label_='GPE'
    ent.start=47
    ent.end=48
    GPE=Countries, cities, states
    
 ent.text ='Kentish'
    ent.label_='GPE'
    ent.start=53
    ent.end=54
    GPE=Countries, cities, states
    
 ent.text ='collier-brigs'
    ent.label_='ORG'
    ent.start=62
    ent.end=65
    ORG=Companies, agencies, institutions, etc.
    
 ent.text ='Greenwich'
    ent.label_='GPE'
    ent.start=100
    ent.end=101
    GPE=Countries, cities, states
    
 ent.text ='afternoon'
    ent.label_='TIME'
    ent.start=119
    ent.end=120
    TIME=Times smaller than a day
    


In [63]:
# Highlight the named entities ("ent" style) of the whole document inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

#### Text Summarization

In [64]:
# Ask the textrank component for a summary (limit_phrases=2, limit_sentences=2)
# and print each returned sentence followed by its len().
# NOTE(review): the recorded lengths (36, 56) are far below the character counts,
# so len() presumably counts tokens — confirm.
top_sentences = doc._.textrank.summary(limit_phrases=2, limit_sentences=2)
for summary_sent in top_sentences:
    print(summary_sent)
    print('Summary Length:', len(summary_sent))

Fog creeping into the cabooses of collier-brigs; fog lying out on the yards and hovering in the rigging of great ships; fog drooping on the gunwales of barges and small boats.
Summary Length: 36
Fog in the eyes and throats of ancient Greenwich pensioners, wheezing by the firesides of their wards; fog in the stem and bowl of the afternoon pipe of the wrathful skipper, down in his close cabin; fog cruelly pinching the toes and fingers of his shivering little apprentice boy on deck.
Summary Length: 56
