# Importing Spacy

In [33]:
#importing SpaCy
import spacy
# Load the English language model
nlp = spacy.load("en_core_web_lg")


# Pulling 3 Web Articles

In [44]:
#pull an article from the web and use it as our data, a reuters new article:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')  # Specify 'html.parser' as the parser
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
#fetches the content of the NFL news article from the specified URL.

a = url_to_string('https://medium.com/towards-data-science/four-data-engineering-projects-that-look-great-on-your-cv-069dffae95e0')
b= url_to_string('https://medium.com/@rmoklesur/understanding-kl-divergence-a-fundamental-measure-in-machine-learning-2ebdf384851')
c=url_to_string('https://medium.com/@marianvilla/cre%C3%A9-la-misma-aplicaci%C3%B3n-en-react-vue-aqu%C3%AD-las-diferencias-605465a2e18c')
article1 = nlp(a)
article2 = nlp(b)
article3 = nlp(c)
len(article1.ents)
len(article2.ents)
len(article3.ents)


530

# Visulisation on NERS 

In [45]:
#visualization style is set to 'ent'
displacy.render(article1, style='ent', jupyter=True)
displacy.render(article2, style='ent', jupyter=True)
displacy.render(article3, style='ent', jupyter=True)

# Most Popular NER Types

In [49]:
#imports the Counter class from the collections
from collections import Counter
#labels of named entities in the web article are extracted and stored 
labels1 = [x.label_ for x in article1.ents]
Counter(labels)
labels2 = [x.label_ for x in article2.ents]
Counter(labels)
labels3 = [x.label_ for x in article3.ents]
Counter(labels1)


Counter({'ORG': 3, 'PERSON': 4, 'DATE': 2, 'GPE': 1})

In [50]:
Counter(labels2)


Counter({'PERSON': 4, 'CARDINAL': 2, 'DATE': 2, 'ORG': 1, 'GPE': 1, 'TIME': 1})

In [51]:
Counter(labels3)

Counter({'ORG': 192,
         'PERSON': 254,
         'GPE': 53,
         'CARDINAL': 9,
         'DATE': 1,
         'PRODUCT': 9,
         'NORP': 6,
         'MONEY': 1,
         'WORK_OF_ART': 2,
         'FAC': 3})

In [52]:
# Importing the Counter class from the collections module
items = [x.text for x in article1.ents]
Counter(items).most_common(5)


[('Four Data Engineering Projects', 1),
 ('💡Mike Shakhomirov', 1),
 ('Mar, 2024 |', 1),
 ('Data Engineering Projects That Look Great', 1),
 ('CVData', 1)]

In [53]:
items = [x.text for x in article2.ents]
Counter(items).most_common(5)

[('Moklesur Rahman', 1),
 ('18', 1),
 ('2023', 1),
 ('One', 1),
 ('Kullback-Leibler', 1)]

In [54]:
items = [x.text for x in article3.ents]
Counter(items).most_common(5)

[('un', 18),
 ('Así', 5),
 ('campo de entrada', 4),
 ('encuentra', 4),
 ('como tal', 4)]

# Extracting Sentences 

In [60]:
# Extracting the sentences from the processed web article using spaCy's sentence
sentences1 = [x for x in article1.sents]
print(sentences1[0])

Four Data Engineering Projects That Look Great on your CV | by 💡Mike Shakhomirov | Mar, 2024 | Towards Data ScienceOpen in appSign upSign inWriteSign upSign inMember-only storyFour Data Engineering Projects That Look Great on your CVData pipelines that would turn you into a decorated data professional💡Mike Shakhomirov·FollowPublished inTowards Data Science·10 min read·4 days ago--6ShareAI-generated image using KandinskyIn this story, I would like to speak about data engineering career paths and data projects that look great on any CV.
Understanding KL Divergence: A Fundamental Measure in Machine Learning | by Moklesur Rahman | MediumOpen in appSign upSign


In [65]:
sentences2 = [x for x in article2.sents]
print(sentences2[0])

Understanding KL Divergence: A Fundamental Measure in Machine Learning | by Moklesur Rahman | MediumOpen in appSign upSign


In [70]:
sentences3 = [x for x in article3.sents]
print(sentences3[10])

Por “diferente”, no quise decir cosas


# Displaying NER Tags

In [71]:
# Rendering a visualization of named entities in the first sentence
displacy.render(nlp(str(sentences1[0])), jupyter=True, style='ent')

In [72]:
# Rendering a visualization of named entities in the first sentence
displacy.render(nlp(str(sentences2[0])), jupyter=True, style='ent')

In [73]:
# Rendering a visualization of named entities in the first sentence
displacy.render(nlp(str(sentences3[10])), jupyter=True, style='ent')

# Types of Sentences

In [74]:
#filters out tokens that are stop words or punctuation marks, then retrieves the specified attributes for each token.
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences1[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Data', 'PROPN', 'Data'),
 ('Engineering', 'PROPN', 'Engineering'),
 ('Projects', 'NOUN', 'project'),
 ('Look', 'VERB', 'look'),
 ('Great', 'ADV', 'great'),
 ('CV', 'NOUN', 'cv'),
 ('|', 'VERB', '|'),
 ('💡', 'NUM', '💡'),
 ('Mike', 'PROPN', 'Mike'),
 ('Shakhomirov', 'PROPN', 'Shakhomirov'),
 ('|', 'VERB', '|'),
 ('Mar', 'PROPN', 'Mar'),
 ('2024', 'NUM', '2024'),
 ('Data', 'PROPN', 'Data'),
 ('ScienceOpen', 'PROPN', 'ScienceOpen'),
 ('appSign', 'NOUN', 'appsign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inWriteSign', 'PROPN', 'inWriteSign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inMember', 'PROPN', 'inMember'),
 ('Data', 'PROPN', 'Data'),
 ('Engineering', 'PROPN', 'Engineering'),
 ('Projects', 'NOUN', 'project'),
 ('Look', 'VERB', 'look'),
 ('Great', 'ADV', 'great'),
 ('CVData', 'PROPN', 'CVData'),
 ('pipelines', 'NOUN', 'pipeline'),
 ('turn', 'VERB', 'turn'),
 ('decorated', 'VERB', 'decorate'),
 ('data', 'NOUN', 'data'),
 ('professional', 'NOUN', 'professional'),
 ('Mike', 'PROPN', 'Mike'),
 ('Sh

In [75]:
#filters out tokens that are stop words or punctuation marks, then retrieves the specified attributes for each token.
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences2[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Understanding', 'VERB', 'understand'),
 ('KL', 'PROPN', 'KL'),
 ('Divergence', 'NOUN', 'divergence'),
 ('Fundamental', 'ADJ', 'fundamental'),
 ('Measure', 'NOUN', 'measure'),
 ('Machine', 'PROPN', 'Machine'),
 ('Learning', 'PROPN', 'Learning'),
 ('|', 'VERB', '|'),
 ('Moklesur', 'PROPN', 'Moklesur'),
 ('Rahman', 'PROPN', 'Rahman'),
 ('|', 'VERB', '|'),
 ('MediumOpen', 'PROPN', 'MediumOpen'),
 ('appSign', 'NOUN', 'appsign'),
 ('upSign', 'NOUN', 'upsign')]

In [77]:
#filters out tokens that are stop words or punctuation marks, then retrieves the specified attributes for each token.
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences3[10])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Por', 'PROPN', 'Por'),
 ('diferente', 'PROPN', 'diferente'),
 ('quise', 'NOUN', 'quise'),
 ('decir', 'VERB', 'decir'),
 ('cosas', 'PROPN', 'cosas')]

# Sentence Dependency tree

In [78]:
#Rendering a visualization of dependency parsing in the first sentence
displacy.render(nlp(str(sentences1[0])), style='dep', jupyter = True, options = {'distance': 120})

In [79]:
#Rendering a visualization of dependency parsing in the first sentence
displacy.render(nlp(str(sentences2[0])), style='dep', jupyter = True, options = {'distance': 120})

In [80]:
#Rendering a visualization of dependency parsing in the first sentence
displacy.render(nlp(str(sentences3[10])), style='dep', jupyter = True, options = {'distance': 120})