In [1]:
import spacy
from collections import Counter
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

text = """
In an unprecedented move, the government of the United States announced new measures to combat climate change during a press conference in Washington, D.C.
President Joe Biden emphasized the importance of global cooperation, stating that "we must act together to preserve our planet."
Meanwhile, in Europe, the European Union unveiled plans to invest billions in renewable energy infrastructure across member states.
In Asia, China reported that it would accelerate its development of electric vehicles and reduce carbon emissions by 2030.
Analysts predict that these measures will reshape the global energy landscape and foster new innovations in green technology.
"""

In [2]:
doc = nlp(text)

In [3]:
# Wypisz token -> POS -> lemma
for token in doc:
   print(f"{token.text} -> {token.pos_} -> {token.lemma_}")


 -> SPACE -> 

In -> ADP -> in
an -> DET -> an
unprecedented -> ADJ -> unprecedented
move -> NOUN -> move
, -> PUNCT -> ,
the -> DET -> the
government -> NOUN -> government
of -> ADP -> of
the -> DET -> the
United -> PROPN -> United
States -> PROPN -> States
announced -> VERB -> announce
new -> ADJ -> new
measures -> NOUN -> measure
to -> PART -> to
combat -> VERB -> combat
climate -> NOUN -> climate
change -> NOUN -> change
during -> ADP -> during
a -> DET -> a
press -> NOUN -> press
conference -> NOUN -> conference
in -> ADP -> in
Washington -> PROPN -> Washington
, -> PUNCT -> ,
D.C. -> PROPN -> D.C.

 -> SPACE -> 

President -> PROPN -> President
Joe -> PROPN -> Joe
Biden -> PROPN -> Biden
emphasized -> VERB -> emphasize
the -> DET -> the
importance -> NOUN -> importance
of -> ADP -> of
global -> ADJ -> global
cooperation -> NOUN -> cooperation
, -> PUNCT -> ,
stating -> VERB -> state
that -> SCONJ -> that
" -> PUNCT -> "
we -> PRON -> we
must -> AUX -> must
act -> VERB -> act
tog

In [4]:
# Wypisz kategorie NER wraz z labelkami
for ent in doc.ents:
  print(f"{ent.text} -> {ent.label_}")

the United States -> GPE
Washington -> GPE
D.C. -> GPE
Joe Biden -> PERSON
Europe -> LOC
the European Union -> ORG
billions -> CARDINAL
Asia -> LOC
China -> GPE
2030 -> DATE


In [5]:
# NOUN CHUNKS - wypisz root i head dla każdego chunka

for chunk in doc.noun_chunks:
  print(f"{chunk.root.text} -> {chunk.root.head.text}")


move -> In
government -> announced
States -> of
measures -> announced
change -> combat
conference -> during
Biden -> in
importance -> emphasized
cooperation -> of
we -> act
planet -> preserve
Europe -> in
Union -> unveiled
plans -> unveiled
billions -> invest
infrastructure -> in
states -> across
Asia -> In
China -> reported
it -> accelerate
development -> accelerate
vehicles -> of
emissions -> reduce
Analysts -> predict
measures -> reshape
landscape -> reshape
technology -> in


In [13]:
# Wskaż fragmenty, w których chunk się zgadza z NER
for ent in doc.ents:
  for chunk in doc.noun_chunks:
    if chunk.text == ent.text:
      print(f"{ent.text} -> {ent.label_} -> {chunk.text}")


the United States -> GPE -> the United States
Europe -> LOC -> Europe
the European Union -> ORG -> the European Union
billions -> CARDINAL -> billions
Asia -> LOC -> Asia
China -> GPE -> China


In [9]:
# Dla każdego entity wypisz +/- 2 tokeny (2 tokeny przed entity i 2 po)
for ent in doc.ents:
    start = ent.start - 2
    end = ent.end + 2

    start = max(0, start)
    end = min(len(doc), end)

    print(f"{ent.text} -> {' '.join([token.text for token in doc[start:end]])}")

the United States -> government of the United States announced new
Washington -> conference in Washington , D.C.
D.C. -> Washington , D.C. 
 President
Joe Biden -> 
 President Joe Biden emphasized the
Europe -> , in Europe , the
the European Union -> Europe , the European Union unveiled plans
billions -> to invest billions in renewable
Asia -> 
 In Asia , China
China -> Asia , China reported that
2030 -> emissions by 2030 . 



In [10]:
# Zlicz labelki i wypisz najczęściej pojawiające się
entity_labels = [ent.label_ for ent in doc.ents]
entity_freq = Counter(entity_labels)

for label, count in entity_freq.items():
  print(f"{label}: {count}")

GPE: 4
PERSON: 1
LOC: 2
ORG: 1
CARDINAL: 1
DATE: 1


In [11]:
# Zlicz wystąpenia dla każdego entity (nie labelka a text)
entity_mentions = Counter([ent.text for ent in doc.ents])

for entity, count in entity_mentions.items():
  print(f"{entity}: {count}")

the United States: 1
Washington: 1
D.C.: 1
Joe Biden: 1
Europe: 1
the European Union: 1
billions: 1
Asia: 1
China: 1
2030: 1


In [12]:
# Wizualizacja
displacy.render(doc, style="ent", jupyter=True)
