In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = """
Marie Curie, a pioneering physicist and chemist, discovered the elements polonium and radium in 1898 while working at the University of Paris.
Her research on radioactivity earned her the Nobel Prize in Physics in 1903 and another in Chemistry in 1911.
Curie later founded the Radium Institute in Warsaw, which became a major center for nuclear research.
The theory of relativity was developed by Albert Einstein while employed at the Swiss Patent Office.
In 1905, groundbreaking research on quantum mechanics was published by the Institute for Advanced Study.
The Nobel Prize in Physics was awarded to Einstein in 1921.
"""


doc = nlp(text)

In [3]:
# NER
for ent in doc.ents:
   print(f"{ent.text} -> {ent.label_}")

Marie Curie -> PERSON
1898 -> DATE
the University of Paris -> ORG
the Nobel Prize in Physics -> WORK_OF_ART
1903 -> DATE
Chemistry -> ORG
1911 -> DATE
the Radium Institute -> ORG
Warsaw -> GPE
Albert Einstein -> PERSON
the Swiss Patent Office -> ORG
1905 -> DATE
the Institute for Advanced Study -> ORG
The Nobel Prize in Physics -> WORK_OF_ART
Einstein -> PERSON
1921 -> DATE


In [4]:
# Znajdź pary rzeczownik-przymiotnik
noun_adj_pairs = []
for token in doc:
    if token.dep_ == "amod" and token.head.pos_ == "NOUN":
        noun_adj_pairs.append((token.head.text, token.text))

for noun, adj in noun_adj_pairs:
    print(f"Noun: {noun:<15} -> Adjective: {adj}")

Noun: physicist       -> Adjective: pioneering
Noun: center          -> Adjective: major
Noun: research        -> Adjective: nuclear


In [6]:
# Znajdź trójki SVO
def extract_svo(doc):
    svo_triples = []
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
            subject = token.text
            verb = token.head.text
            obj = None
            # Find direct object
            for child in token.head.children:
                if child.dep_ == "dobj":
                    obj = child.text
                    break
            if obj:
                svo_triples.append((subject, verb, obj))
    return svo_triples

print("Subject-Verb-Object:")
for s, v, o in extract_svo(doc):
    print(f"{s} -> {v} -> {o}")

Subject-Verb-Object:
Curie -> discovered -> elements
research -> earned -> Prize
Curie -> founded -> Institute


In [8]:
# Zbadaj niejednoznaczność
word_tokens = {}
ambiguous_words = set()

for token in doc:
    word = token.text.lower()
    if word in word_tokens and word_tokens[word] != token.pos_:
        ambiguous_words.add(word)
    else:
        word_tokens[word] = token.pos_

print("Ambiguous words:", ambiguous_words)

Ambiguous words: {'radium'}


In [7]:
# Wskaż zdania z mową bierną
for sent in doc.sents:
       for token in sent:
           # Szukamy czasownika w formie biernej (auxpass + past participle)
           if token.dep_ == "auxpass" and token.head.pos_ == "VERB" and token.head.tag_ == "VBN":
               print(f"Passive voice detected in sentence: {sent.text}")
               break


Passive voice detected in sentence: The theory of relativity was developed by Albert Einstein while employed at the Swiss Patent Office.

Passive voice detected in sentence: In 1905, groundbreaking research on quantum mechanics was published by the Institute for Advanced Study.

Passive voice detected in sentence: The Nobel Prize in Physics was awarded to Einstein in 1921.



In [None]:
# Zadanie z gwiazdką: Zbuduj oś czasu z podanego tekstu. Powiąż daty z wydarzeniami
# za pomocą poznanych technik i przedstaw je w kolejności chronologicznej


In [9]:
# Skompresuj text
def compress_sentence(doc):
    compressed = []
    for token in doc:
        # Keep the root verb (main action)
        if token.dep_ == "ROOT":
            compressed.append(token.text)

        # Keep the subject and its modifiers
        #if token.dep_ == "nsubj" or token.head.dep_ == "nsubj":
            #compressed.append(token.text)

        # Keep the direct object and its modifiers
        if token.dep_ == "dobj" or token.head.dep_ == "dobj":
            compressed.append(token.text)

        # Keep  entities
        if token.ent_type_ in ("PERSON", "ORG", "GPE", "DATE", "PRODUCT"):
            compressed.append(token.text)

    return " ".join(compressed)

# Compress the sentence
compressed_sentence = compress_sentence(doc)
print("Original Sentence:")
print(doc.text)
print("\nCompressed Sentence:")
print(compressed_sentence)

Original Sentence:

Marie Curie, a pioneering physicist and chemist, discovered the elements polonium and radium in 1898 while working at the University of Paris.
Her research on radioactivity earned her the Nobel Prize in Physics in 1903 and another in Chemistry in 1911.
Curie later founded the Radium Institute in Warsaw, which became a major center for nuclear research.
The theory of relativity was developed by Albert Einstein while employed at the Swiss Patent Office.
In 1905, groundbreaking research on quantum mechanics was published by the Institute for Advanced Study.
The Nobel Prize in Physics was awarded to Einstein in 1921.


Compressed Sentence:
Marie Curie discovered the elements 1898 the University of Paris earned the Nobel Prize in 1903 Chemistry 1911 founded the the Radium Radium Institute Institute in Warsaw , became developed Albert Einstein the Swiss Patent Office 1905 research on published the Institute for Advanced Study awarded Einstein 1921


In [12]:
# Wizualizacje
from spacy import displacy

displacy.render(doc, style="dep", options={"distance": 100}, jupyter=True)

In [15]:
# Dla chętnych: zamień stronę bierną na aktywną (passive voice na active)
def passive_to_active(doc):
    active_sentences = []
    for sent in doc.sents:
        passive_found = False
        for token in sent:
            if token.dep_ == "auxpass" and token.head.pos_ == "VERB" and token.head.tag_ == "VBN":
                passive_found = True

                subject = [child for child in token.head.children if child.dep_ == "agent"]
                object_ = [child for child in token.head.children if child.dep_ in ("nsubjpass", "dobj")]

                if subject and object_:
                    subject = subject[0].text
                    object_ = object_[0].text

                    # Podświetlenie zmienionego fragmentu
                    active_sentence = f"<i>{subject} {token.head.lemma_} {object_}</i>"
                    active_sentences.append(active_sentence)
                else:
                    active_sentences.append(sent.text)
                break

        if not passive_found:
            active_sentences.append(sent.text)

    return " ".join(active_sentences)


active_text = passive_to_active(doc)
print("Active voice:", active_text)

Active voice: 
 Marie Curie, a pioneering physicist and chemist, discovered the elements polonium and radium in 1898 while working at the University of Paris.
 Her research on radioactivity earned her the Nobel Prize in Physics in 1903 and another in Chemistry in 1911.
 Curie later founded the Radium Institute in Warsaw, which became a major center for nuclear research.
 <i>by develop theory</i> In 1905, groundbreaking research on quantum mechanics was published by the Institute for Advanced Study.
 The Nobel Prize in Physics was awarded to Einstein in 1921.

