<a href="https://colab.research.google.com/github/RafalDoroz/ai/blob/main/analizaSkladni.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install spacy
!python -m spacy download en_core_web_md
import spacy

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [12]:
# Lemmatization

# Load the small English spaCy pipeline; the following cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

text = "The children are playing in the garden."
doc = nlp(text)

print("Lematyzacja słów:")
# Build one report line per token (surface form and its lemma), then emit
# them all with a single print call.
report_lines = [f"Słowo: {tok.text}, Lemma: {tok.lemma_}" for tok in doc]
print("\n".join(report_lines))

Lematyzacja słów:
Słowo: The, Lemma: the
Słowo: children, Lemma: child
Słowo: are, Lemma: be
Słowo: playing, Lemma: play
Słowo: in, Lemma: in
Słowo: the, Lemma: the
Słowo: garden, Lemma: garden
Słowo: ., Lemma: .


In [13]:
# Morphological segmentation

doc = nlp("He runs quickly.")
# For each token, show its text next to the morphological features spaCy
# attached to it (token.morph).
for tok in doc:
    surface, features = tok.text, tok.morph
    print(f"Word: {surface}, Morphological Analysis: {features}")



Word: He, Morphological Analysis: Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs
Word: runs, Morphological Analysis: Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
Word: quickly, Morphological Analysis: 
Word: ., Morphological Analysis: PunctType=Peri


In [14]:
# Word segmentation

text = "Natural Language Processing is an exciting field."
doc = nlp(text)

# Collect the text of every token (punctuation included) and display the list.
words = []
for tok in doc:
    words.append(tok.text)
print(words)


['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'field', '.']


In [15]:
# Tokenization

text = "Natural Language Processing is an exciting field."
doc = nlp(text)

# Keep only the tokens that spaCy does not flag as punctuation.
tokens = []
for tok in doc:
    if tok.is_punct:
        continue
    tokens.append(tok.text)
print(tokens)


['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'field']


In [16]:
# Parsing

text = "The quick brown fox jumps over the lazy dog."
doc = nlp(text)

# Report each token's dependency label together with the text of its head.
for tok in doc:
    relation = tok.dep_
    governor = tok.head.text
    print(f"Word: {tok.text}, Dependency: {relation}, Head: {governor}")


Word: The, Dependency: det, Head: fox
Word: quick, Dependency: amod, Head: fox
Word: brown, Dependency: amod, Head: fox
Word: fox, Dependency: nsubj, Head: jumps
Word: jumps, Dependency: ROOT, Head: jumps
Word: over, Dependency: prep, Head: jumps
Word: the, Dependency: det, Head: dog
Word: lazy, Dependency: amod, Head: dog
Word: dog, Dependency: pobj, Head: over
Word: ., Dependency: punct, Head: jumps


In [17]:
# Stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
text = "The cats are running and chasing a mouse."

# spaCy performs the tokenization; NLTK's Porter stemmer is then applied to
# the raw text of each token.
doc = nlp(text)
stemmed_pairs = ((tok.text, stemmer.stem(tok.text)) for tok in doc)
for surface, stem in stemmed_pairs:
    print(f"Word: {surface}, Stem: {stem}")


Word: The, Stem: the
Word: cats, Stem: cat
Word: are, Stem: are
Word: running, Stem: run
Word: and, Stem: and
Word: chasing, Stem: chase
Word: a, Stem: a
Word: mouse, Stem: mous
Word: ., Stem: .


In [18]:
# "1 of n" coding (one-hot encoding)
import numpy as np

# Example vocabulary
vocabulary = ["cat", "dog", "fish", "bird", "mouse", "snake"]

# Word to encode
word = "bird"

# One-hot encoding: row `index` of the identity matrix is a float vector that
# is 1 at the word's vocabulary position and 0 everywhere else.
index = vocabulary.index(word)
one_hot_vector = np.eye(len(vocabulary))[index]

for report_line in (f"Słowo: {word}", f"One-hot vector: {one_hot_vector}"):
    print(report_line)


Słowo: bird
One-hot vector: [0. 0. 0. 1. 0. 0.]


In [None]:
# Word embeddings

# Load the pretrained medium-sized spaCy model, which includes word embeddings.
nlp = spacy.load("en_core_web_md")

# Example words
word1 = "cat"
word2 = "dog"
word3 = "table"
word4 = "car"

# Embedding of each word (the vector of the one-token document)
word_vector1 = nlp(word1).vector
word_vector2 = nlp(word2).vector
word_vector3 = nlp(word3).vector
word_vector4 = nlp(word4).vector

# Label every embedding with the word it belongs to, instead of relying on a
# `word` variable that happens to exist from another cell.
labelled_vectors = (
    (word1, word_vector1),
    (word2, word_vector2),
    (word3, word_vector3),
    (word4, word_vector4),
)
for label, embedding in labelled_vectors:
    print(f"Słowo: {label}")
    print(f"Zanurzenie (pierwsze 5 wymiarów): {embedding[:5]}")


In [None]:
import spacy
import matplotlib.pyplot as plt

# Load the NLP model (medium-sized pipeline that includes word embeddings).
nlp = spacy.load("en_core_web_md")

# Number of leading embedding dimensions to print and plot. A single constant
# keeps the printed slice, the plotted slice and the plot title in agreement.
N_DIMS = 5

# Example words and their embeddings
words = ["cat", "dog", "table", "car"]
vectors = [nlp(word).vector for word in words]

# Show the embeddings (first N_DIMS dimensions)
for word, vector in zip(words, vectors):
    print(f"Słowo: {word}")
    print(f"Zanurzenie (pierwsze {N_DIMS} wymiarów): {vector[:N_DIMS]}")

# Draw a separate figure for each word's vector
for word, vector in zip(words, vectors):
    leading = vector[:N_DIMS]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(len(leading)), leading, marker="o", label=f"Vector of '{word}'")
    ax.set_title(f"Wektor zanurzenia słowa '{word}' (pierwsze {N_DIMS} wymiarów)")
    ax.set_xlabel("Wymiar")
    ax.set_ylabel("Wartość")
    ax.legend()
    # Explicitly enable the grid rather than toggling its visibility.
    ax.grid(True)
    plt.show()