# 1. Text Preprocessing with NLTK and spaCy

In [13]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download NLTK
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('wordnet2022')
nltk.download('stopwords')

# Initializing spaCy
nlp = spacy.load("en_core_web_sm")

# temp fix for lookup error.
! cp -rf /usr/share/nltk_data/corpora/wordnet2022 /usr/share/nltk_data/corpora/wordnet

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package wordnet2022 to /usr/share/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
sample_text = "In the heart of the ancient forest, where the moonlight wove silver threads through the emerald canopy, Elara whispered an incantation long forgotten by time. The air shimmered as glowing runes spiraled around her, their golden light pulsing with an unseen rhythm. A gust of wind carried the scent of lavender and old parchment, and in the silence that followed, the ancient oak before her groaned, its bark twisting to reveal a hidden doorway. Beyond it lay a realm untouched by mortal hands, where stars floated like fireflies and rivers sang in voices older than the world itself."

# NLTK tokenization
nltk_tokens = word_tokenize(sample_text)
print("NLTK Tokens:", nltk_tokens)

# spaCy tokenization
doc = nlp(sample_text)
spacy_tokens = [token.text for token in doc]
print("spaCy Tokens:", spacy_tokens)

# NLTK lemmatization and stopword removal
lemmatizer = WordNetLemmatizer()
nltk_stopwords = set(stopwords.words('english'))
nltk_processed = [lemmatizer.lemmatize(word.lower()) for word in nltk_tokens if word.lower() not in nltk_stopwords]
print("NLTK Processed:", nltk_processed)

# spaCy lemmatization and stopword removal
spacy_processed = [token.lemma_.lower() for token in doc if not token.is_stop]
print("spaCy Processed:", spacy_processed)

NLTK Tokens: ['In', 'the', 'heart', 'of', 'the', 'ancient', 'forest', ',', 'where', 'the', 'moonlight', 'wove', 'silver', 'threads', 'through', 'the', 'emerald', 'canopy', ',', 'Elara', 'whispered', 'an', 'incantation', 'long', 'forgotten', 'by', 'time', '.', 'The', 'air', 'shimmered', 'as', 'glowing', 'runes', 'spiraled', 'around', 'her', ',', 'their', 'golden', 'light', 'pulsing', 'with', 'an', 'unseen', 'rhythm', '.', 'A', 'gust', 'of', 'wind', 'carried', 'the', 'scent', 'of', 'lavender', 'and', 'old', 'parchment', ',', 'and', 'in', 'the', 'silence', 'that', 'followed', ',', 'the', 'ancient', 'oak', 'before', 'her', 'groaned', ',', 'its', 'bark', 'twisting', 'to', 'reveal', 'a', 'hidden', 'doorway', '.', 'Beyond', 'it', 'lay', 'a', 'realm', 'untouched', 'by', 'mortal', 'hands', ',', 'where', 'stars', 'floated', 'like', 'fireflies', 'and', 'rivers', 'sang', 'in', 'voices', 'older', 'than', 'the', 'world', 'itself', '.']
spaCy Tokens: ['In', 'the', 'heart', 'of', 'the', 'ancient', 'fo

NLTK оставляет некоторые слова неизменёнными (например, "whispered", "shimmered", "twisting").
spaCy меняет слова более кардинально ("forgotten" → "forget", "glowing" → "glow", "spiraled" → "spiral"), тогда как NLTK этого не делает.

# 2. Named Entity Recognition (NER) with spaCy

In [15]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

text = "Elon Musk, the CEO of Tesla and SpaceX, announced that the new Tesla Model S will be released in September 2025. The event took place in San Francisco, where hundreds of journalists attended. Microsoft and Google are also investing heavily in AI research, with a projected budget of $50 billion by 2030."

# Application of spaCy for NER
doc = nlp(text)

# Output of found named entities
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Visualization of named entities
displacy.render(doc, style="ent", jupyter=True)

Named Entities:
Elon Musk (PERSON)
Tesla (ORG)
Tesla Model S (PERSON)
September 2025 (DATE)
San Francisco (GPE)
hundreds (CARDINAL)
Microsoft (ORG)
Google (ORG)
AI (GPE)
$50 billion (MONEY)
2030 (DATE)


# 3. Text Vectorization using Transformers

In [16]:
from transformers import BertTokenizer, BertModel
import torch

# Loading the pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

text = "Machine learning is transforming the world of artificial intelligence."

# Tokenization
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Obtaining the model's hidden states
with torch.no_grad():
    outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

# Extracting embeddings
word_embeddings = hidden_states.squeeze(0)
print("Word embeddings shape:", word_embeddings.shape)

print("First 5 word embeddings:")
print(word_embeddings[:5])

Word embeddings shape: torch.Size([12, 768])
First 5 word embeddings:
tensor([[-0.2406, -0.0205, -0.1639,  ..., -0.5280,  0.0740,  0.4706],
        [ 0.2976,  0.2279, -0.1190,  ..., -0.0478,  0.5451,  0.3846],
        [-0.1905,  0.2525, -0.1623,  ..., -1.0566,  0.3143,  0.4100],
        [-0.1254,  0.2789, -0.3254,  ...,  0.0573, -0.1851,  0.3142],
        [-0.1019, -0.4838,  0.0465,  ..., -0.5256, -0.1013, -0.0301]])


# 4. Sentiment Analysis with Transformers

In [17]:
from transformers import pipeline

# Loading the pre-trained sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

sentences = [
    "I absolutely love this product! It's amazing.",
    "The service was terrible, I will never come back.",
    "It's an okay experience, nothing special but not bad either.",
    "This is the worst movie I have ever seen.",
    "The food was delicious and the staff was very friendly."
]

# Performing sentiment analysis on the sentences
results = sentiment_analyzer(sentences)

for sentence, result in zip(sentences, results):
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {result['label']}, Confidence: {result['score']:.4f}\n")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentence: I absolutely love this product! It's amazing.
Sentiment: POSITIVE, Confidence: 0.9999

Sentence: The service was terrible, I will never come back.
Sentiment: NEGATIVE, Confidence: 0.9990

Sentence: It's an okay experience, nothing special but not bad either.
Sentiment: POSITIVE, Confidence: 0.9951

Sentence: This is the worst movie I have ever seen.
Sentiment: NEGATIVE, Confidence: 0.9998

Sentence: The food was delicious and the staff was very friendly.
Sentiment: POSITIVE, Confidence: 0.9999

