In [7]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import pipeline, BertTokenizer, BertModel
import torch
from spacy import displacy

nltk.download('punkt_tab')
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text
text = "Two days ago, at midnight, the god Apollo had visited Lucia. He had offered her a choice: gain the ability to see what will happen, but lose the ability to lie. So, she had to tell her master everything."

# Initialize NLTK tools
nltk_lemmatizer = WordNetLemmatizer()
nltk_stopwords = set(stopwords.words("english"))

# Tokenize, Lemmatize, and Remove Stopwords using NLTK
nltk_tokens = word_tokenize(text)
nltk_lemmatized = [nltk_lemmatizer.lemmatize(token) for token in nltk_tokens]
nltk_filtered = [token for token in nltk_tokens if token.lower() not in nltk_stopwords]

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Tokenize, Lemmatize, and Remove Stopwords using spaCy
spacy_tokens = [token.text for token in doc]
spacy_lemmatized = [token.lemma_ for token in doc]
spacy_filtered = [token.text for token in doc if not token.is_stop]

# Print Results
print("NLTK - Tokenized:", nltk_tokens)
print("spaCy - Tokenized:", spacy_tokens)

print("\nNLTK - Lemmatized:", nltk_lemmatized)
print("spaCy - Lemmatized:", spacy_lemmatized)

print("\nNLTK - Stopword Removed:", nltk_filtered)
print("spaCy - Stopword Removed:", spacy_filtered)

# Named Entity Recognition (NER) с использованием spaCy
#spacy_ner = [(ent.text, ent.label_) for ent in doc.ents]
#print("\nNamed Entity Recognition (NER):", spacy_ner)
for ent in doc.ents:
    print(f"\n{ent.text} - {ent.label_}")

# Визуализируем Named Entities с помощью displacy
displacy.render(doc, style='ent', page=True)

# Токенизация и извлечение эмбеддингов с использованием BERT (transformers)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Токенизация текста
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

# Извлекаем эмбеддинги для каждого токена
embeddings = outputs.last_hidden_state
print("\nWord Embeddings for Tokens (BERT):")
print(embeddings)

with torch.no_grad():  # Disable gradient calculation (saves memory)
    outputs = model(**inputs)

print("\nShape of hidden states:", embeddings.shape)


# Анализ настроений с использованием Hugging Face pipeline
sentiment_analyzer = pipeline('sentiment-analysis')

# Пример текста для анализа настроений
sentiment_result = sentiment_analyzer("I love programming with Python!")

# Выводим результат анализа настроений
print("\nSentiment Analysis Result:", sentiment_result)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NLTK - Tokenized: ['Two', 'days', 'ago', ',', 'at', 'midnight', ',', 'the', 'god', 'Apollo', 'had', 'visited', 'Lucia', '.', 'He', 'had', 'offered', 'her', 'a', 'choice', ':', 'gain', 'the', 'ability', 'to', 'see', 'what', 'will', 'happen', ',', 'but', 'lose', 'the', 'ability', 'to', 'lie', '.', 'So', ',', 'she', 'had', 'to', 'tell', 'her', 'master', 'everything', '.']
spaCy - Tokenized: ['Two', 'days', 'ago', ',', 'at', 'midnight', ',', 'the', 'god', 'Apollo', 'had', 'visited', 'Lucia', '.', 'He', 'had', 'offered', 'her', 'a', 'choice', ':', 'gain', 'the', 'ability', 'to', 'see', 'what', 'will', 'happen', ',', 'but', 'lose', 'the', 'ability', 'to', 'lie', '.', 'So', ',', 'she', 'had', 'to', 'tell', 'her', 'master', 'everything', '.']

NLTK - Lemmatized: ['Two', 'day', 'ago', ',', 'at', 'midnight', ',', 'the', 'god', 'Apollo', 'had', 'visited', 'Lucia', '.', 'He', 'had', 'offered', 'her', 'a', 'choice', ':', 'gain', 'the', 'ability', 'to', 'see', 'what', 'will', 'happen', ',', 'but', '


Word Embeddings for Tokens (BERT):
tensor([[[-0.0774,  0.0195, -0.4101,  ...,  0.3049,  0.0917,  0.5279],
         [ 0.3108, -0.3685, -0.2091,  ...,  0.1433,  0.5911,  0.1472],
         [-0.0216, -0.3929,  0.1241,  ..., -0.4278, -0.2693, -0.3701],
         ...,
         [ 0.0381, -0.2653, -0.3706,  ...,  0.8441, -0.1329,  0.8563],
         [-0.0147, -0.4700, -0.2509,  ...,  0.5974,  0.3383, -0.5641],
         [-0.2832, -0.0934, -0.2271,  ...,  0.6953, -0.2552,  0.0585]]],
       grad_fn=<NativeLayerNormBackward0>)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.



Shape of hidden states: torch.Size([1, 49, 768])


Device set to use cpu



Sentiment Analysis Result: [{'label': 'POSITIVE', 'score': 0.9994648098945618}]
