In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Sample text
text = "Natural Language Processing is fun. NLP enables computers to understand human language."

# Tokenization
nltk.download('punkt')
tokens = word_tokenize(text)

# Frequency distribution
freq_dist = FreqDist(tokens)

# Plotting
plt.figure(figsize=(10, 5))
freq_dist.plot(10, title="Word Frequency Distribution")


In [1]:
import spacy

# Load English model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "SpaCy is a powerful NLP library."

# Dependency parsing
doc = nlp(text)
for token in doc:
    print(f"{token.text} -> {token.dep_} -> {token.head.text}")


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
from textblob import TextBlob

# Sample text
text = "I love using TextBlob for text processing tasks."

# Sentiment analysis
blob = TextBlob(text)
print("Polarity:", blob.sentiment.polarity)
print("Subjectivity:", blob.sentiment.subjectivity)


In [None]:
import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Apple is looking at buying U.K. startup for $1 billion."

doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["NLP is fun and powerful.", "Machine learning and NLP are great fields."]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

print("Feature names:", vectorizer.get_feature_names_out())
print("TF-IDF Matrix:")
print(X.toarray())


Feature names: ['and' 'are' 'fields' 'fun' 'great' 'is' 'learning' 'machine' 'nlp'
 'powerful']
TF-IDF Matrix:
[[0.35520009 0.         0.         0.49922133 0.         0.49922133
  0.         0.         0.35520009 0.49922133]
 [0.29017021 0.4078241  0.4078241  0.         0.4078241  0.
  0.4078241  0.4078241  0.29017021 0.        ]]


In [3]:
import nltk
from nltk.classify import NaiveBayesClassifier

# Training data
train_data = [({"contains_love": True}, "positive"),
               ({"contains_hate": True}, "negative")]

# Train the classifier
classifier = NaiveBayesClassifier.train(train_data)

# Test the classifier
test_data = {"contains_love": True}
print("Prediction:", classifier.classify(test_data))


Prediction: positive


In [4]:
from transformers import pipeline

# Load the pre-trained pipeline
classifier = pipeline("sentiment-analysis")

# Classify text
result = classifier("I love NLP with Hugging Face!")
print(result)


  torch.utils._pytree._register_pytree_node(
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998509883880615}]


In [6]:
# from transformers import pipeline

# # Load the summarizer
# summarizer = pipeline("summarization")

# # Sample text
# text = "Natural Language Processing is an essential field of artificial intelligence. It enables machines to understand human language and perform various tasks, such as text classification, sentiment analysis, and text generation."

# summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
# print(summary)


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding

# Simple RNN Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Bidirectional LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=32),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

# GRU Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=32),
    GRU(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# LSTM Model for Text Generation
model = Sequential([
    Embedding(input_dim=10000, output_dim=64),
    LSTM(128),
    Dense(10000, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, GRU, Dense

# Bi-directional GRU Model
model = Sequential([
    Bidirectional(GRU(32, return_sequences=True)),
    Dense(10, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [12]:
#THEORY
#1.
# NLTK (Natural Language Toolkit) is a comprehensive Python library used for processing and analyzing human language data (text). It provides tools for tasks such as tokenization, stemming, lemmatization, parsing, and sentiment analysis.

In [13]:
# 2. What is SpaCy and How Does It Differ from NLTK?
# SpaCy is a modern NLP library designed for efficiency and scalability, suitable for production applications.
# Differences:

# SpaCy focuses on deep learning integration, speed, and performance, while NLTK is more academic and exploratory.
# NLTK offers more granular control, whereas SpaCy has pre-trained models for tasks like NER and dependency parsing.
# 3. What is the Purpose of TextBlob in NLP?
# TextBlob is a simple NLP library built on top of NLTK. It is mainly used for tasks like part-of-speech tagging, sentiment analysis, noun phrase extraction, and text translation.

# 4. What is Stanford NLP?
# Stanford NLP is a robust NLP library developed by Stanford University. It supports a wide range of tasks like POS tagging, dependency parsing, and named entity recognition (NER) and is known for its high accuracy.

# 5. What are Recurrent Neural Networks (RNNs)?
# RNNs are neural networks designed to process sequential data by maintaining hidden states. They are commonly used in NLP for tasks like text generation, speech recognition, and machine translation.

# 6. What is the Main Advantage of Using LSTM over RNN?
# LSTM (Long Short-Term Memory) networks overcome RNNs' vanishing gradient problem, enabling them to capture long-range dependencies in sequences.

# 7. What are Bi-directional LSTMs, and How Do They Differ from Standard LSTMs?
# Bi-directional LSTMs process input sequences in both forward and backward directions, improving the model's understanding of the context from both past and future information.

# 8. What is the Purpose of a Stacked LSTM?
# A stacked LSTM is a multi-layered LSTM architecture that can capture complex patterns in sequential data by stacking multiple LSTM layers.

# 9. How Does GRU (Gated Recurrent Unit) Differ from LSTM?
# GRU is a simpler variation of LSTM with fewer parameters. It combines the forget and input gates into a single update gate, making it computationally more efficient.

# 10. What are the Key Features of NLTK's Tokenization Process?
# Supports word and sentence tokenization.
# Handles complex patterns like punctuation and special characters.
# Provides customizable regex-based tokenizers.
# 11. How Do You Perform Named Entity Recognition (NER) Using SpaCy?

# import spacy

# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Apple is looking at buying a startup in the UK for $1 billion.")
# for ent in doc.ents:
#     print(f"{ent.text}: {ent.label_}")


In [None]:
# 12. What is Word2Vec and How Does It Represent Words?
# Word2Vec is a neural network-based model that represents words as dense vectors in a continuous vector space, capturing semantic relationships between words.

# 13. Difference Between Bag of Words (BoW) and Word2Vec
# BoW: Represents text as a sparse vector of word frequencies without semantic relationships.
# Word2Vec: Encodes semantic meanings, representing words as dense vectors based on their contexts.

# 14. How Does TextBlob Handle Sentiment Analysis?
# TextBlob calculates sentiment using polarity (range -1 to 1) and subjectivity (range 0 to 1).

# 15. How Would You Implement Text Preprocessing Using NLTK?
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer

# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("wordnet")

# text = "NLTK is a powerful tool for text preprocessing in NLP."

# Tokenization
# tokens = word_tokenize(text)

# # Remove stopwords
# stop_words = set(stopwords.words("english"))
# filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# # Lemmatization
# lemmatizer = WordNetLemmatizer()
# lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

# print("Preprocessed Text:", lemmatized_tokens)

# 16. How Do You Train a Custom NER Model Using SpaCy?
# To train a custom NER model, you must provide labeled training data and fine-tune SpaCy's existing pipeline. This involves defining entities and using nlp.update() for training.

# 17. What is the Role of the Attention Mechanism in LSTMs and GRUs?
# The attention mechanism allows models to focus on specific parts of input sequences, improving performance in tasks like machine translation and text summarization.

# 18. Difference Between Tokenization and Lemmatization in NLP
# Tokenization: Splits text into smaller units (words or sentences).
# Lemmatization: Reduces words to their base form (lemma).

# 19. How Do You Perform Text Normalization in NLP?
# Text normalization involves:

# Lowercasing text
# Removing punctuation
# Expanding contractions
# Lemmatization or stemming

# 20. What is the Purpose of Frequency Distribution in NLP?
# It helps analyze the most common words in a corpus, identify patterns, and extract meaningful insights.

# 21. What are Co-occurrence Vectors in NLP?
# Co-occurrence vectors capture the relationship between words by measuring how often they appear together in a given context.

# 22. How is Word2Vec Used to Find the Relationship Between Words?
# Word2Vec uses cosine similarity between vectors to measure the semantic similarity between words.

