In [25]:
import nltk
import spacy

In [26]:
# Fetch the Punkt tokenizer data; the call's boolean result is the cell's output.
nltk.download(info_or_id='punkt')


[nltk_data] Downloading package punkt to C:\Users\Silicon
[nltk_data]     computers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
import spacy

# Load the small English pipeline once; the following cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Dependency parse: print every token next to its dependency label.
doc = nlp("I love programming in Python.")
for tok in doc:
    print(f"{tok.text} {tok.dep_}")

I nsubj
love ROOT
programming xcomp
in prep
Python pobj
. punct


In [28]:

# Same dependency-label listing as the previous cell, run against the shared `nlp` pipeline.
doc = nlp("I love programming in Python.")
for word, relation in ((t.text, t.dep_) for t in doc):
    print(word, relation)

I nsubj
love ROOT
programming xcomp
in prep
Python pobj
. punct


In [29]:
from nltk.corpus import wordnet
nltk.download('wordnet')

# `synsets` returns WordNet synsets (word senses) for the query word;
# show the gloss of the first one.
synonyms = wordnet.synsets("good")
first_sense = synonyms[0]
print(first_sense.definition())

benefit


[nltk_data] Downloading package wordnet to C:\Users\Silicon
[nltk_data]     computers\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
#Word similarity (SpaCy)
# NOTE(review): the recorded output echoes the `print(...)` line, which looks like
# residue of a spaCy warning — presumably because the small model ships without
# word vectors. Confirm, and treat the score with caution.
doc1, doc2 = nlp("I have a cat."), nlp("I own a feline.")
score = doc1.similarity(doc2)
print(score)

0.8225613236427307


  print(doc1.similarity(doc2))


In [31]:
# Pragmatics illustration: the literal form is a yes/no question,
# but the intended meaning is a request. The interpretation is hard-coded.
sentence = "Can you open the window?"
interpretation = "This is a polite request."
print(interpretation)

This is a polite request.


In [32]:
# Toy context-dependent action: a hand-written rule keyed on the weather.
context = {"weather": "hot"}
weather_is_hot = context["weather"] == "hot"
if weather_is_hot:
    print("Turn on the fan.")  # In real NLP, AI infers actions based on context!

Turn on the fan.


In [33]:
# Discourse illustration: two sentences that only make sense read together
# ("them" in the second refers back to "keys" in the first).
sentences = [
    "I lost my keys.",
    "I found them in my bag.",
]
for position in range(len(sentences)):
    print(sentences[position])

I lost my keys.
I found them in my bag.


In [34]:
# Simple co-reference
# The resolution is stated by hand; `text` is not analysed by any model here.
text = "John went to the market. He bought some fruits."
resolution = "He = John"
print(resolution)

He = John


In [35]:
# Removing stopwords with NLTK
from nltk.corpus import stopwords
nltk.download('stopwords')

# Read the stopword list once and keep it as a set: the original re-read the
# corpus list and scanned it linearly for every single word.
english_stopwords = set(stopwords.words('english'))

words = ["I", "am", "learning", "NLP", "and", "AI"]
# Compare case-insensitively; the original casing is kept in the result.
filtered_words = [word for word in words if word.lower() not in english_stopwords]
print(filtered_words)

['learning', 'NLP', 'AI']


[nltk_data] Downloading package stopwords to C:\Users\Silicon
[nltk_data]     computers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Removing punctuation
import string

# Keep every character that is not an ASCII punctuation mark.
text = "Hello!!! Are you there??"
clean_text = "".join(ch for ch in text if ch not in string.punctuation)
print(clean_text)

Hello Are you there


In [37]:
from nltk.stem import PorterStemmer

# Stemming demo: reduce an inflected form to its stem with the Porter algorithm.
stemmer = PorterStemmer()
stemmed = stemmer.stem("running")
print(stemmed)

run


In [38]:
# Case normalization: print the lowercase form of the text.
text = "HELLO WORLD"
print(str.lower(text))

hello world


In [39]:
# Whitespace normalization: drop leading and trailing spaces before printing.
text = "     NLP is awesome!     "
trimmed = text.strip()
print(trimmed)

NLP is awesome!


In [40]:
import re

# Local part, "@", then a dotted domain. Unlike the earlier `\S+@\S+`, this does
# not swallow punctuation that merely touches the address (a sentence-final
# period, surrounding parentheses, a trailing comma, ...).
EMAIL_PATTERN = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'

text = "Email me at example@gmail.com"
email = re.findall(EMAIL_PATTERN, text)
print(email)

['example@gmail.com']


In [41]:
# Dollar amount with an optional decimal part, so "$40" and "$49.99" are both
# captured in full (the earlier `\$\d+` would cut "$49.99" down to "$49").
PRICE_PATTERN = r'\$\d+(?:\.\d+)?'

text = "The price is $40"
price = re.findall(PRICE_PATTERN, text)
print(price)

['$40']


In [42]:
# Part-of-speech tagging with spaCy: one "token TAG" line per token.
doc = nlp("I will travel tomorrow.")
for tok in doc:
    print(" ".join((tok.text, tok.pos_)))

I PRON
will AUX
travel VERB
tomorrow NOUN
. PUNCT


In [43]:
import nltk

# Tokenizer and POS-tagger data for `word_tokenize` / `pos_tag`.
# Recent NLTK releases (3.9+) look these models up under the "punkt_tab" and
# "averaged_perceptron_tagger_eng" names, older releases under the original
# names, so both are requested to keep the notebook working on either.
NLTK_TAGGING_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

# A list (not a generator) so every download is attempted even if one fails;
# the combined success flag is the cell's output.
all([nltk.download(resource) for resource in NLTK_TAGGING_RESOURCES])


[nltk_data] Downloading package punkt to C:\Users\Silicon
[nltk_data]     computers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Silicon
[nltk_data]     computers\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [47]:
# Disabled example: tokenize a sentence with NLTK and print its POS tags.
# NOTE(review): presumably commented out after a runtime failure (e.g. a missing
# NLTK resource) — confirm, then either re-enable this cell or delete it.
# from nltk import word_tokenize, pos_tag

# sentence = "He is playing football."
# tokens = word_tokenize(sentence)
# print(pos_tag(tokens))



In [48]:
# Same spaCy POS listing as the earlier cell, re-run on the same sentence.
doc = nlp("I will travel tomorrow.")
for tok in doc:
    print(f"{tok.text} {tok.pos_}")

I PRON
will AUX
travel VERB
tomorrow NOUN
. PUNCT


In [49]:
## 13. Named Entity Recognition (NER)
# Print each entity span found by the pipeline together with its label.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for span in doc.ents:
    print(f"{span.text} {span.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


In [50]:
# Another
# Note: `text` holds the parsed spaCy document here, not a plain string.
text = nlp("Barack Obama was the president of the United States.")
for span_text, span_label in ((e.text, e.label_) for e in text.ents):
    print(span_text, span_label)

Barack Obama PERSON
the United States GPE


In [55]:
# 🧩 Chunking and Chinking

## 14. Chunking
# Noun-phrase rule: optional determiner, any number of adjectives, then a noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)

# The sentence is already POS-tagged, so no tokenizer or tagger data is needed.
sentence = [("The", "DT"), ("big", "JJ"), ("dog", "NN")]
tree = cp.parse(sentence)

# Show the chunk tree as text. `tree.draw()` opens a separate GUI window that
# blocks the kernel until it is closed and fails where no display is
# available, so it cannot be part of a Restart-and-Run-All notebook.
print(tree)

In [58]:
# Disabled example: tokenize and POS-tag raw text with NLTK, then chunk noun
# phrases with a RegexpParser and draw the resulting tree.
# NOTE(review): presumably disabled because tokenizing/tagging or `tree.draw()`
# failed in this environment — confirm, then re-enable or delete.
# from nltk import word_tokenize, pos_tag, RegexpParser

# text = "The quick brown fox jumps"
# tokens = word_tokenize(text)
# tagged = pos_tag(tokens)

# grammar = r"NP: {<DT>?<JJ>*<NN>}"
# cp = RegexpParser(grammar)

# tree = cp.parse(tagged)
# tree.draw()


In [59]:
# Disabled variant of the chunking example above, written with `nltk.`-qualified calls.
# NOTE(review): near-duplicate of the previous disabled cell — keep at most one.
# # Chunk a different way
# grammar = r"NP: {<DT>?<JJ>*<NN>}"
# cp = nltk.RegexpParser(grammar)
# tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps"))
# tree = cp.parse(tagged)
# tree.draw()

In [60]:
## 15. Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Each word is lemmatized with an explicit part of speech ("v" verb, "a" adjective).
for word, part_of_speech in (("running", "v"), ("better", "a")):
    print(lemmatizer.lemmatize(word, pos=part_of_speech))

run
good


In [61]:

## 16. WordNet

# Relies on `wordnet` imported in an earlier cell.
synonyms = wordnet.synsets("computer")
first_synset = synonyms[0]

# Gloss of the first sense, followed by its list of usage examples.
print(first_synset.definition())
print(first_synset.examples())

a machine for performing calculations automatically
[]


In [62]:
# 🎯 Words as Features (Bag of Words Model)

## 17. BoW Example
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and count term occurrences per document in one step.
bow_documents = ["I love NLP", "NLP is amazing"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(bow_documents)

# Column names first, then the dense document-term count matrix.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
print(X.toarray())

['amazing' 'is' 'love' 'nlp']
[[0 0 1 1]
 [1 1 0 1]]


In [63]:

# Second bag-of-words example; only the dense count matrix is printed.
corpus = ["AI is smart", "Machine learning is part of AI"]
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)
dense_counts = bow.toarray()
print(dense_counts)

[[1 1 0 0 0 0 1]
 [1 1 1 1 1 1 0]]


In [65]:
# 📈 Feature Selection and Extraction

## 18. Feature Selection

# Using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit on two short documents; only the learned vocabulary is printed,
# the weighted matrix `X` itself is not shown.
tfidf_documents = ["I like AI", "AI is the future"]
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(tfidf_documents)
vocabulary_terms = tfidf.get_feature_names_out()
print(vocabulary_terms)

['ai' 'future' 'is' 'like' 'the']


In [66]:
# Chi-square example (Advanced example)
from sklearn.feature_selection import chi2
import numpy as np

# Two samples with three features each, and one class label per sample.
feature_rows = [[0, 1, 0], [1, 0, 1]]
class_labels = [0, 1]
X = np.array(feature_rows)
y = np.array(class_labels)

# `chi2` returns a pair of arrays; the pair is printed as-is.
chi_scores = chi2(X, y)
print(chi_scores)

(array([1., 1., 1.]), array([0.31731051, 0.31731051, 0.31731051]))


In [67]:

# 🧠 Document Similarity

## 19. Document Similarity Example

# NOTE(review): the recorded output echoes the `print(...)` line, which looks like
# residue of a spaCy warning — presumably the small model has no word vectors.
# Confirm before relying on this score.
doc1, doc2 = nlp("Cats are beautiful animals."), nlp("Dogs are wonderful pets.")
score = doc1.similarity(doc2)
print(score)

0.8707045912742615


  print(doc1.similarity(doc2))


In [68]:

# Cosine similarity manually
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["I love AI", "AI loves me"]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Compare the first document against every document (itself included),
# which gives a single row of scores.
first_document = tfidf_matrix[:1]
similarity = cosine_similarity(first_document, tfidf_matrix)
print(similarity)

[[1.         0.26055567]]


In [None]:
# 💬 All-in-One NLP Demonstration Cell (15 NLP Topics, 2 Examples Each)
#
# Self-contained walkthrough: it performs its own imports, downloads the NLTK
# data it needs and loads its own spaCy pipeline, so it does not depend on
# state left behind by earlier cells.

import nltk
import spacy
import string
import re
import matplotlib.pyplot as plt
from nltk.corpus import wordnet, stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk, word_tokenize, RegexpParser

# Setup for NLTK
# Recent NLTK releases (3.9+) look up the tokenizer, tagger and NE-chunker
# models under the "*_tab" / "*_eng" names, older releases under the original
# names; both sets are requested so the cell runs on either.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource)

# Setup for spaCy
nlp = spacy.load("en_core_web_sm")

print("# 1. Syntax")
# spaCy dependency parse, then NLTK POS tags for the same sentence.
doc1 = nlp("She is reading a book.")
for token in doc1:
    print(f"Word: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}")
print(nltk.pos_tag(nltk.word_tokenize("She is reading a book.")))

print("\n# 2. Semantics")
doc2 = nlp("I love machine learning.")
doc3 = nlp("I enjoy AI research.")
print("Similarity:", doc2.similarity(doc3))
# Despite the label, this prints the gloss of the first WordNet synset for "car".
print("WordNet Synonym:", wordnet.synsets("car")[0].definition())

print("\n# 3. Pragmatics")
# Both examples are hand-written rules; nothing is inferred by a model.
print("Can you pass me the salt? → This is a polite request.")
context = {"weather": "rainy"}
if context["weather"] == "rainy":
    print("Take an umbrella.")

print("\n# 4. Discourse")
sents = ["She went to the market.", "She bought some fruits."]
for s in sents:
    print(s)
print("He went to the park. He played football. → Linking 'He' across sentences.")

print("\n# 5. NLP Curves and Future Directions")
# NOTE(review): the numbers below are hard-coded and look illustrative rather
# than measured — confirm before presenting them as data.
years = [2000, 2010, 2020]
accuracy = [60, 80, 95]
plt.plot(years, accuracy, marker='o')
plt.title('NLP Accuracy Over the Years')
plt.xlabel('Year')
plt.ylabel('Accuracy (%)')
plt.show()

# NOTE(review): these years do not match publication dates (e.g. LSTM dates
# to 1997, GPT-3 to 2020) — presumably placeholders; verify.
models = ["RNN", "LSTM", "BERT", "GPT-3"]
years = [2000, 2010, 2018, 2023]
plt.bar(models, years)
# With the default zero baseline every bar is ~2000 units tall and the four
# bars look identical; zoom the axis onto the range that actually varies.
plt.ylim(1990, 2030)
plt.title('Rise of Transformer Models')
plt.ylabel("Year")
plt.show()

print("\n# 6. Tokenization (NLTK and spaCy)")
tokens_nltk = nltk.word_tokenize("I love programming!")
print("NLTK Tokens:", tokens_nltk)
doc4 = nlp("I enjoy machine learning.")
print("spaCy Tokens:", [token.text for token in doc4])

print("\n# 7. Noise Removal")
tokens = nltk.word_tokenize("This is an example sentence.")
# Read the stopword list once and use set membership instead of re-reading
# and linearly scanning the list for every token.
english_stopwords = set(stopwords.words('english'))
filtered = [w for w in tokens if w.lower() not in english_stopwords]
print("Without Stopwords:", filtered)
doc5 = nlp("Hello!!! How are you doing?")
# Drop tokens made up only of punctuation characters. Checking character by
# character matters: `token.text in string.punctuation` is a substring test,
# so multi-character tokens such as "..." or "!?" would slip through.
cleaned = [
    token.text
    for token in doc5
    if not all(ch in string.punctuation for ch in token.text)
]
print("Without Punctuation:", cleaned)

print("\n# 8. Word & Sentence Tokenization")
doc6 = nlp("This is a sentence. Here's another one.")
print("spaCy Sentence Tokens:", [sent.text for sent in doc6.sents])
print("NLTK Word Tokens:", nltk.word_tokenize("NLTK makes tokenization easy."))

print("\n# 9. Word Segmentation")
doc7 = nlp("I am learning Natural Language Processing.")
print("spaCy Word Segmentation:", [token.text for token in doc7])
print("NLTK Word Segmentation:", nltk.word_tokenize("Natural Language Processing is awesome."))

print("\n# 10. Stemming")
stemmer = PorterStemmer()
print("Stemmed 'running':", stemmer.stem("running"))
print("Stemmed 'flies':", stemmer.stem("flies"))

print("\n# 11. Text Normalization")
print("Lowercased:", "THIS IS A SAMPLE TEXT.".lower())
print("Trimmed Spaces:", "    NLP is amazing!    ".strip())

print("\n# 12. Regular Expressions")
text1 = "Contact me at example@email.com for details."
# Local part, "@", dotted domain — does not swallow adjacent punctuation the
# way `\S+@\S+` does when an address ends a sentence.
emails = re.findall(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', text1)
print("Emails found:", emails)
text2 = "The total price is $49.99."
# Optional decimal part, so whole-dollar amounts such as "$40" match as well.
price = re.findall(r'\$\d+(?:\.\d+)?', text2)
print("Price found:", price)

print("\n# 13. POS Tagging")
doc8 = nlp("The cat is sitting on the mat.")
print("spaCy POS Tags:")
for token in doc8:
    print(f"{token.text}: {token.pos_}")
tokens = nltk.word_tokenize("He is reading a book.")
tags = nltk.pos_tag(tokens)
print("NLTK POS Tags:", tags)

print("\n# 14. Named Entity Recognition (NER)")
doc9 = nlp("Barack Obama was born in Hawaii.")
print("spaCy NER:")
for ent in doc9.ents:
    print(ent.text, ent.label_)
sentence = "Barack Obama was the president."
tree = ne_chunk(pos_tag(word_tokenize(sentence)))
print("NLTK NER:", tree)

print("\n# 15. Chunking and Chinking")
sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(tokens)
# Noun-phrase rule: optional determiner, any number of adjectives, then a noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = RegexpParser(grammar)
tree = cp.parse(tags)
print("NLTK Chunk Tree (Not visualized here)")
# The tree was previously computed and then discarded; show its text form.
print(tree)

doc10 = nlp(sentence)
print("spaCy Noun Chunks:")
for chunk in doc10.noun_chunks:
    print(chunk.text)
