In [1]:
#SETUP & INSTALLATION

import nltk
import spacy
import sklearn
import pandas as pd
import numpy as np

print(f"NLTK version: {nltk.__version__}")
print(f"spaCy version: {spacy.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")

# Test spaCy model
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world!")
print(f"spaCy model loaded: {nlp.meta['name']}")


NLTK version: 3.9.2
spaCy version: 3.8.11
scikit-learn version: 1.8.0
spaCy model loaded: core_web_sm


In [2]:
#Ex1
#1.1

import nltk
# Download required NLTK data (run once)
nltk.data.path.append(r"C:\Users\prosi\AppData\Roaming\nltk_data")

from nltk.tokenize import sent_tokenize


# nltk.download('punkt')

# Sample text
text = """Natural Language Processing is fascinating. It enables computers
to understand human language.
Dr. Smith works at N.A.S.A. on text analysis. He said, "NLP is the
future!"
What do you think? Visit www.nlp.org for more info."""

# Tokenize into sentences
sentences = sent_tokenize(text)
print(f"Number of sentences: {len(sentences)}\n")
for i, sent in enumerate(sentences, 1):
 print(f"Sentence {i}: {sent}")

Number of sentences: 7

Sentence 1: Natural Language Processing is fascinating.
Sentence 2: It enables computers
to understand human language.
Sentence 3: Dr. Smith works at N.A.S.A.
Sentence 4: on text analysis.
Sentence 5: He said, "NLP is the
future!"
Sentence 6: What do you think?
Sentence 7: Visit www.nlp.org for more info.


In [None]:
#1.2
from nltk.tokenize import word_tokenize
sentence = "Don't forget: pre-processing costs $100-$200! Email john@example.com"

# Tokenize into words
tokens = word_tokenize(sentence)
print(f"Number of tokens: {len(tokens)}\n")
print("Tokens:", tokens)


Number of tokens: 15

Tokens: ['Do', "n't", 'forget', ':', 'pre-processing', 'costs', '$', '100-', '$', '200', '!', 'Email', 'john', '@', 'example.com']


In [None]:
#1.3
from nltk.tokenize import word_tokenize, wordpunct_tokenize,TreebankWordTokenizer
text = "We're analyzing BERT's performance on GPT-3.5. Wow!"

# Different tokenizers
standard = word_tokenize(text)
wordpunct = wordpunct_tokenize(text)
treebank = TreebankWordTokenizer().tokenize(text)
print("Standard word_tokenize:")
print(standard)
print(f"\nWordPunct tokenizer:")
print(wordpunct)
print(f"\nTreebank tokenizer:")
print(treebank)


Standard word_tokenize:
['We', "'re", 'analyzing', 'BERT', "'s", 'performance', 'on', 'GPT-3.5', '.', 'Wow', '!']

WordPunct tokenizer:
['We', "'", 're', 'analyzing', 'BERT', "'", 's', 'performance', 'on', 'GPT', '-', '3', '.', '5', '.', 'Wow', '!']

Treebank tokenizer:
['We', "'re", 'analyzing', 'BERT', "'s", 'performance', 'on', 'GPT-3.5.', 'Wow', '!']


In [5]:
#1.4
import spacy
nlp = spacy.load("en_core_web_sm")
text = "Apple Inc. is looking at buying U.K. startup for $1 billion. CEO Tim Cook confirmed it."
doc = nlp(text)
print("Tokens with POS tags and lemmas:\n")
for token in doc:
 print(f"{token.text:15} | POS: {token.pos_:8} | Lemma:{token.lemma_:15} | Is_alpha: {token.is_alpha}")

Tokens with POS tags and lemmas:

Apple           | POS: PROPN    | Lemma:Apple           | Is_alpha: True
Inc.            | POS: PROPN    | Lemma:Inc.            | Is_alpha: False
is              | POS: AUX      | Lemma:be              | Is_alpha: True
looking         | POS: VERB     | Lemma:look            | Is_alpha: True
at              | POS: ADP      | Lemma:at              | Is_alpha: True
buying          | POS: VERB     | Lemma:buy             | Is_alpha: True
U.K.            | POS: PROPN    | Lemma:U.K.            | Is_alpha: False
startup         | POS: VERB     | Lemma:startup         | Is_alpha: True
for             | POS: ADP      | Lemma:for             | Is_alpha: True
$               | POS: SYM      | Lemma:$               | Is_alpha: False
1               | POS: NUM      | Lemma:1               | Is_alpha: False
billion         | POS: NUM      | Lemma:billion         | Is_alpha: True
.               | POS: PUNCT    | Lemma:.               | Is_alpha: False
CEO         

In [None]:
#EXERCISE 1: Your Turn ?????????
text = """
The COVID-19 pandemic started in 2019-2020. Dr. Johnson said, "We're
making progress!"
The vaccine costs €50-€100 in the E.U. Visit https://who.int for updates.
"""

from nltk.tokenize import sent_tokenize, word_tokenize

# 1. Sentence count
sentences = sent_tokenize(text)
print(f"Sentences: {len(sentences)}")

# 2. Word tokens
tokens = word_tokenize(text)
print(f"Total tokens: {len(tokens)}")

# 3. Unique tokens
unique_tokens = set(tokens)
print(f"Unique tokens (types): {len(unique_tokens)}")

# 4. Check URL
print(f"\nURL handling:")
print([t for t in tokens if 'http' in t.lower()])

Sentences: 4
Total tokens: 33
Unique tokens (types): 29

URL handling:
['https']


In [7]:
#EXERCISE 2: Bag-of-Words (BoW)
#2.1 Manual BoW Construction

from collections import Counter
import pandas as pd

# Sample documents
documents = [
 "The cat sat on the mat",
 "The dog sat on the log",
 "Cats and dogs are enemies"
]

# Tokenize all documents
all_tokens = []
for doc in documents:
 tokens = word_tokenize(doc.lower())
 all_tokens.append(tokens)
print("Tokenized documents:")
for i, tokens in enumerate(all_tokens):
 print(f"Doc {i+1}: {tokens}")

# Build vocabulary
vocabulary = sorted(set([token for doc in all_tokens for token in doc]))
print(f"\nVocabulary (|V| = {len(vocabulary)}):")
print(vocabulary)

# Create BoW matrix
bow_matrix = []
for tokens in all_tokens:
 counts = Counter(tokens)
 bow_vector = [counts.get(word, 0) for word in vocabulary]
 bow_matrix.append(bow_vector)

# Display as DataFrame
df = pd.DataFrame(bow_matrix, columns=vocabulary, index=[f"Doc {i+1}" for
i in range(len(documents))])
print("\nBag-of-Words Matrix:")
print(df)

Tokenized documents:
Doc 1: ['the', 'cat', 'sat', 'on', 'the', 'mat']
Doc 2: ['the', 'dog', 'sat', 'on', 'the', 'log']
Doc 3: ['cats', 'and', 'dogs', 'are', 'enemies']

Vocabulary (|V| = 12):
['and', 'are', 'cat', 'cats', 'dog', 'dogs', 'enemies', 'log', 'mat', 'on', 'sat', 'the']

Bag-of-Words Matrix:
       and  are  cat  cats  dog  dogs  enemies  log  mat  on  sat  the
Doc 1    0    0    1     0    0     0        0    0    1   1    1    2
Doc 2    0    0    0     0    1     0        0    1    0   1    1    2
Doc 3    1    1    0     1    0     1        1    0    0   0    0    0


In [None]:
#2.2 scikit-learn CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
documents = [
 "The cat sat on the mat",
 "The dog sat on the log",
 "Cats and dogs are enemies"
]

# Create CountVectorizer
vectorizer = CountVectorizer(lowercase=True)

# Fit and transform
bow_matrix = vectorizer.fit_transform(documents)

# Get vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Convert to DataFrame
df = pd.DataFrame(bow_matrix.toarray(), columns=vocabulary, index=[f"Doc{i+1}" for i in range(len(documents))])
print("Bag-of-Words Matrix (CountVectorizer):")
print(df)
print(f"\nVocabulary size: {len(vocabulary)}")
print(f"Matrix shape: {bow_matrix.shape}")
print(f"Matrix sparsity: {100 * (1 - bow_matrix.nnz / (bow_matrix.shape[0]
* bow_matrix.shape[1])):.1f}%")

Bag-of-Words Matrix (CountVectorizer):
      and  are  cat  cats  dog  dogs  enemies  log  mat  on  sat  the
Doc1    0    0    1     0    0     0        0    0    1   1    1    2
Doc2    0    0    0     0    1     0        0    1    0   1    1    2
Doc3    1    1    0     1    0     1        1    0    0   0    0    0

Vocabulary size: 12
Matrix shape: (3, 12)
Matrix sparsity: 58.3%


In [None]:
# BoW with N-grams
# Unigrams only
vectorizer_unigram = CountVectorizer(ngram_range=(1, 1))
bow_unigram = vectorizer_unigram.fit_transform(documents)

# Unigrams + Bigrams
vectorizer_bigram = CountVectorizer(ngram_range=(1, 2))
bow_bigram = vectorizer_bigram.fit_transform(documents)
print("Unigram vocabulary:")
print(vectorizer_unigram.get_feature_names_out())
print(f"Size: {len(vectorizer_unigram.get_feature_names_out())}")
print("\nUnigram + Bigram vocabulary:")
print(vectorizer_bigram.get_feature_names_out())
print(f"Size: {len(vectorizer_bigram.get_feature_names_out())}")

# Display bigram matrix
df_bigram = pd.DataFrame(
 bow_bigram.toarray(),
 columns=vectorizer_bigram.get_feature_names_out(),
 index=[f"Doc {i+1}" for i in range(len(documents))]
)
print("\nBigram BoW Matrix:")
print(df_bigram)


Unigram vocabulary:
['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'enemies' 'log' 'mat' 'on' 'sat'
 'the']
Size: 12

Unigram + Bigram vocabulary:
['and' 'and dogs' 'are' 'are enemies' 'cat' 'cat sat' 'cats' 'cats and'
 'dog' 'dog sat' 'dogs' 'dogs are' 'enemies' 'log' 'mat' 'on' 'on the'
 'sat' 'sat on' 'the' 'the cat' 'the dog' 'the log' 'the mat']
Size: 24

Bigram BoW Matrix:
       and  and dogs  are  are enemies  cat  cat sat  cats  cats and  dog  \
Doc 1    0         0    0            0    1        1     0         0    0   
Doc 2    0         0    0            0    0        0     0         0    1   
Doc 3    1         1    1            1    0        0     1         1    0   

       dog sat  ...  mat  on  on the  sat  sat on  the  the cat  the dog  \
Doc 1        0  ...    1   1       1    1       1    2        1        0   
Doc 2        1  ...    0   1       1    1       1    2        0        1   
Doc 3        0  ...    0   0       0    0       0    0        0        0   

       the l

In [None]:
#EXERCISE 2: Your Turn

reviews = [
 "This movie is amazing and wonderful",
 "Terrible film, waste of time",
 "Great acting but boring plot",
 "Amazing cinematography and great story"
]
# TODO: Your code here
# 1. Create BoW matrix with CountVectorizer
# 2. Find the most common word across all reviews
# 3. Calculate vocabulary size
# 4. Compute sparsity

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# 1. Create BoW
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(reviews)

# 2. Most common word
word_counts = bow.toarray().sum(axis=0)
vocab = vectorizer.get_feature_names_out()
most_common_idx = word_counts.argmax()
print(f"Most common word: '{vocab[most_common_idx]}' (count:{word_counts[most_common_idx]})")

# 3. Vocabulary size
print(f"Vocabulary size: {len(vocab)}")

# 4. Sparsity
sparsity = 100 * (1 - bow.nnz / (bow.shape[0] * bow.shape[1]))
print(f"Sparsity: {sparsity:.1f}%")

# Display matrix
df = pd.DataFrame(bow.toarray(), columns=vocab, index=[f"Review {i+1}" for
i in range(len(reviews))])
print("\nBoW Matrix:")
print(df)


Most common word: 'amazing' (count:2)
Vocabulary size: 18
Sparsity: 70.8%

BoW Matrix:
          acting  amazing  and  boring  but  cinematography  film  great  is  \
Review 1       0        1    1       0    0               0     0      0   1   
Review 2       0        0    0       0    0               0     1      0   0   
Review 3       1        0    0       1    1               0     0      1   0   
Review 4       0        1    1       0    0               1     0      1   0   

          movie  of  plot  story  terrible  this  time  waste  wonderful  
Review 1      1   0     0      0         0     1     0      0          1  
Review 2      0   1     0      0         1     0     1      1          0  
Review 3      0   0     1      0         0     0     0      0          0  
Review 4      0   0     0      1         0     0     0      0          0  


In [None]:
#EXERCISE 3: TF-IDF

import numpy as np
import math
documents = [
 "the cat sat on the mat",
 "the dog sat on the log",
 "cats and dogs are friends"
]
# Tokenize
tokenized = [doc.lower().split() for doc in documents]

# Build vocabulary
vocab = sorted(set([word for doc in tokenized for word in doc]))
print(f"Vocabulary: {vocab}\n")

# Calculate TF (Term Frequency)
def calculate_tf(doc_tokens, vocab):
 tf = {}
 doc_length = len(doc_tokens)
 for word in vocab:
  tf[word] = doc_tokens.count(word) / doc_length
 return tf

# Calculate IDF (Inverse Document Frequency)
def calculate_idf(tokenized_docs, vocab):
 idf = {}
 N = len(tokenized_docs)
 for word in vocab:
  doc_count = sum(1 for doc in tokenized_docs if word in doc)
 idf[word] = math.log(N / doc_count)
 return idf

# Calculate TF for first document
tf_doc1 = calculate_tf(tokenized[0], vocab)
print("TF for Document 1:")
for word, score in sorted(tf_doc1.items(), key=lambda x: x[1],
reverse=True)[:5]:
 print(f" {word:10s}: {score:.3f}")
 
# Calculate IDF
idf = calculate_idf(tokenized, vocab)
print("\nIDF scores:")
for word, score in sorted(idf.items(), key=lambda x: x[1], reverse=True) [:5]:print(f" {word:10s}: {score:.3f}")
 
# Calculate TF-IDF for Document 1
print("\nTF-IDF for Document 1:")
tfidf_doc1 = {word: tf_doc1[word] * idf[word] for word in vocab} ##tf_doc1.get(word, 0) * idf.get(word, 0)
for word, score in sorted(tfidf_doc1.items(), key=lambda x: x[1],
reverse=True)[:5]:
 print(f" {word:10s}: {score:.3f}")



Vocabulary: ['and', 'are', 'cat', 'cats', 'dog', 'dogs', 'friends', 'log', 'mat', 'on', 'sat', 'the']

TF for Document 1:
 the       : 0.333
 cat       : 0.167
 mat       : 0.167
 on        : 0.167
 sat       : 0.167

IDF scores:
 the       : 0.405

TF-IDF for Document 1:


KeyError: 'and'

In [28]:
#3.2 scikit-learn TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
documents = [
 "the cat sat on the mat",
 "the dog sat on the log",
 "cats and dogs are friends"
]

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(lowercase=True, norm=None) # norm=None for raw TF-IDF
    
# Fit and transform
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Display as DataFrame
vocab = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vocab, index=[f"Doc{i+1}" for i in range(len(documents))])
print("TF-IDF Matrix:")
print(df.round(3))

# Show highest TF-IDF words per document
print("\nTop 3 words per document:")
for i, doc in enumerate(documents):
 tfidf_scores = tfidf_matrix[i].toarray().flatten()
 top_indices = tfidf_scores.argsort()[-3:][::-1]
 top_words = [(vocab[idx], tfidf_scores[idx]) for idx in top_indices]
 print(f"Doc {i+1}: {doc}")
 for word, score in top_words:
  print(f" {word:10s}: {score:.3f}")


TF-IDF Matrix:
        and    are    cat   cats    dog   dogs  friends    log    mat     on  \
Doc1  0.000  0.000  1.693  0.000  0.000  0.000    0.000  0.000  1.693  1.288   
Doc2  0.000  0.000  0.000  0.000  1.693  0.000    0.000  1.693  0.000  1.288   
Doc3  1.693  1.693  0.000  1.693  0.000  1.693    1.693  0.000  0.000  0.000   

        sat    the  
Doc1  1.288  2.575  
Doc2  1.288  2.575  
Doc3  0.000  0.000  

Top 3 words per document:
Doc 1: the cat sat on the mat
 the       : 2.575
 mat       : 1.693
 cat       : 1.693
Doc 2: the dog sat on the log
 the       : 2.575
 log       : 1.693
 dog       : 1.693
Doc 3: cats and dogs are friends
 cats      : 1.693
 are       : 1.693
 dogs      : 1.693


In [31]:
#3.3 Comparing BoW vs TF-IDF

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import matplotlib.pyplot as plt
documents = [
 "Python is a great programming language for data science",
 "Java is also a programming language",
 "Data science requires knowledge of statistics and machine learning",
 "Machine learning is a subset of artificial intelligence"
]

# BoW
bow_vec = CountVectorizer()
bow_matrix = bow_vec.fit_transform(documents)

# TF-IDF
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(documents)

# Compare scores for "programming"
vocab_bow = bow_vec.get_feature_names_out()
vocab_tfidf = tfidf_vec.get_feature_names_out()
word = "programming"
bow_idx = list(vocab_bow).index(word)
tfidf_idx = list(vocab_tfidf).index(word)
print(f"Scores for word '{word}':")
print(f" BoW: {bow_matrix[:, bow_idx].toarray().flatten()}")
print(f" TF-IDF: {tfidf_matrix[:,
tfidf_idx].toarray().flatten().round(3)}")

# Compare common words
word_common = "is"
bow_idx = list(vocab_bow).index(word_common)
tfidf_idx = list(vocab_tfidf).index(word_common)
print(f"\nScores for word '{word_common}':")
print(f" BoW: {bow_matrix[:, bow_idx].toarray().flatten()}")
print(f" TF-IDF: {tfidf_matrix[:,
tfidf_idx].toarray().flatten().round(3)}")


Scores for word 'programming':
 BoW: [1 1 0 0]
 TF-IDF: [0.325 0.413 0.    0.   ]

Scores for word 'is':
 BoW: [1 1 0 1]
 TF-IDF: [0.263 0.334 0.    0.278]


In [32]:
#EXERCISE 3: Your Turn
documents = [
 "Machine learning algorithms require large datasets",
 "Deep learning is a subset of machine learning",
 "Natural language processing uses machine learning",
 "Computer vision applies deep learning techniques"
]
# TODO: Your code here
# 1. Calculate TF-IDF
# 2. Find top 2 most distinctive words per document
# 3. Which word has highest IDF? (appears in fewest documents)

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 1. Calculate TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)
vocab = tfidf.get_feature_names_out()

# 2. Top 2 words per document
print("Top 2 distinctive words per document:")
for i in range(len(documents)):
 scores = tfidf_matrix[i].toarray().flatten()
 top_indices = scores.argsort()[-2:][::-1]
 print(f"Doc {i+1}: {documents[i]}")
 for idx in top_indices:
  print(f" {vocab[idx]:15s}: {scores[idx]:.3f}")

# 3. Highest IDF word
idf_scores = tfidf.idf_
max_idf_idx = idf_scores.argmax()
print(f"\nHighest IDF word: '{vocab[max_idf_idx]}' (IDF:{idf_scores[max_idf_idx]:.3f})")
print("This word appears in the fewest documents")


Top 2 distinctive words per document:
Doc 1: Machine learning algorithms require large datasets
 algorithms     : 0.462
 require        : 0.462
Doc 2: Deep learning is a subset of machine learning
 learning       : 0.461
 subset         : 0.442
Doc 3: Natural language processing uses machine learning
 uses           : 0.462
 processing     : 0.462
Doc 4: Computer vision applies deep learning techniques
 vision         : 0.452
 techniques     : 0.452

Highest IDF word: 'algorithms' (IDF:1.916)
This word appears in the fewest documents


In [33]:
#4.1 Lowercasing and Punctuation Removal
import string
from nltk.tokenize import word_tokenize
text = "Hello World! This is NLP 101. Are you ready? Let's GO!!!"

# Lowercase
text_lower = text.lower()
print(f"Lowercased: {text_lower}")

# Remove punctuation (method 1: translate)
text_no_punct = text_lower.translate(str.maketrans('', '',
string.punctuation))
print(f"No punctuation: {text_no_punct}")

# Remove punctuation (method 2: during tokenization)
tokens = word_tokenize(text_lower)
tokens_alpha = [t for t in tokens if t.isalpha()]
print(f"Tokens (alphabetic only): {tokens_alpha}")


Lowercased: hello world! this is nlp 101. are you ready? let's go!!!
No punctuation: hello world this is nlp 101 are you ready lets go
Tokens (alphabetic only): ['hello', 'world', 'this', 'is', 'nlp', 'are', 'you', 'ready', 'let', 'go']


In [34]:
#4.2 Stop Words Removal

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
text = "This is a sample sentence demonstrating the removal of stop words"

# Tokenize
tokens = word_tokenize(text.lower())
print(f"Original tokens: {tokens}")

# Load stop words
stop_words = set(stopwords.words('english'))
print(f"\nNumber of English stop words: {len(stop_words)}")
print(f"Sample stop words: {list(stop_words)[:10]}")

# Remove stop words
filtered_tokens = [t for t in tokens if t not in stop_words]
print(f"\nFiltered tokens: {filtered_tokens}")
print(f"Removed {len(tokens) - len(filtered_tokens)} stop words")


Original tokens: ['this', 'is', 'a', 'sample', 'sentence', 'demonstrating', 'the', 'removal', 'of', 'stop', 'words']

Number of English stop words: 198
Sample stop words: ['hasn', 'now', 'are', 'its', "she'll", "wouldn't", 'ours', "they'll", 'so', 'up']

Filtered tokens: ['sample', 'sentence', 'demonstrating', 'removal', 'stop', 'words']
Removed 5 stop words


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prosi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
#4.3 Stemming (Porter Stemmer)
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ["running", "runs", "ran", "runner", "easily", "fairly",
"happiness", "happily", "happier"]
print("Word | Stem")
print("-" * 30)
for word in words:
 stem = stemmer.stem(word)
 print(f"{word:15s}| {stem}")


Word | Stem
------------------------------
running        | run
runs           | run
ran            | ran
runner         | runner
easily         | easili
fairly         | fairli
happiness      | happi
happily        | happili
happier        | happier


In [37]:
#4.4 Lemmatization (WordNet)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
words = ["running", "runs", "ran", "runner", "better", "best", "good",
"am", "is", "are"]
print("Word | Lemma (no POS) | Lemma (with POS)")
print("-" * 60)
for word in words:
 lemma_no_pos = lemmatizer.lemmatize(word)
 lemma_verb = lemmatizer.lemmatize(word, pos='v') # verb
 print(f"{word:15s}| {lemma_no_pos:15s}| {lemma_verb}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prosi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prosi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Word | Lemma (no POS) | Lemma (with POS)
------------------------------------------------------------
running        | running        | run
runs           | run            | run
ran            | ran            | run
runner         | runner         | runner
better         | better         | better
best           | best           | best
good           | good           | good
am             | am             | be
is             | is             | be
are            | are            | be


In [38]:
#4.5 Complete Preprocessing Pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
def preprocess_text(text, remove_stopwords=True, lemmatize=True):
 """
 Complete text preprocessing pipeline
 """
 
 # 1. Lowercase
 text = text.lower()

 # 2. Tokenize
 tokens = word_tokenize(text)

 # 3. Remove punctuation and non-alphabetic tokens
 tokens = [t for t in tokens if t.isalpha()]

 # 4. Remove stop words
 if remove_stopwords:
  stop_words = set(stopwords.words('english'))
 tokens = [t for t in tokens if t not in stop_words]

 # 5. Lemmatize
 if lemmatize:
  lemmatizer = WordNetLemmatizer()
 tokens = [lemmatizer.lemmatize(t) for t in tokens]

 return tokens

# Test the pipeline
text = "The children are playing in the park. They're having so much fun!"
print("Original text:")
print(text)
print("\nPreprocessed tokens:")
tokens = preprocess_text(text)
print(tokens)
print("\nReconstructed text:")
print(" ".join(tokens))

Original text:
The children are playing in the park. They're having so much fun!

Preprocessed tokens:
['child', 'playing', 'park', 'much', 'fun']

Reconstructed text:
child playing park much fun


In [40]:
# EXERCISE 4: Your Turn
review = "I absolutely LOVED this movie!!! The acting was great, but the plot was terrible. Would not recommend. 2/10"
# TODO: Your code here
# Requirements:
# 1. Lowercase
# 2. Remove punctuation and numbers
# 3. Tokenize
# 4. Keep stop words (important for sentiment!)
# 5. Lemmatize
# 6. Print before/after


from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
def preprocess_sentiment(text):
 # 1. Lowercase
 text = text.lower()

 # 2. Tokenize
 tokens = word_tokenize(text)

 # 3. Remove punctuation and numbers, keep alphabetic
 tokens = [t for t in tokens if t.isalpha()]

 # 4. NO stop word removal (negations matter for sentiment!)

 # 5. Lemmatize
 lemmatizer = WordNetLemmatizer()
 tokens = [lemmatizer.lemmatize(t) for t in tokens]

 return tokens
print("Original:")
print(review)
tokens = preprocess_sentiment(review)
print("\nProcessed tokens:")
print(tokens)
print(f"\nToken count: {len(tokens)}")



Original:
I absolutely LOVED this movie!!! The acting was great, but the plot was terrible. Would not recommend. 2/10

Processed tokens:
['i', 'absolutely', 'loved', 'this', 'movie', 'the', 'acting', 'wa', 'great', 'but', 'the', 'plot', 'wa', 'terrible', 'would', 'not', 'recommend']

Token count: 17


In [41]:
#EXERCISE 5: Stemming vs Lemmatization Comparison
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import time
text = """
The striped bats are hanging on their feet for best performance.
The studies have shown that running improves cardiovascular health.
The children were playing with better toys than before.
"""
# Tokenize
tokens = word_tokenize(text.lower())
tokens = [t for t in tokens if t.isalpha()]

# Stemming
stemmer = PorterStemmer()
start = time.time()
stems = [stemmer.stem(t) for t in tokens]
stem_time = time.time() - start

# Lemmatization
lemmatizer = WordNetLemmatizer()
start = time.time()
lemmas = [lemmatizer.lemmatize(t) for t in tokens]
lemma_time = time.time() - start

# Compare results
print("Token | Stem | Lemma")
print("-" * 50)
for token, stem, lemma in zip(tokens[:15], stems[:15], lemmas[:15]):
 print(f"{token:15s}| {stem:15s}| {lemma}")
print(f"\nProcessing time:")
print(f"Stemming: {stem_time*1000:.2f}ms")
print(f"Lemmatization: {lemma_time*1000:.2f}ms")
print(f"Speed ratio: {lemma_time/stem_time:.1f}x slower")

# Vocabulary size comparison
print(f"\nVocabulary size:")
print(f"Original: {len(set(tokens))}")
print(f"After stemming: {len(set(stems))}")
print(f"After lemmatization: {len(set(lemmas))}")

Token | Stem | Lemma
--------------------------------------------------
the            | the            | the
striped        | stripe         | striped
bats           | bat            | bat
are            | are            | are
hanging        | hang           | hanging
on             | on             | on
their          | their          | their
feet           | feet           | foot
for            | for            | for
best           | best           | best
performance    | perform        | performance
the            | the            | the
studies        | studi          | study
have           | have           | have
shown          | shown          | shown

Processing time:
Stemming: 1.00ms
Lemmatization: 0.00ms
Speed ratio: 0.0x slower

Vocabulary size:
Original: 27
After stemming: 27
After lemmatization: 27


In [47]:
#6.1 Document Similarity with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with many layers",
    """Natural language processing helps computers understand human
    language""",
    """Computer vision is about teaching computers to see and interpret
    images""",
    "Artificial intelligence includes machine learning and deep learning"
]

# Calculate TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

# Calculate cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix)

# Display similarity matrix
print("Document Similarity Matrix:")
print("(1.0 = identical, 0.0 = no common words)\n")
for i in range(len(documents)):
 print(f"Doc {i+1}:", end=" ")
 for j in range(len(documents)):
  print(f"{similarity_matrix[i][j]:.2f}", end=" ")
 print()

# Find most similar document pairs
print("\nMost similar document pairs:")
for i in range(len(documents)):
 for j in range(i+1, len(documents)):sim = similarity_matrix[i][j]
 if sim > 0.1: # Threshold
  print(f"Doc {i+1} ↔ Doc {j+1}: {sim:.3f}")
  print(f" Doc {i+1}: {documents[i][:50]}...")

Document Similarity Matrix:
(1.0 = identical, 0.0 = no common words)

Doc 1: 1.00 0.07 0.00 0.09 0.52 
Doc 2: 0.07 1.00 0.00 0.00 0.24 
Doc 3: 0.00 0.00 1.00 0.07 0.00 
Doc 4: 0.09 0.00 0.07 1.00 0.08 
Doc 5: 0.52 0.24 0.00 0.08 1.00 

Most similar document pairs:
Doc 1 ↔ Doc 5: 0.516
 Doc 1: Machine learning is a subset of artificial intelli...
Doc 2 ↔ Doc 5: 0.236
 Doc 2: Deep learning uses neural networks with many layer...


In [49]:
#6.2 Keyword Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
document = """
Python is a high-level programming language. It is widely used for web
development,
data science, machine learning, and automation. Python's simple syntax
makes it
ideal for beginners. The Python community is large and supportive. Many
companies
use Python for their backend services and data analysis pipelines.
"""
# Extract keywords using TF-IDF
tfidf = TfidfVectorizer(max_features=10, stop_words='english')
tfidf_matrix = tfidf.fit_transform([document])

# Get feature names and scores
feature_names = tfidf.get_feature_names_out()
scores = tfidf_matrix.toarray()[0]

# Sort by score
keyword_scores = sorted(zip(feature_names, scores), key=lambda x: x[1],reverse=True)
print("Top 10 Keywords:")
for keyword, score in keyword_scores:
 print(f"{keyword:20s}: {score:.3f}")

Top 10 Keywords:
python              : 0.756
data                : 0.378
analysis            : 0.189
automation          : 0.189
backend             : 0.189
beginners           : 0.189
community           : 0.189
companies           : 0.189
high                : 0.189
ideal               : 0.189
