In [11]:
#SETUP & INSTALLATION

import nltk
import spacy
import sklearn
import pandas as pd
import numpy as np

# Report the installed version of each core library in a single write so the
# environment used for the recorded outputs is visible at the top of the notebook.
version_report = (
    f"NLTK version: {nltk.__version__}",
    f"spaCy version: {spacy.__version__}",
    f"scikit-learn version: {sklearn.__version__}",
)
print(*version_report, sep="\n")

# Smoke-test the small English spaCy pipeline: load it, push one short text
# through it, and print the model name stored in the pipeline metadata.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, world!")
print(f"spaCy model loaded: {nlp.meta['name']}")


NLTK version: 3.9.2
spaCy version: 3.8.11
scikit-learn version: 1.8.0
spaCy model loaded: core_web_sm


In [12]:
#Ex1
#1.1 Sentence tokenization with NLTK

import os

import nltk
from nltk.tokenize import sent_tokenize

# Make the per-user Windows NLTK data folder searchable without hard-coding a
# user name. The folder is derived from %APPDATA% and appended only when it is
# not already on the search path, so re-running this cell does not pile up
# duplicate entries. On systems without %APPDATA% nothing is added.
appdata_dir = os.environ.get("APPDATA")
if appdata_dir:
    nltk_data_dir = os.path.join(appdata_dir, "nltk_data")
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)

# sent_tokenize needs the Punkt sentence model. NLTK 3.9 (the version printed
# by the setup cell) loads it from the "punkt_tab" resource, so fetch that one
# only when it cannot be found locally.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab", quiet=True)

# Sample text: contains abbreviations ("Dr.", "N.A.S.A."), a quoted
# exclamation, a question, a URL and hard line breaks inside sentences.
text = """Natural Language Processing is fascinating. It enables computers
to understand human language.
Dr. Smith works at N.A.S.A. on text analysis. He said, "NLP is the
future!"
What do you think? Visit www.nlp.org for more info."""

# Tokenize into sentences and list them, numbered from 1.
sentences = sent_tokenize(text)
print(f"Number of sentences: {len(sentences)}\n")
for i, sent in enumerate(sentences, 1):
    print(f"Sentence {i}: {sent}")

Number of sentences: 7

Sentence 1: Natural Language Processing is fascinating.
Sentence 2: It enables computers
to understand human language.
Sentence 3: Dr. Smith works at N.A.S.A.
Sentence 4: on text analysis.
Sentence 5: He said, "NLP is the
future!"
Sentence 6: What do you think?
Sentence 7: Visit www.nlp.org for more info.


In [13]:
#1.2 Word tokenization with NLTK
from nltk.tokenize import word_tokenize

# One sentence mixing a contraction, a hyphenated word, a price range and an
# e-mail address, so the token list shows how each of them gets split.
sentence = "Don't forget: pre-processing costs $100-$200! Email john@example.com"

# Split the sentence into word-level tokens.
tokens = word_tokenize(sentence)

# Report the token count (followed by a blank line), then the tokens themselves.
print(f"Number of tokens: {len(tokens)}\n")
print("Tokens:", tokens)


Number of tokens: 15

Tokens: ['Do', "n't", 'forget', ':', 'pre-processing', 'costs', '$', '100-', '$', '200', '!', 'Email', 'john', '@', 'example.com']


In [14]:
#1.3 Comparing three NLTK word tokenizers on the same text
from nltk.tokenize import TreebankWordTokenizer, word_tokenize, wordpunct_tokenize

text = "We're analyzing BERT's performance on GPT-3.5. Wow!"

# Run every tokenizer once on the identical input.
standard = word_tokenize(text)
wordpunct = wordpunct_tokenize(text)
treebank = TreebankWordTokenizer().tokenize(text)

# (heading, tokens) pairs in display order. Headings after the first carry a
# leading newline so the sections are separated by a blank line.
comparison = (
    ("Standard word_tokenize:", standard),
    ("\nWordPunct tokenizer:", wordpunct),
    ("\nTreebank tokenizer:", treebank),
)
for heading, token_list in comparison:
    print(heading)
    print(token_list)


Standard word_tokenize:
['We', "'re", 'analyzing', 'BERT', "'s", 'performance', 'on', 'GPT-3.5', '.', 'Wow', '!']

WordPunct tokenizer:
['We', "'", 're', 'analyzing', 'BERT', "'", 's', 'performance', 'on', 'GPT', '-', '3', '.', '5', '.', 'Wow', '!']

Treebank tokenizer:
['We', "'re", 'analyzing', 'BERT', "'s", 'performance', 'on', 'GPT-3.5.', 'Wow', '!']


In [15]:
#1.4 Token-level attributes from a spaCy pipeline
import spacy

# Load the small English pipeline and run it over one two-sentence text.
nlp = spacy.load("en_core_web_sm")
text = "Apple Inc. is looking at buying U.K. startup for $1 billion. CEO Tim Cook confirmed it."
doc = nlp(text)

# One row per token: surface text, coarse part-of-speech tag, lemma, and
# whether the token consists of alphabetic characters only. The text, POS and
# lemma fields are padded to fixed widths so the columns line up.
print("Tokens with POS tags and lemmas:\n")
for token in doc:
    row = f"{token.text:15} | POS: {token.pos_:8} | Lemma:{token.lemma_:15} | Is_alpha: {token.is_alpha}"
    print(row)

Tokens with POS tags and lemmas:

Apple           | POS: PROPN    | Lemma:Apple           | Is_alpha: True
Inc.            | POS: PROPN    | Lemma:Inc.            | Is_alpha: False
is              | POS: AUX      | Lemma:be              | Is_alpha: True
looking         | POS: VERB     | Lemma:look            | Is_alpha: True
at              | POS: ADP      | Lemma:at              | Is_alpha: True
buying          | POS: VERB     | Lemma:buy             | Is_alpha: True
U.K.            | POS: PROPN    | Lemma:U.K.            | Is_alpha: False
startup         | POS: VERB     | Lemma:startup         | Is_alpha: True
for             | POS: ADP      | Lemma:for             | Is_alpha: True
$               | POS: SYM      | Lemma:$               | Is_alpha: False
1               | POS: NUM      | Lemma:1               | Is_alpha: False
billion         | POS: NUM      | Lemma:billion         | Is_alpha: True
.               | POS: PUNCT    | Lemma:.               | Is_alpha: False
CEO         

In [None]:
#EXERCISE 1: Your Turn
from nltk.tokenize import sent_tokenize, word_tokenize

# Practice text: hyphenated terms, a year range, an abbreviation, a quoted
# exclamation, a currency range and a URL.
text = """
The COVID-19 pandemic started in 2019-2020. Dr. Johnson said, "We're
making progress!"
The vaccine costs €50-€100 in the E.U. Visit https://who.int for updates.
"""

# 1. How many sentences does the sentence tokenizer find?
sentences = sent_tokenize(text)
print(f"Sentences: {len(sentences)}")

# 2. How many word tokens are there in total?
tokens = word_tokenize(text)
print(f"Total tokens: {len(tokens)}")

# 3. How many distinct tokens (types)? The comparison is case-sensitive.
unique_tokens = set(tokens)
print(f"Unique tokens (types): {len(unique_tokens)}")

# 4. What happened to the URL? Show every token whose lower-cased form
#    contains "http".
url_like_tokens = [t for t in tokens if "http" in t.lower()]
print("\nURL handling:")
print(url_like_tokens)

Sentences: 4
Total tokens: 33
Unique tokens (types): 29

URL handling:
['https']


In [19]:
#EXERCISE 2: Bag-of-Words (BoW)
#2.1 Manual BoW Construction

from collections import Counter

import pandas as pd
# Imported here so the cell does not depend on an earlier cell having run.
from nltk.tokenize import word_tokenize

# Sample documents
documents = [
    "The cat sat on the mat",
    "The dog sat on the log",
    "Cats and dogs are enemies",
]

# Lower-case and tokenize every document. A comprehension keeps the loop
# variable local, so the `doc` and `tokens` names bound by earlier cells are
# not overwritten by this cell.
all_tokens = [word_tokenize(document.lower()) for document in documents]

print("Tokenized documents:")
for doc_number, doc_tokens in enumerate(all_tokens, start=1):
    print(f"Doc {doc_number}: {doc_tokens}")

# Build vocabulary: every distinct token across all documents, sorted so the
# matrix columns come out in a stable order.
vocabulary = sorted({token for doc_tokens in all_tokens for token in doc_tokens})
print(f"\nVocabulary (|V| = {len(vocabulary)}):")
print(vocabulary)

# Create BoW matrix: one row per document, one column per vocabulary word.
# A Counter yields 0 for words that do not occur in the document.
bow_matrix = []
for doc_tokens in all_tokens:
    counts = Counter(doc_tokens)
    bow_matrix.append([counts[word] for word in vocabulary])

# Display as DataFrame with 1-based document labels as the row index.
doc_labels = [f"Doc {doc_number}" for doc_number in range(1, len(documents) + 1)]
df = pd.DataFrame(bow_matrix, columns=vocabulary, index=doc_labels)

# Sanity check: every token is counted exactly once, so each row must sum to
# the number of tokens in its document.
assert df.sum(axis=1).tolist() == [len(doc_tokens) for doc_tokens in all_tokens]

print("\nBag-of-Words Matrix:")
print(df)

Tokenized documents:
Doc 1: ['the', 'cat', 'sat', 'on', 'the', 'mat']
Doc 2: ['the', 'dog', 'sat', 'on', 'the', 'log']
Doc 3: ['cats', 'and', 'dogs', 'are', 'enemies']

Vocabulary (|V| = 12):
['and', 'are', 'cat', 'cats', 'dog', 'dogs', 'enemies', 'log', 'mat', 'on', 'sat', 'the']

Bag-of-Words Matrix:
       and  are  cat  cats  dog  dogs  enemies  log  mat  on  sat  the
Doc 1    0    0    1     0    0     0        0    0    1   1    1    2
Doc 2    0    0    0     0    1     0        0    1    0   1    1    2
Doc 3    1    1    0     1    0     1        1    0    0   0    0    0
