## Problem 1: Tokenization and Stemming/Lemmatization

In [1]:

import nltk
from nltk.tokenize import word_tokenize, WhitespaceTokenizer, TweetTokenizer, TreebankWordTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

text = "The quick brown fox, couldn't jump over the lazy dog! #animals"

# Tokenizers
print("Whitespace:", WhitespaceTokenizer().tokenize(text))
print("Punctuation:", word_tokenize(text))
print("Treebank:", TreebankWordTokenizer().tokenize(text))
print("Tweet:", TweetTokenizer().tokenize(text))

mwe = MWETokenizer([("lazy", "dog")])
print("MWE:", mwe.tokenize(text.split()))

# Stemming
porter = PorterStemmer()
snowball = SnowballStemmer("english")
print("Porter Stemmer:", [porter.stem(w) for w in word_tokenize(text)])
print("Snowball Stemmer:", [snowball.stem(w) for w in word_tokenize(text)])

# Lemmatization
lemmatizer = WordNetLemmatizer()
print("Lemmatized:", [lemmatizer.lemmatize(w) for w in word_tokenize(text)])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Whitespace: ['The', 'quick', 'brown', 'fox,', "couldn't", 'jump', 'over', 'the', 'lazy', 'dog!', '#animals']
Punctuation: ['The', 'quick', 'brown', 'fox', ',', 'could', "n't", 'jump', 'over', 'the', 'lazy', 'dog', '!', '#', 'animals']
Treebank: ['The', 'quick', 'brown', 'fox', ',', 'could', "n't", 'jump', 'over', 'the', 'lazy', 'dog', '!', '#', 'animals']
Tweet: ['The', 'quick', 'brown', 'fox', ',', "couldn't", 'jump', 'over', 'the', 'lazy', 'dog', '!', '#animals']
MWE: ['The', 'quick', 'brown', 'fox,', "couldn't", 'jump', 'over', 'the', 'lazy', 'dog!', '#animals']
Porter Stemmer: ['the', 'quick', 'brown', 'fox', ',', 'could', "n't", 'jump', 'over', 'the', 'lazi', 'dog', '!', '#', 'anim']
Snowball Stemmer: ['the', 'quick', 'brown', 'fox', ',', 'could', "n't", 'jump', 'over', 'the', 'lazi', 'dog', '!', '#', 'anim']
Lemmatized: ['The', 'quick', 'brown', 'fox', ',', 'could', "n't", 'jump', 'over', 'the', 'lazy', 'dog', '!', '#', 'animal']


## Problem 2: BoW, TF-IDF, Word2Vec

In [4]:

! pip install gensim
! pip install scikit-learn
! pip install nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

corpus = [
    "the quick brown fox",
    "the slow brown dog",
    "the quick red dog",
    "the lazy yellow fox"
]

# Count Vectorizer
cv = CountVectorizer()
print("BoW Count:", cv.fit_transform(corpus).toarray())

# Normalized BoW
tf = TfidfVectorizer(use_idf=False, norm='l2')
print("Normalized Count:", tf.fit_transform(corpus).toarray())

# TF-IDF
tfidf = TfidfVectorizer()
print("TF-IDF:", tfidf.fit_transform(corpus).toarray())

# Word2Vec
tokens = [sentence.split() for sentence in corpus]
model = Word2Vec(sentences=tokens, vector_size=50, window=2, min_count=1)
print("Word2Vec vector for 'dog':", model.wv['dog'])


Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Using cached gensim-4.3.3.tar.gz (23.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4.tar.gz (15.8 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'error'


  error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [21 lines of output]
      + C:\Program Files\Python313\python.exe C:\Users\DELL\AppData\Local\Temp\pip-install-_1x6jz2c\numpy_a7c2d612f4424ea89b3ac5f8eaff5afa\vendored-meson\meson\meson.py setup C:\Users\DELL\AppData\Local\Temp\pip-install-_1x6jz2c\numpy_a7c2d612f4424ea89b3ac5f8eaff5afa C:\Users\DELL\AppData\Local\Temp\pip-install-_1x6jz2c\numpy_a7c2d612f4424ea89b3ac5f8eaff5afa\.mesonpy-vixc2fgm -Dbuildtype=release -Db_ndebug=if-release -Db_vscrt=md --native-file=C:\Users\DELL\AppData\Local\Temp\pip-install-_1x6jz2c\numpy_a7c2d612f4424ea89b3ac5f8eaff5afa\.mesonpy-vixc2fgm\meson-python-native-file.ini
      The Meson build system
      Version: 1.2.99
      Source dir: C:\Users\DELL\AppData\Local\Temp\pip-install-_1x6jz2c\numpy_a7c2d612f4424ea89b3ac5f8eaff5afa
      Build dir: C:\Users\DELL\AppData\Local\Temp\pip-install-_1x6jz2c\numpy_a7c2d612f4424ea89b3ac5

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'gensim'

## Problem 3: Cleaning, Lemmatization, Stopword Removal, Label Encoding, TF-IDF

In [6]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

data = ["Cats are running!", "Dogs barked loudly.", "Birds are flying."]
labels = ['animal', 'animal', 'animal']

cleaned = []
lemmatizer = WordNetLemmatizer()

for sent in data:
    sent = re.sub(r'[^a-zA-Z ]', '', sent)
    tokens = word_tokenize(sent.lower())
    filtered = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    cleaned.append(" ".join(filtered))

# Label encoding
le = LabelEncoder()
y = le.fit_transform(labels)

# TF-IDF
vec = TfidfVectorizer()
X = vec.fit_transform(cleaned)

print("Encoded Labels:", y)
print("TF-IDF Matrix:", X.toarray())


Encoded Labels: [0 0 0]
TF-IDF Matrix: [[0.         0.         0.70710678 0.         0.         0.
  0.70710678]
 [0.57735027 0.         0.         0.57735027 0.         0.57735027
  0.        ]
 [0.         0.70710678 0.         0.         0.70710678 0.
  0.        ]]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Problem 4: Transformer from Scratch (Simplified in PyTorch)

In [7]:

import torch
import torch.nn as nn
import torch.nn.functional as F

class MiniTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=2):
        super(MiniTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, batch_first=True)
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        attn_output, _ = self.attention(x, x, x)
        return self.linear(attn_output)

# Dummy input
model = MiniTransformer(vocab_size=100)
dummy_input = torch.randint(0, 100, (2, 5))
print(model(dummy_input).shape)


torch.Size([2, 5, 100])


## Problem 5: Shallow Parsing and Regex Parsing

In [8]:

nltk.download('averaged_perceptron_tagger')

sentence = "The quick brown fox jumps over the lazy dog"
tokens = word_tokenize(sentence)
pos_tags = nltk.pos_tag(tokens)

# Shallow parsing (chunking)
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(chunk_grammar)
chunk_tree = chunk_parser.parse(pos_tags)
print(chunk_tree)

# Draw if needed: chunk_tree.draw()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\DELL/nltk_data'
    - 'c:\\Program Files\\Python313\\nltk_data'
    - 'c:\\Program Files\\Python313\\share\\nltk_data'
    - 'c:\\Program Files\\Python313\\lib\\nltk_data'
    - 'C:\\Users\\DELL\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Problem 6B: Named Entity Recognition

In [9]:

text = '''Deepak Jasani, Head of retail research, HDFC Securities, said: “Investors will look
to the European Central Bank later Thursday for reassurance that surging prices are
just transitory, and not about to spiral out of control.'''

nltk.download('maxent_ne_chunker')
nltk.download('words')

tokens = word_tokenize(text)
tags = nltk.pos_tag(tokens)
tree = nltk.ne_chunk(tags)

for subtree in tree:
    if isinstance(subtree, nltk.Tree):
        print("Entity:", " ".join([token for token, pos in subtree.leaves()]), "| Type:", subtree.label())


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\DELL/nltk_data'
    - 'c:\\Program Files\\Python313\\nltk_data'
    - 'c:\\Program Files\\Python313\\share\\nltk_data'
    - 'c:\\Program Files\\Python313\\lib\\nltk_data'
    - 'C:\\Users\\DELL\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Problem 7A: NMF for Topic Modeling

In [10]:

from sklearn.decomposition import NMF

corpus = [
    "Data science is an interdisciplinary field.",
    "Machine learning is a subset of AI.",
    "AI and ML are popular topics.",
    "Deep learning is a type of ML."
]

vec = TfidfVectorizer()
X = vec.fit_transform(corpus)

nmf = NMF(n_components=2, random_state=1)
W = nmf.fit_transform(X)
H = nmf.components_
print("Reconstruction error:", nmf.reconstruction_err_)


Reconstruction error: 1.2403721477865943


## Problem 7B: WordNet for Word Sense Disambiguation

In [11]:

from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
nltk.download('omw-1.4')

sentence = "I went to the bank to deposit money"
tokens = word_tokenize(sentence)
sense = lesk(tokens, 'bank')
print("Best sense:", sense)
print("Definition:", sense.definition())


Best sense: Synset('savings_bank.n.02')
Definition: a container (usually with a slot in the top) for keeping money at home


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Problem 8: Word Embedding with FastText and GloVe (Gensim)

In [12]:

from gensim.models import FastText, KeyedVectors

# Sample data
corpus = [["covid", "pandemic", "virus"], ["mask", "vaccine", "covid"], ["social", "distance"]]

# FastText
ft_model = FastText(sentences=corpus, vector_size=10, window=3, min_count=1)
ft_model.save("fasttext.model")
print("FastText vector for 'covid':", ft_model.wv['covid'])


ModuleNotFoundError: No module named 'gensim'

## Problem 9A: N-Gram Model

In [13]:

from nltk import ngrams

sentence = "The quick brown fox jumps"
tokens = word_tokenize(sentence)
print("Bigrams:", list(ngrams(tokens, 2)))


Bigrams: [('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps')]


## Problem 9B: LDA and LSA Topic Modeling

In [14]:

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

corpus = [
    'The cat sat on the mat.',
    'Dogs are great pets.',
    'I love to play football.',
    'Data science is an interdisciplinary field.',
    'Python is a great programming language.',
    'Machine learning is a subset of artificial intelligence.',
    'Artificial intelligence and machine learning are popular topics.',
    'Deep learning is a type of machine learning.',
    'Natural language processing involves analyzing text data.',
    'I enjoy hiking and outdoor activities.'
]

vec = CountVectorizer()
X = vec.fit_transform(corpus)

lda = LatentDirichletAllocation(n_components=2)
lda.fit(X)

lsa = TruncatedSVD(n_components=2)
lsa.fit(X)
print("LDA Components:
", lda.components_)
print("LSA Components:
", lsa.components_)


SyntaxError: unterminated string literal (detected at line 24) (4114221722.py, line 24)

## Problem 10: Fine-Tune Pretrained Transformer (Text Classification)

In [15]:

# You can run this only if transformers library is installed
# !pip install transformers datasets

# from transformers import pipeline
# classifier = pipeline("sentiment-analysis")
# print(classifier("I love machine learning!"))
print("Transformer fine-tuning requires GPU or Google Colab. This is a placeholder.")


Transformer fine-tuning requires GPU or Google Colab. This is a placeholder.
