# **EXP 1 – Brown & Penn Treebank Corpus**

In [1]:
import nltk
from nltk.corpus import brown, treebank
# Make sure both corpora are available locally before reading from them.
for corpus_name in ("brown", "treebank"):
    nltk.download(corpus_name)

# Genre list of the Brown corpus, followed by the first 20 tokens of the
# Brown "news" category and of the Penn Treebank sample.
print("Brown Categories:", brown.categories())
brown_news_head = brown.words(categories="news")[:20]
print("Brown Sample:", brown_news_head)
treebank_head = treebank.words()[:20]
print("Penn Treebank Sample:", treebank_head)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Brown Categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
Brown Sample: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']
Penn Treebank Sample: ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken']


# **EXP 2 – Sentence & Word Segmentation**

In [4]:
import spacy
# Load the small English pipeline and run it over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")
sample = "I love coding. NLP is amazing."
doc = nlp(sample)

# doc.sents yields one span per detected sentence; print each on its own line.
print("Sentences:")
for sentence in doc.sents:
    print(sentence)

Sentences:
I love coding.
NLP is amazing.


In [5]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer

# word_tokenize relies on the 'punkt' resource. Fetch it here (quietly, so the
# cell output stays limited to the two result lines) instead of depending on
# some other cell having downloaded it first.
nltk.download('punkt', quiet=True)

text = "Hi! Let's test segmentation."
print("NLTK Word Tokenize:", word_tokenize(text))

# gaps=True: the pattern describes the separators (runs of whitespace),
# not the tokens themselves. Note this demo tokenizes its own literal, not `text`.
print("Regex Tokenize:", RegexpTokenizer(r'\s+', gaps=True).tokenize("I Love Python"))

NLTK Word Tokenize: ['Hi', '!', 'Let', "'s", 'test', 'segmentation', '.']
Regex Tokenize: ['I', 'Love', 'Python']


# **EXP 3 – Tokenization Techniques**

In [3]:
import nltk
from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer
nltk.download("punkt")

text = "Hello World! Let's test tokenizers."

# Each entry pairs the printed label with the callable that tokenizes `text`,
# so the four strategies can be compared on the same input.
tokenizer_table = (
    ("Treebank:", TreebankWordTokenizer().tokenize),
    ("wordpunct:", wordpunct_tokenize),
    ("Sentences:", sent_tokenize),
    ("Whitespace:", WhitespaceTokenizer().tokenize),
)
for label, tokenize in tokenizer_table:
    print(label, tokenize(text))

Treebank: ['Hello', 'World', '!', 'Let', "'s", 'test', 'tokenizers', '.']
wordpunct: ['Hello', 'World', '!', 'Let', "'", 's', 'test', 'tokenizers', '.']
Sentences: ['Hello World!', "Let's test tokenizers."]
Whitespace: ['Hello', 'World!', "Let's", 'test', 'tokenizers.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **EXP 4 – Lemmatization & Stemming**

In [15]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
nltk.download("wordnet")

words = ["running", "flies", "wolves"]

# Three stemmers and one lemmatizer, applied side by side to every word.
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer("english")
lm = WordNetLemmatizer()

# NOTE(review): lemmatize() is called without a part-of-speech argument; the
# recorded output leaves "running" unchanged, presumably because the word is
# not treated as a verb by default - confirm if verb lemmas are wanted.
normalizers = (ps.stem, ls.stem, ss.stem, lm.lemmatize)

print("Word -> Porter | Lancaster | Snowball | Lemmatizer")
for word in words:
    print(word, "->", *(normalize(word) for normalize in normalizers))

Word -> Porter | Lancaster | Snowball | Lemmatizer
running -> run run run running
flies -> fli fli fli fly
wolves -> wolv wolv wolv wolf


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **EXP 5 – Text Normalization & N-Grams**

In [5]:
import nltk, re, contractions
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
nltk.download("punkt")

# Normalization pipeline: expand contractions, drop every character that is
# neither an ASCII letter nor whitespace, lowercase, then tokenize.
text = contractions.fix("I'm learning NLP!!! It's fun, isn't it?")
non_letter = re.compile(r'[^a-zA-Z\s]')
clean = non_letter.sub('', text).lower()
tokens = word_tokenize(clean)

print("Tokens:", tokens)

# Same token list, windowed at sizes 1, 2 and 3.
for label, size in (("Unigrams:", 1), ("Bigrams:", 2), ("Trigrams:", 3)):
    print(label, list(ngrams(tokens, size)))

Tokens: ['i', 'am', 'learning', 'nlp', 'it', 'is', 'fun', 'is', 'not', 'it']
Unigrams: [('i',), ('am',), ('learning',), ('nlp',), ('it',), ('is',), ('fun',), ('is',), ('not',), ('it',)]
Bigrams: [('i', 'am'), ('am', 'learning'), ('learning', 'nlp'), ('nlp', 'it'), ('it', 'is'), ('is', 'fun'), ('fun', 'is'), ('is', 'not'), ('not', 'it')]
Trigrams: [('i', 'am', 'learning'), ('am', 'learning', 'nlp'), ('learning', 'nlp', 'it'), ('nlp', 'it', 'is'), ('it', 'is', 'fun'), ('is', 'fun', 'is'), ('fun', 'is', 'not'), ('is', 'not', 'it')]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **EXP 6 – POS Tagging**

In [6]:
import nltk
from nltk.tokenize import word_tokenize
# Resources used by the tokenizer and the tagger below.
for resource in ("punkt", "averaged_perceptron_tagger"):
    nltk.download(resource)

text = "The quick brown fox jumps over the lazy dog."

# Tokenize first, then tag; the result is a list of (token, tag) pairs.
tagged_words = nltk.pos_tag(word_tokenize(text))
print(tagged_words)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# **EXP 7 – Named Entity Recognition**

In [7]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama was born in Hawaii.")

# One line per recognized entity: surface text followed by its label.
for entity in doc.ents:
    print(f"{entity.text} -> {entity.label_}")

Barack Obama -> PERSON
Hawaii -> GPE


# **EXP 8 – Dependency Parsing & Chunking**

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.chunk import RegexpParser
nltk.download("punkt")

# Chunk rule: an NP is an optional <DT>, any number of <JJ>, then one <NN>.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = RegexpParser(grammar)

# The chunker works on (token, tag) pairs, so tag the sentence first.
text = "The quick brown fox jumps over the lazy dog."
tokens = pos_tag(word_tokenize(text))

chunk_tree = cp.parse(tokens)
print(chunk_tree)

(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN)
  ./.)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Th3\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("The young developer solved the issue quickly.")

# For every token: its text, its dependency relation, and the text of its head.
for token in doc:
    print(f"{token.text} -> {token.dep_} -> {token.head.text}")

The -> det -> developer
young -> amod -> developer
developer -> nsubj -> solved
solved -> ROOT -> solved
the -> det -> issue
issue -> dobj -> solved
quickly -> advmod -> solved
. -> punct -> solved


# **EXP 9 – Word Embeddings (Word2Vec & BERT)**

In [10]:
from gensim.models import Word2Vec
# Toy corpus: a single four-token sentence. min_count=1 keeps every token in
# the vocabulary even though each one occurs only once.
sentences = [["this", "is", "word2vec", "test"]]
model = Word2Vec(sentences=sentences, vector_size=20, min_count=1)

# First 10 of the 20 components learned for "word2vec".
word_vector = model.wv["word2vec"]
print(word_vector[:10])

[ 0.03655883  0.02535131  0.03378846  0.00381433  0.03175445 -0.01702683
 -0.00473201  0.02884287 -0.03760819 -0.01968052]


In [2]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

text = "I love NLP."
inputs = tokenizer(text, return_tensors="pt")

# Forward pass only - no gradients are needed to read out embeddings.
with torch.no_grad():
    outputs = model(**inputs)

# First sequence in the batch, first position: the vector this cell reports
# as the CLS embedding.
cls_embedding = outputs.last_hidden_state[0, 0]

print("BERT CLS Embedding (first 10 values):")
print(cls_embedding[:10])


BERT CLS Embedding (first 10 values):
tensor([-0.0039,  0.3164,  0.0708, -0.3312, -0.6144, -0.4645,  0.1999,  0.8702,
         0.0341, -0.3119])


# **EXP 10 – Sentiment Analysis & Fake News Detection**

In [12]:
from textblob import TextBlob

text = "I really love this NLP practical!"
blob = TextBlob(text)

# Read the sentiment once and report both of its scores.
sentiment = blob.sentiment

print("Text:", text)
print("Sentiment Polarity:", sentiment.polarity)
print("Sentiment Subjectivity:", sentiment.subjectivity)

Text: I really love this NLP practical!
Sentiment Polarity: 0.625
Sentiment Subjectivity: 0.6


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Two-document toy corpus: label 1 for the first text, 0 for the second.
texts = ["Fake news spreading!", "Government released report"]
labels = [1, 0]

# Turn the texts into TF-IDF features.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Fit the classifier, then predict on the same rows it was trained on
# (a smoke test of the pipeline, not an evaluation).
clf = LogisticRegression()
clf.fit(X, labels)
print(clf.predict(X))

[1 0]


# **EXP 11 – Fine-Tuning HuggingFace Model**

In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Fixed seed so the 80/20 train/test split is the same on every run.
SPLIT_SEED = 42

# Take the first 1% of the IMDB training split and carve a 20% test set out of it.
# NOTE(review): a contiguous head-of-file slice may be dominated by a single
# label if the split is stored sorted by class - verify the label balance
# (or shuffle before slicing) before drawing conclusions from training.
ds = load_dataset("imdb", split="train[:1%]").train_test_split(0.2, seed=SPLIT_SEED)
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def encode(e):
    """Tokenize the "text" field of `e` with the module-level tokenizer `tok`.

    Inputs are truncated and padded with padding="max_length"; the
    tokenizer's output is returned unchanged.
    """
    encoded = tok(e["text"], truncation=True, padding="max_length")
    return encoded

# Add the tokenizer outputs produced by encode() to every example in both splits.
ds = ds.map(encode)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Hand the tokenized splits to the Trainer. Without a train_dataset the
# trainer.train() call advertised below has nothing to train on and fails.
trainer = Trainer(
    model=model,
    args=TrainingArguments("out", per_device_train_batch_size=4, num_train_epochs=1),
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
)

print("Model ready. Run trainer.train() to fine-tune.")

Map: 100%|██████████| 200/200 [00:00<00:00, 1057.11 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 871.54 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model ready. Run trainer.train() to fine-tune.
