Text Preprocessing
Tokenization, Stemming, Lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources used below: the Punkt tokenizer data ('punkt',
# plus 'punkt_tab', which was added to resolve a LookupError) and WordNet
# for the lemmatizer.
for resource in ('punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

text = "NLP is amazing. It helps machines understand human language."

# Sentence tokenization: split the raw string into a list of sentences.
sentences = sent_tokenize(text)
print(sentences)

# Word tokenization: split the same string into word and punctuation tokens.
words = word_tokenize(text)
print(words)

# Stemming: apply the Porter stemmer to every token. The printed stems are
# lower-cased and need not be dictionary words (e.g. 'amazing' -> 'amaz').
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, words))
print(stems)

# Lemmatization: apply the WordNet lemmatizer to every token without a POS
# tag. In the printed output 'helps' -> 'help' and 'machines' -> 'machine',
# while 'amazing' is left unchanged.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, words))
print(lemmas)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['NLP is amazing.', 'It helps machines understand human language.']
['NLP', 'is', 'amazing', '.', 'It', 'helps', 'machines', 'understand', 'human', 'language', '.']
['nlp', 'is', 'amaz', '.', 'it', 'help', 'machin', 'understand', 'human', 'languag', '.']
['NLP', 'is', 'amazing', '.', 'It', 'help', 'machine', 'understand', 'human', 'language', '.']


Tokenization Types
Word, Subword, Sentence

In [None]:
# Sentence and word tokenization are already shown in the NLTK cell above.

# Subword tokenization (using the tokenizer of a pre-trained BERT checkpoint)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Every word of this sentence is in the tokenizer's vocabulary, so each one
# comes back as a single lower-cased token and no subword split is visible.
tokens = tokenizer.tokenize("Transformers are powerful")
print(tokens)

# To actually see subwords, tokenize text containing less common words: a word
# that is not in the vocabulary is broken into several pieces, and the
# continuation pieces are marked with a leading '##'.
subword_tokens = tokenizer.tokenize("Tokenization splits unfamiliar words into subwords")
print(subword_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['transformers', 'are', 'powerful']


Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three short documents; the TF-IDF cell below reuses it.
documents = ["I love NLP", "NLP is fun", "I love machine learning"]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(documents)

# Vocabulary (the columns of the matrix). As the printed output shows, tokens
# are lower-cased and the one-letter word "I" does not appear as a feature.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

# Dense view of the sparse count matrix: one row per document.
bow_dense = bow.toarray()
print(bow_dense)

['fun' 'is' 'learning' 'love' 'machine' 'nlp']
[[0 0 0 1 0 1]
 [1 1 0 0 0 1]
 [0 0 1 1 1 0]]


TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: reads `documents` defined in the Bag of Words cell, so run that cell first.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

# Same vocabulary as the count-based model, but each cell now holds a TF-IDF
# weight instead of a raw count.
tfidf_features = tfidf.get_feature_names_out()
print(tfidf_features)

# Dense view of the sparse TF-IDF matrix: one row per document.
tfidf_dense = tfidf_matrix.toarray()
print(tfidf_dense)

['fun' 'is' 'learning' 'love' 'machine' 'nlp']
[[0.         0.         0.         0.70710678 0.         0.70710678]
 [0.62276601 0.62276601 0.         0.         0.         0.4736296 ]
 [0.         0.         0.62276601 0.4736296  0.62276601 0.        ]]


Word Embeddings
Word2Vec

In [None]:
!pip install gensim
from gensim.models import Word2Vec

sentences = [
    ["i", "love", "nlp"],
    ["nlp", "is", "fun"],
    ["machine", "learning", "is", "cool"]
]

model = Word2Vec(sentences, vector_size=50, window=3, min_count=1)
print(model.wv["nlp"])

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0
[-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312

GloVe (using pre-trained vectors)

In [None]:
import gensim.downloader as api

# Load pre-trained GloVe vectors through gensim's downloader API.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-50"
glove = api.load(GLOVE_MODEL_NAME)

# Look up the pre-trained embedding for "nlp" (50 values, see output).
nlp_vector = glove["nlp"]
print(nlp_vector)

[-0.6721    -0.17858    0.20188    0.63581   -0.31304    1.2183
 -0.13314   -1.1776    -0.27009    0.52236   -0.0086308 -0.056211
  1.3483    -1.0131    -1.0985    -0.24086   -0.0066808 -0.14822
 -0.044672   0.54472   -0.92966   -0.69065    0.91675    0.054691
 -0.2081     1.1201     0.92071   -1.2295     0.107      0.65846
 -0.84775   -0.14577   -0.69941    0.83514    0.90995   -0.70647
 -0.78513    0.82611    1.0785     0.29806    1.0306     0.19589
 -0.5562     0.43684    0.5979     0.77427    0.40238    0.57069
  0.29321    1.0723   ]


RNN, LSTM, GRU (Conceptual Code – Keras)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, GRU, Dense

# Minimal binary sequence classifier:
#   integer token ids -> 64-d embeddings -> recurrent layer -> sigmoid output.
# NOTE: this rebinds `model`, which the Word2Vec cell used for its own model.
model = Sequential([
    # Declare the input up front (variable-length sequences of integer token
    # ids). Without an input spec the model is not built until it first sees
    # data, so model.summary() below could not report output shapes or
    # parameter counts.
    Input(shape=(None,), dtype="int32"),
    Embedding(input_dim=5000, output_dim=64),   # vocabulary of 5000 ids
    LSTM(64),   # change to SimpleRNN or GRU to compare
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Transformers (Conceptual Overview – BERT)

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default.
# This is the same model and revision the default resolved to when this
# notebook was run; pinning it keeps the result stable if the library's
# default changes and removes the "No model was supplied" warning.
nlp = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# The pipeline returns a list with one {'label', 'score'} dict per input text.
result = nlp("I absolutely love learning NLP!")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9997356534004211}]


NLP Tasks
🔹 Sentiment Analysis

In [None]:
from transformers import pipeline

# Pin the checkpoint the task default resolved to when this notebook was run,
# so the output stays reproducible and the "No model was supplied" warning
# goes away.
sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Prints a list with one {'label', 'score'} dict for the input sentence.
print(sentiment("This course is amazing"))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998831748962402}]


Text Classification

In [None]:
# "text-classification" has no topic-specific default: when this notebook was
# run it resolved to the SST-2 sentiment checkpoint below, so the labels are
# POSITIVE / NEGATIVE. Naming the model makes that explicit and reproducible;
# swap in another sequence-classification checkpoint to get different labels.
classifier = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)
print(classifier("Python is the best programming language"))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9997871518135071}]


Named Entity Recognition (NER)

In [None]:
# Named-entity recognition with the checkpoint pinned explicitly (the same
# model and revision the task default resolved to when this notebook was run).
# aggregation_strategy="simple" replaces the deprecated grouped_entities=True:
# word pieces belonging to the same entity are merged into one result with an
# 'entity_group' label and character offsets.
ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    revision="4c53496",
    aggregation_strategy="simple",
)
print(ner("Deepali works at Codegnan in India"))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'entity_group': 'PER', 'score': np.float32(0.9989197), 'word': 'Deepali', 'start': 0, 'end': 7}, {'entity_group': 'ORG', 'score': np.float32(0.99084586), 'word': 'Codegnan', 'start': 17, 'end': 25}, {'entity_group': 'LOC', 'score': np.float32(0.9995914), 'word': 'India', 'start': 29, 'end': 34}]


Text Summarization

In [None]:
# Summarization with the checkpoint pinned explicitly (the same model and
# revision the task default resolved to when this notebook was run).
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)

# NOTE: this rebinds `text`, which the preprocessing cell used for its sample.
text = """
Natural Language Processing enables machines to understand human language.
It is widely used in chatbots, translation, and sentiment analysis.
"""

# The input is only about 30 tokens long, so the length limits must stay below
# that: with max_length=40 the pipeline warned about the mismatch and simply
# echoed the input back instead of shortening it.
summary = summarizer(text, max_length=25, min_length=10)
print(summary)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 40, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


[{'summary_text': ' Natural Language Processing enables machines to understand human language . It is widely used in chatbots, translation, and sentiment analysis .'}]
