In [None]:
# Whitespace tokenisation with plain Python: calling split with no separator
# breaks on any run of whitespace, so the embedded "\n" acts like a space.
text = "I am Raghava\ntext"
tokens = str.split(text)
print(tokens)


['I', 'am', 'Raghava', 'text']


In [None]:
# Character-level tokenisation: unpacking a string yields one token per
# character, and spaces are kept as tokens of their own.
text = "Raghava is lazy"
tokens = [*text]
print(tokens)


['R', 'a', 'g', 'h', 'a', 'v', 'a', ' ', 'i', 's', ' ', 'l', 'a', 'z', 'y']


In [None]:
# Lower-case the sentence, split it on whitespace, then drop stop words.
text = "Tokenization with stop word removal is very useful in NLP tasks"

# Small hand-picked stop list; entries are lower-case to match the
# lower-cased tokens produced below.
stop_words = {"is", "with", "in", "the", "and", "a", "an"}

tokens = text.lower().split()

# Keep only the tokens that are absent from the stop list (order preserved).
filtered_tokens = list(filter(lambda token: token not in stop_words, tokens))

print(filtered_tokens)


['tokenization', 'stop', 'word', 'removal', 'very', 'useful', 'nlp', 'tasks']


In [None]:
text = "hey google!.This is Raghava.I am a arrogant boy."

# Sentence tokenisation: cut at every full stop, trim surrounding whitespace
# from each piece, and discard pieces that end up empty (such as the one
# after the final ".").
sentences = list(filter(None, map(str.strip, text.split("."))))

# Numbered list of the sentences, counting from 1.
print("Sentences:")
for number, sentence in enumerate(sentences, start=1):
    print(f"{number}. {sentence}")

# Whitespace word tokens for each sentence; punctuation other than "."
# stays attached to its word.
print("\nWord Tokens in Each Sentence:")
for number, sentence in enumerate(sentences, start=1):
    words = sentence.split()
    print(f"Sentence {number} words:", words)

Sentences:
1. hey google!
2. This is Raghava
3. I am a arrogant boy

Word Tokens in Each Sentence:
Sentence 1 words: ['hey', 'google!']
Sentence 2 words: ['This', 'is', 'Raghava']
Sentence 3 words: ['I', 'am', 'a', 'arrogant', 'boy']


In [None]:
text = "I am Raghava.\nThis is word tokenisation!"

# Whitespace-only tokenisation: the newline separates tokens just like a
# space, and punctuation stays attached to the neighbouring word.
tokens = text.split(sep=None)

print(tokens)


['I', 'am', 'Raghava.', 'This', 'is', 'word', 'tokenisation!']


In [3]:
import nltk

# Fetch the "punkt_tab" tokenizer data into the local NLTK data directory.
# The call is the cell's last expression, so its return value is displayed.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download every resource this cell needs so it runs on a fresh kernel
# without relying on a download performed by another cell:
#   - "punkt_tab": tokenizer data loaded by word_tokenize in current NLTK releases
#   - "punkt":     the older tokenizer package, kept for older NLTK installs
#   - "stopwords": the stop word corpus read below
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# Given text
text = "Raghava's favourite player is Kohli"

# Word tokenization (NLTK splits the possessive "'s" into its own token)
words = word_tokenize(text)

# Load English stopwords into a set for membership tests
stop_words = set(stopwords.words('english'))

# Remove stopwords. Tokens are lower-cased only for the comparison, so the
# surviving tokens keep their original casing.
filtered_words = [word for word in words if word.lower() not in stop_words]

print("Original Tokens:")
print(words)

print("\nAfter Stop Word Removal:")
print(filtered_words)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Original Tokens:
['Raghava', "'s", 'favourite', 'player', 'is', 'Kohli']

After Stop Word Removal:
['Raghava', "'s", 'favourite', 'player', 'Kohli']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short documents
documents = [
    "Raghava likes cricket",
    "Kohli plays cricket",
    "Raghava likes Kohli",
]

# Bag of Words: learn the vocabulary and count term occurrences in one step
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

# Learned vocabulary (one entry per matrix column) and the counts as a
# dense array (one row per document)
features = vectorizer.get_feature_names_out()
bow_array = bow_matrix.toarray()

print("Vocabulary:", features, sep="\n")
print("\nBag of Words Matrix:", bow_array, sep="\n")


Vocabulary:
['cricket' 'kohli' 'likes' 'plays' 'raghava']

Bag of Words Matrix:
[[1 0 1 0 1]
 [1 1 0 1 0]
 [0 1 1 0 1]]


In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
from gensim.models import Word2Vec

# Sample sentences (tokenized)
sentences = [
    ["raghava", "likes", "cricket"],
    ["kohli", "plays", "cricket"],
    ["raghava", "likes", "kohli"]
]

# Train Word2Vec model.
# Training is stochastic; an explicit seed plus a single worker thread makes
# the printed vector and similarity scores repeatable from run to run.
# NOTE: the gensim docs additionally mention setting PYTHONHASHSEED for
# reproducibility across separate interpreter launches.
model = Word2Vec(
    sentences,
    vector_size=100,   # size of word vectors
    window=5,          # context window
    min_count=1,       # include all words
    sg=0,              # 0 = CBOW, 1 = Skip-gram
    seed=1,            # explicit RNG seed (same value as gensim's default)
    workers=1          # single thread: no ordering jitter from thread scheduling
)

# Get vector for a word
vector = model.wv["raghava"]

print("Word Vector for 'raghava':")
print(vector)

# Similar words (with a corpus this small the scores are illustrative only)
print("\nWords similar to 'raghava':")
print(model.wv.most_similar("raghava"))


Word Vector for 'raghava':
[-8.2426779e-03  9.2993546e-03 -1.9766092e-04 -1.9672764e-03
  4.6036304e-03 -4.0953159e-03  2.7431143e-03  6.9399667e-03
  6.0654259e-03 -7.5107943e-03  9.3823504e-03  4.6718083e-03
  3.9661205e-03 -6.2435055e-03  8.4599797e-03 -2.1501649e-03
  8.8251876e-03 -5.3620026e-03 -8.1294188e-03  6.8245591e-03
  1.6711927e-03 -2.1985089e-03  9.5136007e-03  9.4938548e-03
 -9.7740470e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227872e-03  4.3050171e-04  6.7363144e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888723e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759365e-03 -9.4026709e-03  9.7643770e-03
  3.4297847e-03  5.1661171e-03  6.2823449e-03 -2.8042626e-03
  7.3227035e-03  2.8302716e-03  2.8710044e-03 -2.3803699e-03
 -3.1282497e-03 -2.3701417e-03  4.2764368e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481940e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843508e-03 -4.2906962e-03
  2.7831673e-04  4.9643586e-03  7.6983096e-03 -1.1442233e-

In [None]:
import gensim.downloader as api

# Fetch the pre-trained 100-dimensional GloVe vectors via gensim's downloader
glove_model = api.load("glove-wiki-gigaword-100")

# Look up the embedding of a single word and show it
vector = glove_model["cricket"]
print("Vector for 'cricket':", vector, sep="\n")

# Words the model ranks as most similar to the query word
print("\nSimilar words to 'cricket':", glove_model.most_similar("cricket"), sep="\n")


Vector for 'cricket':
[-0.55541    0.45894    0.51851   -0.045938  -1.4064     0.49701
 -0.085008   0.63442   -1.7949    -0.31881   -0.13673   -1.1583
  0.45505    0.21464   -0.21751   -0.21984    0.60619    0.55812
 -0.01031    0.66228    0.22206    0.25498    0.8452    -0.72988
  0.26195    0.26418    0.22577   -0.051338   0.024459   0.86389
 -0.35585    0.48662   -0.49752   -0.44777   -0.040533  -0.18376
 -1.32       0.54899   -1.2289    -0.22673   -0.93431    0.78923
  0.9565    -1.3996     1.0314     0.39573    0.7956    -0.27184
  0.51776   -1.0387    -0.38121    0.21772    0.52486    0.63307
 -0.21206   -1.6741    -1.3811     0.079469   0.46871    0.29956
 -0.90023   -0.16781   -0.30873    0.16586    0.12141    0.50219
  0.049859   0.54896    0.55576   -0.14683    0.55657   -0.0060587
  0.25941   -0.91918    0.23      -0.32992    0.18277    0.036235
 -0.71589    0.22084    0.3952    -0.46155    1.0515    -0.58014
 -0.19766   -0.39474   -1.2366    -0.37599    0.070743   0.93192
 

In [None]:
!pip install transformers torch




In [None]:
from transformers import BertTokenizer, BertModel
import torch

# The tokenizer and the encoder are loaded from the same pre-trained checkpoint
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Input text
text = "Raghava likes cricket"

# Encode the text as PyTorch tensors ("pt")
inputs = tokenizer(text, return_tensors="pt")

# Forward pass with gradient tracking disabled (inference only)
with torch.no_grad():
    outputs = model(**inputs)

# Last hidden state: one vector per tokenizer token, not per word. The
# recorded shape [1, 7, 768] has 7 positions for this 3-word input.
last_hidden_states = outputs.last_hidden_state

print("BERT Embedding Shape:", last_hidden_states.shape, sep="\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embedding Shape:
torch.Size([1, 7, 768])
