**CODE 1 — Stop-word removal and part-of-speech tagging (NLTK)**

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

# Input text
text = "This is an example sentence demonstrating part of speech tagging."

# Tokenize the complete sentence and tag it BEFORE dropping anything.
# The tagger uses neighbouring words as context, so handing it a sentence
# with the stop words already stripped out degrades the tags it assigns.
tokens = word_tokenize(text)
tagged_tokens = nltk.pos_tag(tokens)

# Remove stop words only after tagging (case-insensitive match)
stop_words = set(stopwords.words('english'))
pos_tags = [(word, tag) for word, tag in tagged_tokens if word.lower() not in stop_words]

# Plain list of the surviving words, kept under its original name
filtered_words = [word for word, _ in pos_tags]

# Output
print(pos_tags)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('example', 'NN'), ('sentence', 'NN'), ('demonstrating', 'VBG'), ('part', 'NN'), ('speech', 'NN'), ('tagging', 'NN'), ('.', '.')]


**CODE 2 — TF-IDF vectorization (scikit-learn)**

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents that share most of their vocabulary
documents = [
    "This is a sample document.",
    "This document is another sample document.",
    "And this is yet another example document."
]

# Fit a default-configured vectorizer and keep the transformed matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Print one line per vocabulary term with its fitted IDF weight.
# Note: these are the vectorizer's `idf_` values, not the per-document
# scores held in `tfidf_matrix`.
idf_weights = vectorizer.idf_
for term, column in vectorizer.vocabulary_.items():
    print(f"{term}: {idf_weights[column]:.3f}")


this: 1.000
is: 1.000
sample: 1.288
document: 1.000
another: 1.288
and: 1.693
yet: 1.693
example: 1.693


**CODE 3 — N-gram language model (next-word prediction)**

In [5]:
import nltk
from collections import defaultdict, Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import ConditionalFreqDist

# Download required resources
# The first cell of this notebook fetches 'punkt_tab' for word_tokenize;
# request the same resource here so this cell also works on a fresh kernel
# instead of relying on that earlier download.
nltk.download('punkt_tab')

class NgramModel:
    """Count-based n-gram model that predicts the most frequent next word.

    `model` maps each (n-1)-token prefix tuple to a Counter of the words
    that followed that prefix in the training text.
    """

    def __init__(self, n):
        """Create an empty model of order `n` (1 = unigram, 2 = bigram, ...).

        Raises:
            ValueError: if `n` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")
        self.n = n
        self.model = defaultdict(Counter)

    def train(self, text):
        """Add the n-gram counts of `text` to the model.

        The text is lower-cased and tokenized with `word_tokenize`; calling
        `train` repeatedly accumulates counts.
        """
        tokens = word_tokenize(text.lower())
        for gram in ngrams(tokens, self.n):
            prefix, next_word = tuple(gram[:-1]), gram[-1]
            self.model[prefix][next_word] += 1

    def predict(self, context):
        """Return the most frequent follower of the last n-1 context tokens.

        Args:
            context: sequence of (already lower-cased) tokens; only the last
                n-1 entries are used.

        Returns:
            The most common next word, or None when the prefix was never
            seen during training.
        """
        prefix_len = self.n - 1
        # For a unigram model the prefix is empty. Slicing with -0 would
        # return the WHOLE context instead of nothing, so handle it here.
        prefix = tuple(context[-prefix_len:]) if prefix_len else ()
        # .get() avoids creating an entry in the defaultdict on a miss.
        followers = self.model.get(prefix)
        if not followers:
            return None
        return followers.most_common(1)[0][0]

# Demo: fit a bigram model on a two-sentence sample
text = "This is a simple example. This example is for N-gram language modeling."
model = NgramModel(n=2)
model.train(text)

# Ask the model which word it expects after the given context
context = ["this", "is"]
predicted_word = model.predict(context)
print("Next word:", predicted_word)


Next word: a


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**CODE 4A — Word2Vec embeddings (gensim)**


In [6]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download required resources
# The first cell of this notebook fetches 'punkt_tab' for word_tokenize;
# request the same resource here so this cell does not depend on that
# earlier download having happened.
nltk.download('punkt_tab')

# Sample corpus
corpus = [
    "This is a simple example.",
    "Word embeddings are useful for NLP tasks.",
    "Word2Vec captures semantic relationships between words."
]

# Preprocess: lower-case each sentence and split it into tokens
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

# Train Word2Vec model (min_count=1 keeps every token of this tiny corpus)
# NOTE(review): with workers > 1 the vectors may differ from run to run —
# confirm whether reproducible output matters for this notebook.
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Get vector for a word
word_vector = model.wv['word']  # Replace 'word' with any word in the vocabulary
print(f"Vector for 'word': {word_vector}")

# Find most similar words
similar_words = model.wv.most_similar('word', topn=5)  # Replace 'word' as needed
print("Most similar words:", similar_words)


Vector for 'word': [ 8.1681199e-03 -4.4430327e-03  8.9854337e-03  8.2536647e-03
 -4.4352221e-03  3.0310510e-04  4.2744912e-03 -3.9263200e-03
 -5.5599655e-03 -6.5123225e-03 -6.7073823e-04 -2.9592158e-04
  4.4630850e-03 -2.4740540e-03 -1.7260908e-04  2.4618758e-03
  4.8675989e-03 -3.0808449e-05 -6.3394094e-03 -9.2608072e-03
  2.6657581e-05  6.6618943e-03  1.4660227e-03 -8.9665223e-03
 -7.9386048e-03  6.5519023e-03 -3.7856805e-03  6.2549924e-03
 -6.6810320e-03  8.4796622e-03 -6.5163244e-03  3.2880199e-03
 -1.0569858e-03 -6.7875278e-03 -3.2875966e-03 -1.1614120e-03
 -5.4709399e-03 -1.2113475e-03 -7.5633135e-03  2.6466595e-03
  9.0701487e-03 -2.3772502e-03 -9.7651005e-04  3.5135616e-03
  8.6650876e-03 -5.9218528e-03 -6.8875779e-03 -2.9329848e-03
  9.1476962e-03  8.6626766e-04 -8.6784009e-03 -1.4469790e-03
  9.4794659e-03 -7.5494875e-03 -5.3580985e-03  9.3165627e-03
 -8.9737261e-03  3.8259076e-03  6.6544057e-04  6.6607012e-03
  8.3127534e-03 -2.8507852e-03 -3.9923131e-03  8.8979173e-03
  2.0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**CODE 4B — Pre-trained GloVe embeddings (gensim KeyedVectors)**

In [9]:
from google.colab import files
# Upload a file into the Colab session; the result is kept in `uploaded`,
# which no later cell shown here reads.
# NOTE(review): the recorded run uploaded W20.jpeg, while the next cell loads
# 'glove.6B.100d.txt' — presumably this step is meant to upload the GloVe
# file; confirm.
uploaded = files.upload()

Saving W20.jpeg to W20.jpeg


In [None]:
from gensim.models import KeyedVectors

# Location of the pre-trained GloVe text file (it must be downloaded first)
glove_path = 'glove.6B.100d.txt'  # Update path to GloVe file

# Read the vectors as plain text; no_header=True is passed because the file
# is loaded without a leading word2vec-style header line.
glove_model = KeyedVectors.load_word2vec_format(
    glove_path,
    binary=False,
    no_header=True,
)

# Look up the embedding of a single token and show it
glove_vector = glove_model['word']  # Replace 'word' with your word
print(f"GloVe vector for 'word': {glove_vector}")


**CODE 4C — FastText embeddings (gensim)**

In [7]:
from gensim.models import FastText

# Fit FastText on `tokenized_corpus`, the token lists built in the Word2Vec cell
fasttext_model = FastText(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Embedding of a single token
fasttext_vector = fasttext_model.wv['word']  # Replace 'word' with your word
print(f"FastText vector for 'word': {fasttext_vector}")

# Five nearest neighbours of the same token
fasttext_similar = fasttext_model.wv.most_similar('word', topn=5)
print("Most similar words (FastText):", fasttext_similar)


FastText vector for 'word': [ 2.1158298e-03  9.8664640e-04  1.2802493e-03  2.1573605e-03
  1.5429289e-04 -3.0144432e-03  2.3548654e-03  9.9286779e-05
 -1.9067357e-03 -1.4003273e-03 -7.1609742e-04 -5.6715542e-04
 -6.3540565e-04 -1.6781141e-05 -4.6855113e-03  2.0611640e-03
  3.9697862e-03 -2.0669720e-03  1.2429424e-03  1.0267305e-03
 -1.2779417e-03 -4.6194455e-04 -3.1799488e-03 -3.7299070e-04
 -1.6909014e-03  5.3718337e-04  1.3538231e-03 -9.9462760e-04
  1.7661012e-03  4.2590499e-04 -3.5518100e-03 -1.7988580e-04
 -3.8802540e-05  4.6802667e-04 -6.3560851e-04  3.3645341e-04
  1.4993497e-03  1.5862887e-03 -1.9505657e-03  1.9697289e-03
 -1.5898142e-05 -4.0358453e-04  1.6228232e-04  1.0136354e-04
 -2.7229770e-03  2.1987378e-03 -1.6541592e-03  3.2168140e-03
  1.0731084e-03  5.4620713e-04 -3.4078488e-03 -3.7224113e-03
 -4.9067027e-04  1.1654234e-03  1.7591849e-03  1.4550920e-03
 -1.3619323e-06 -3.5131461e-04  1.3028962e-03 -3.2646924e-03
  6.8984582e-04  2.8526734e-04  1.9261348e-03 -2.2931029e