<a href="https://colab.research.google.com/github/PavanGavit/NLP_LAB/blob/main/NLP_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install scikit-learn gensim pandas nltk

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [4]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize depends on NLTK's Punkt tokenizer data. Look it up locally
# and download the packages only when the lookup raises LookupError.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    for resource in ('punkt_tab', 'punkt'):
        nltk.download(resource)

# Toy corpus: three short sentences with overlapping vocabulary.
corpus = [
    "Data science is amazing and fun",
    "Science involves data and experiments",
    "Fun experiments lead to amazing data",
]

# Lower-case every document, then split it into word tokens.
tokenized_corpus = list(map(word_tokenize, map(str.lower, corpus)))

# Show the tokens of the first document as a quick sanity check.
print("Corpus Loaded and Tokenized Successfully!")
print(tokenized_corpus[0])



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Corpus Loaded and Tokenized Successfully!
['data', 'science', 'is', 'amazing', 'and', 'fun']


In [6]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from gensim.models import Word2Vec

# --- 1. SETUP ---
# word_tokenize needs NLTK's Punkt tokenizer data; fetch the packages only
# when the local lookup raises LookupError.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    for package in ('punkt_tab', 'punkt'):
        nltk.download(package)

# Sample corpus
corpus = [
    "Data science is amazing and fun",
    "Science involves data and experiments",
    "Fun experiments lead to amazing data",
]

# Preprocessing: lower-case each document and split it into tokens; the
# token lists are what the Word2Vec step trains on.
tokenized_corpus = [word_tokenize(text) for text in map(str.lower, corpus)]
print("--- Data Loaded ---")

# --- 2. BAG OF WORDS (COUNT) ---
# Learn the vocabulary from the corpus and count term occurrences per document.
# 'count_vectorizer' and 'bow_matrix' are reused by the normalization step.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(corpus)

# Densify the sparse counts and label the columns with the vocabulary terms.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
print("\n--- Bag-of-Words (Count) ---", bow_df, sep="\n")

# --- 3. BAG OF WORDS (NORMALIZED) ---
# With IDF weighting switched off and L1 normalization, each document's raw
# counts are rescaled so the row sums to 1 (plain term frequencies).
tf_transformer = TfidfTransformer(norm='l1', use_idf=False)
normalized_matrix = tf_transformer.fit_transform(bow_matrix)

# Columns come from the count vectorizer, because the transformer operates on
# its output matrix.
normalized_df = pd.DataFrame(
    data=normalized_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
print("\n--- Normalized Counts (TF) ---", normalized_df.round(2), sep="\n")

# --- 4. TF-IDF ---
# TfidfVectorizer works directly on the raw documents, so it is fitted on
# 'corpus' rather than on the count matrix from the earlier steps.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Densify the weights and label the columns with this vectorizer's own vocabulary.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\n--- TF-IDF ---", tfidf_df.round(2), sep="\n")

# --- 5. WORD2VEC EMBEDDINGS ---
# Train a small Word2Vec model on the tokenized corpus.
# Training is stochastic: a fixed seed together with a single worker thread
# keeps the learned vectors repeatable from run to run (multi-threaded
# training adds scheduling-dependent ordering jitter).
# NOTE(review): the gensim docs also mention setting PYTHONHASHSEED for
# reproducibility across interpreter launches — confirm if exact values matter.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,  # dimensionality of each word vector
    window=2,        # context window size passed to Word2Vec
    min_count=1,     # keep every word, even those that appear only once
    seed=42,
    workers=1,
)

# Look up the learned vector for a single vocabulary word.
print("\n--- Word2Vec Embedding for 'science' ---")
print(model.wv['science'])

--- Data Loaded ---

--- Bag-of-Words (Count) ---
   amazing  and  data  experiments  fun  involves  is  lead  science  to
0        1    1     1            0    1         0   1     0        1   0
1        0    1     1            1    0         1   0     0        1   0
2        1    0     1            1    1         0   0     1        0   1

--- Normalized Counts (TF) ---
   amazing   and  data  experiments   fun  involves    is  lead  science    to
0     0.17  0.17  0.17         0.00  0.17       0.0  0.17  0.00     0.17  0.00
1     0.00  0.20  0.20         0.20  0.00       0.2  0.00  0.00     0.20  0.00
2     0.17  0.00  0.17         0.17  0.17       0.0  0.00  0.17     0.00  0.17

--- TF-IDF ---
   amazing   and  data  experiments   fun  involves    is  lead  science    to
0     0.40  0.40  0.31         0.00  0.40      0.00  0.52  0.00     0.40  0.00
1     0.00  0.43  0.34         0.43  0.00      0.57  0.00  0.00     0.43  0.00
2     0.38  0.00  0.29         0.38  0.38      0.00  0.00