In [4]:
# Import important libraries
!pip install gensim
import nltk
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import gensim
from gensim.models import Word2Vec, FastText

# Fetch the NLTK resources used by this notebook: the movie-review corpus,
# the Punkt tokenizer data and the stop-word lists.
for resource in ('movie_reviews', 'punkt', 'stopwords'):
    nltk.download(resource)

# Left as a bare final expression so the cell still displays its return value.
nltk.download('punkt_tab')



[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Load dataset: read every review in the NLTK movie_reviews corpus.
review_ids = movie_reviews.fileids()
# Raw text of each review, in the order the corpus lists its files.
documents = list(map(movie_reviews.raw, review_ids))
# The label is the part of the file id that precedes the first '/'.
labels = [review_id.partition('/')[0] for review_id in review_ids]

In [11]:
# One row per review: the raw text alongside its label.
df = pd.DataFrame(list(zip(documents, labels)), columns=['document', 'label'])

In [12]:
# Preview the first rows to check the 'document' and 'label' columns.
df.head()

Unnamed: 0,document,label
0,"plot : two teen couples go to a church party ,...",neg
1,the happy bastard's quick movie review \ndamn ...,neg
2,it is movies like these that make a jaded movi...,neg
3,""" quest for camelot "" is warner bros . ' firs...",neg
4,synopsis : a mentally unstable man undergoing ...,neg


In [13]:
# Summarise both columns (count, unique, top, freq) as a quick sanity check.
df.describe()

Unnamed: 0,document,label
count,2000,2000
unique,2000,2
top,"plot : two teen couples go to a church party ,...",neg
freq,1,1000


In [5]:
# Preprocessing
# Build the stop-word lookup once. Previously the stop-word list was rebuilt
# and scanned linearly for every single token, which made this cell very slow.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Normalise one raw document into a space-separated string of tokens.

    Steps: lowercase the text, delete every character that is neither an
    ASCII letter nor whitespace, tokenize with ``word_tokenize``, then drop
    English stop words and tokens of two characters or fewer.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    text = text.lower()
    # Characters are deleted rather than replaced by a space, so "isn't"
    # becomes "isnt" and hyphenated words are fused together.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    # Cheap length check first, then an O(1) set lookup.
    tokens = [t for t in tokens if len(t) > 2 and t not in ENGLISH_STOPWORDS]
    return ' '.join(tokens)

clean_docs = [preprocess(doc) for doc in documents]

In [6]:
# === 2. Bag of Words (BoW) ===
# CountVectorizer is already imported in the notebook's import cell, so it is
# not re-imported here.

# Limit the vocabulary to the 1000 most frequent terms across the corpus.
vectorizer = CountVectorizer(max_features=1000)
# Learn the vocabulary and build the sparse document-term count matrix.
bow_matrix = vectorizer.fit_transform(clean_docs)

print("BoW Matrix Shape:", bow_matrix.shape)

BoW Matrix Shape: (2000, 1000)


In [14]:
# === 3. TF-IDF ===
# TfidfVectorizer is already imported in the notebook's import cell, so it is
# not re-imported here.

# Limit the vocabulary to the 1000 most frequent terms across the corpus.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Learn vocabulary and IDF weights, then build the TF-IDF document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_docs)

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (2000, 1000)


In [15]:
# === 4. Word2Vec ===
# Split each cleaned document on whitespace into its list of tokens.
tokenized_docs = list(map(str.split, clean_docs))

# Training settings gathered in one place, then passed through to the model.
w2v_settings = dict(vector_size=100, window=5, min_count=2, workers=4)
w2v_model = Word2Vec(sentences=tokenized_docs, **w2v_settings)
w2v_model.save("word2vec.model")

# Example usage
print("Word2Vec - Most similar to 'good':")
good_neighbours = w2v_model.wv.most_similar('good')
print(good_neighbours)

Word2Vec - Most similar to 'good':
[('bad', 0.9700337052345276), ('great', 0.9393161535263062), ('funny', 0.9289847016334534), ('well', 0.9097281098365784), ('isnt', 0.904741644859314), ('pretty', 0.9004055261611938), ('gotcha', 0.9001251459121704), ('acting', 0.8968760967254639), ('darned', 0.892906665802002), ('nice', 0.8915361166000366)]


In [18]:
# Importing Data
# Upload the GloVe vectors through the Colab file picker, but only when the
# file is not already on disk: re-running the cell would otherwise prompt for
# the same file again and block an unattended run.
import os
from google.colab import files

if os.path.exists('glove.6B.100d.txt'):
    uploaded = {}  # nothing new was uploaded on this run
else:
    uploaded = files.upload()


Saving glove.6B.100d.txt to glove.6B.100d.txt


In [19]:
# === 5. GloVe (Pretrained) ===
# Make sure you've downloaded `glove.6B.100d.txt` from https://nlp.stanford.edu/projects/glove/ or from Kaggle

def load_glove(path):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        path: Location of the GloVe ``.txt`` file.

    Returns:
        dict mapping each word to a float32 NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component is not numeric.
    """
    vectors = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. an empty line at the end of the file):
                # indexing parts[0] here would raise IndexError.
                continue
            vectors[parts[0]] = np.array(parts[1:], dtype='float32')
    return vectors


glove_path = 'glove.6B.100d.txt'  # <-- Change this to your actual path
glove_model = load_glove(glove_path)

print("GloVe - Vectors Loaded:", len(glove_model))
print("GloVe - Vector for 'film':", glove_model.get('film'))

GloVe - Vectors Loaded: 400000
GloVe - Vector for 'film': [ 0.19916  -0.049702  0.24579  -0.32281   0.89768  -0.1278   -0.49506
  0.20814  -0.20046  -0.20604   0.038292 -0.67277  -0.12689  -0.18766
 -0.10277   0.73128   0.82408   0.087288  0.69255   1.3107    0.49113
 -0.38097   0.24338  -0.27813   0.62506   0.35978   0.42041  -0.24529
  0.14861  -0.26726  -0.56262   0.63843  -0.54153   0.36537   0.20545
 -0.16604   0.72434   0.29961  -0.42501  -0.35932  -0.089288  0.48752
 -1.0927    0.88818   0.89941  -0.7541   -0.35492  -0.76396   0.27468
  0.2757   -0.48152  -0.41399   0.64489   1.148    -0.29131  -2.9387
 -0.83162   0.95586   1.1623   -0.42502   0.15486   2.2326   -0.31339
 -0.030228  0.79802  -0.41302   0.72885   0.7296   -0.31909   0.8956
  0.34625   0.2923    0.40056   0.78985  -0.43999   0.24698  -0.46548
  0.055886 -0.62603  -0.036487 -0.65429   0.10563   0.17435   0.35466
 -1.9403   -0.022502 -0.7302   -0.63042  -0.032799 -0.43953  -0.07239
 -0.44875  -0.074689 -0.14426   0.

In [20]:
# === 6. FastText ===
# Trains on tokenized_docs (built in an earlier cell); settings gathered in
# one place, then passed through to the model.
fasttext_settings = dict(vector_size=100, window=5, min_count=2, workers=4)
fasttext_model = FastText(sentences=tokenized_docs, **fasttext_settings)
fasttext_model.save("fasttext.model")

print("FastText - Most similar to 'film':")
film_neighbours = fasttext_model.wv.most_similar('film')
print(film_neighbours)

FastText - Most similar to 'film':
[('ilm', 0.9890522956848145), ('filmy', 0.9821617603302002), ('filmthe', 0.9778387546539307), ('filmwithinafilm', 0.9731557369232178), ('films', 0.9724718332290649), ('filmdom', 0.9723955392837524), ('bfilms', 0.9682489633560181), ('filmnoir', 0.9678707718849182), ('movies', 0.9574015140533447), ('filmic', 0.955020546913147)]
