Word2Vec Training



In [None]:
import sys
import subprocess


# Install this cell's third-party dependencies into the interpreter that runs
# the kernel. check=True makes a failed install raise here instead of showing
# up later as a confusing ImportError.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "gensim", "nltk"],
    check=True,
)

import nltk
from gensim.models import Word2Vec
from nltk.corpus import movie_reviews

# Make sure the movie_reviews corpus is available locally before reading it.
nltk.download("movie_reviews")

# One token list per review file; each whole review is handed to Word2Vec as a
# single "sentence".
corpus = [
    list(tokens)
    for tokens in map(movie_reviews.words, movie_reviews.fileids())
]

# 100-dimensional vectors, +/-5 word window, words seen fewer than 5 times are
# dropped, 4 training threads.
word2vec_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=5, workers=4)

# Persist the trained model; the visualization section loads this file by name.
word2vec_model.save("word2vec.model")


GloVe Training


In [None]:
import sys
import subprocess


# Install this cell's third-party dependencies into the interpreter that runs
# the kernel. check=True makes a failed install raise here instead of showing
# up later as a confusing ImportError.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "numpy", "nltk"],
    check=True,
)

import numpy as np
import nltk
from nltk.corpus import movie_reviews


nltk.download("movie_reviews")

# --- Hyperparameters (X_MAX and ALPHA are the x_max / alpha of the GloVe paper) ---
context_window = 2      # number of words counted on EACH side of the centre word
EMBEDDING_DIM = 50
X_MAX = 100.0           # co-occurrence counts at or above this get full loss weight
ALPHA = 0.75            # exponent of the GloVe weighting function
LEARNING_RATE = 0.05
NUM_EPOCHS = 25
BATCH_SIZE = 8192
SEED = 42

# Flat token stream over all reviews. Reviews are simply concatenated, so a
# window at the end of one review can reach into the start of the next.
all_tokens = []
for file_id in movie_reviews.fileids():
    all_tokens.extend(movie_reviews.words(file_id))

# sorted() pins the word -> row mapping; the iteration order of a set of
# strings is not stable across interpreter runs, which would make the saved
# rows impossible to map back to words.
vocabulary = sorted(set(all_tokens))
word_index = {word: idx for idx, word in enumerate(vocabulary)}
vocab_size = len(vocabulary)

token_ids = np.fromiter(
    (word_index[token] for token in all_tokens),
    dtype=np.int64,
    count=len(all_tokens),
)

# --- Sparse, symmetric co-occurrence counts ---
# A dense (vocab_size x vocab_size) float64 array does not fit in memory for a
# vocabulary of tens of thousands of words, so only the non-zero cells are
# kept, as parallel arrays (co_rows[k], co_cols[k]) -> co_counts[k].
# Each (centre, context) pair is packed into one int64 key so that np.unique
# can count the pairs in a single pass.
pair_keys = []
for offset in range(1, context_window + 1):
    earlier, later = token_ids[:-offset], token_ids[offset:]
    pair_keys.append(earlier * vocab_size + later)   # context to the right of centre
    pair_keys.append(later * vocab_size + earlier)   # context to the left of centre
unique_keys, pair_counts = np.unique(np.concatenate(pair_keys), return_counts=True)
co_rows, co_cols = np.divmod(unique_keys, vocab_size)
co_counts = pair_counts.astype(np.float64)

# --- GloVe training ---
# Minimises  sum f(X_ij) * (w_i . w~_j + b_i + b~_j - log X_ij)^2  over the
# non-zero cells, with f(x) = min(1, (x / X_MAX) ** ALPHA), using mini-batch
# AdaGrad.
rng = np.random.default_rng(SEED)
init_scale = 0.5 / EMBEDDING_DIM
center_vecs = rng.uniform(-init_scale, init_scale, (vocab_size, EMBEDDING_DIM))
context_vecs = rng.uniform(-init_scale, init_scale, (vocab_size, EMBEDDING_DIM))
center_bias = np.zeros(vocab_size)
context_bias = np.zeros(vocab_size)

# AdaGrad accumulators start at 1 so the first division is well defined.
center_vecs_acc = np.ones_like(center_vecs)
context_vecs_acc = np.ones_like(context_vecs)
center_bias_acc = np.ones_like(center_bias)
context_bias_acc = np.ones_like(context_bias)

log_counts = np.log(co_counts)
pair_weights = np.minimum(1.0, (co_counts / X_MAX) ** ALPHA)


def _sum_by_index(index, values):
    """Sum the rows of ``values`` that share the same entry in ``index``.

    Returns ``(unique_index, summed_values)`` with ``unique_index`` sorted.
    ``index`` must be non-empty.
    """
    order = np.argsort(index, kind="stable")
    sorted_index = index[order]
    # Positions where a new run of equal indices begins.
    starts = np.flatnonzero(np.r_[True, sorted_index[1:] != sorted_index[:-1]])
    return sorted_index[starts], np.add.reduceat(values[order], starts, axis=0)


def _adagrad_step(param, accum, index, grads):
    """Apply one AdaGrad update, in place, to the rows of ``param`` named by ``index``."""
    # A word can occur many times in one batch: merge its gradients first so
    # every parameter row is updated exactly once per step.
    rows, summed = _sum_by_index(index, grads)
    accum[rows] += summed ** 2
    param[rows] -= LEARNING_RATE * summed / np.sqrt(accum[rows])


for epoch in range(NUM_EPOCHS):
    epoch_loss = 0.0
    shuffled = rng.permutation(len(co_counts))
    for begin in range(0, len(shuffled), BATCH_SIZE):
        batch = shuffled[begin:begin + BATCH_SIZE]
        rows, cols = co_rows[batch], co_cols[batch]
        # Fancy indexing copies, so both gradients below use pre-update values.
        center, context = center_vecs[rows], context_vecs[cols]
        error = (
            np.einsum("ij,ij->i", center, context)
            + center_bias[rows]
            + context_bias[cols]
            - log_counts[batch]
        )
        weighted_error = pair_weights[batch] * error
        epoch_loss += 0.5 * float(weighted_error @ error)

        _adagrad_step(center_vecs, center_vecs_acc, rows, weighted_error[:, None] * context)
        _adagrad_step(context_vecs, context_vecs_acc, cols, weighted_error[:, None] * center)
        _adagrad_step(center_bias, center_bias_acc, rows, weighted_error)
        _adagrad_step(context_bias, context_bias_acc, cols, weighted_error)
    print(f"epoch {epoch + 1}/{NUM_EPOCHS}  loss={epoch_loss:.1f}")

# As in the GloVe paper, the final embedding of a word is the sum of its
# centre and context vectors. Row i belongs to vocabulary[i].
word_embeddings = center_vecs + context_vecs
np.save("glove_embeddings.npy", word_embeddings)


TF-IDF + Logistic Regression

In [None]:
import sys
import subprocess


# Install this cell's third-party dependencies into the interpreter that runs
# the kernel. check=True makes a failed install raise here instead of showing
# up later as a confusing ImportError.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "scikit-learn", "nltk"],
    check=True,
)

import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Make sure the movie_reviews corpus is available locally before reading it.
nltk.download("movie_reviews")

# One whitespace-joined document and one binary target per review file.
documents, targets = [], []
for file_id in movie_reviews.fileids():
    # The review's first category decides the label: "pos" -> 1, otherwise 0.
    targets.append(int(movie_reviews.categories(file_id)[0] == "pos"))
    documents.append(" ".join(movie_reviews.words(file_id)))

# Hold out 20% of the reviews; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    targets,
    test_size=0.2,
    random_state=42,
)

# The vocabulary and IDF weights are learned from the training split only and
# then re-used, unchanged, on the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

classifier = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Report accuracy on the held-out reviews.
predictions = classifier.predict(X_test_vec)
print(accuracy_score(y_test, predictions))


LSTM Sentiment Classifier


In [None]:
import subprocess, sys
# Install this cell's third-party dependencies into the interpreter that runs
# the kernel. check=True makes a failed install raise here instead of showing
# up later as a confusing ImportError.
# NOTE(review): torchtext is installed but never imported in this cell -
# confirm it is still needed before keeping it in the list.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "torch", "torchtext", "nltk"],
    check=True,
)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.corpus import movie_reviews

# Make sure the movie_reviews corpus is available locally before reading it.
nltk.download("movie_reviews")

# One whitespace-joined string per review file.
texts = list(map(" ".join, map(movie_reviews.words, movie_reviews.fileids())))

# Matching binary labels: 1 when the review's first category is "pos", else 0.
labels = [
    int(movie_reviews.categories(fid)[0] == "pos")
    for fid in movie_reviews.fileids()
]

class TextDataset(Dataset):
    """Map-style dataset pairing each text with its label.

    Args:
        texts: Sequence of texts.
        labels: Sequence of labels aligned with ``texts`` (same length).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        # Fail fast: a length mismatch would otherwise only surface as an
        # IndexError on some late index, or as silently ignored extra labels.
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# The dataset yields raw review strings, but the model feeds its input to an
# nn.Embedding, which needs integer token ids. Batches are therefore encoded
# and padded in a collate function before they reach the model.
MAX_VOCAB_SIZE = 10000  # must not exceed the 10000-row embedding table of LSTMModel
MAX_SEQ_LEN = 500       # longer reviews are truncated to bound the LSTM's cost
PAD_ID, UNK_ID = 0, 1   # reserved ids: padding and out-of-vocabulary tokens

# The texts were built by joining tokens with single spaces, so split()
# recovers the original tokens.
token_counts = {}
for text in texts:
    for token in text.split():
        token_counts[token] = token_counts.get(token, 0) + 1

# Most frequent tokens first; ties are broken alphabetically so that the ids
# are reproducible. Ids 0 and 1 stay reserved for PAD_ID / UNK_ID.
ranked_tokens = sorted(token_counts, key=lambda tok: (-token_counts[tok], tok))
token_to_id = {
    tok: position + 2
    for position, tok in enumerate(ranked_tokens[: MAX_VOCAB_SIZE - 2])
}


def collate_batch(batch):
    """Turn a list of ``(text, label)`` pairs into model-ready tensors.

    Returns:
        token_ids: int64 tensor of shape (batch, longest_sequence), left-padded
            with ``PAD_ID``; unknown tokens map to ``UNK_ID``.
        targets: float32 tensor of shape (batch,) holding the labels.
    """
    encoded = [
        [token_to_id.get(tok, UNK_ID) for tok in text.split()[:MAX_SEQ_LEN]]
        for text, _ in batch
    ]
    width = max(1, max(len(ids) for ids in encoded))
    token_ids = torch.full((len(encoded), width), PAD_ID, dtype=torch.long)
    for row, ids in enumerate(encoded):
        if ids:
            # Left-pad so the LSTM's final hidden state comes from a real
            # token rather than from trailing padding.
            token_ids[row, width - len(ids):] = torch.tensor(ids, dtype=torch.long)
    targets = torch.tensor([label for _, label in batch], dtype=torch.float32)
    return token_ids, targets


dataset = TextDataset(texts, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)

class LSTMModel(nn.Module):
    """Single-layer LSTM classifier that emits one raw logit per sequence.

    Args:
        vocab_size: Number of rows in the embedding table; every token id fed
            to the model must lie in ``[0, vocab_size)``.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.

    The defaults reproduce the previously hard-coded 10000 / 128 / 128.
    """

    def __init__(self, vocab_size=10000, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len) holding token ids.

        Returns:
            Tensor of shape (batch, 1) with raw logits (no sigmoid applied).
        """
        embedded = self.embed(x)
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (hidden, _) = self.lstm(embedded)
        # hidden is (num_layers, batch, hidden_dim); [-1] selects the last layer.
        return self.fc(hidden[-1])


model = LSTMModel()

Embedding Visualization

In [None]:
import subprocess, sys
# Install this cell's third-party dependencies into the interpreter that runs
# the kernel. check=True makes a failed install raise here instead of showing
# up later as a confusing ImportError.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "matplotlib", "scikit-learn", "gensim"],
    check=True,
)

import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load the model written by the Word2Vec training section.
model = Word2Vec.load("word2vec.model")

# Project only the first 200 vocabulary entries to keep the plots readable.
words = list(model.wv.index_to_key)[:200]
# Indexing the KeyedVectors with a list of keys returns a single 2-D array
# (one row per word). A plain Python list of vectors is not safe to pass to
# TSNE.fit_transform: recent scikit-learn releases read X.shape before
# validating the input.
vectors = model.wv[words]

pca = PCA(n_components=2)
pca_result = pca.fit_transform(vectors)

# scikit-learn requires perplexity < number of samples, so cap it for small
# vocabularies; random_state makes the t-SNE layout reproducible.
tsne = TSNE(n_components=2, perplexity=min(30, len(words) - 1), random_state=42)
tsne_result = tsne.fit_transform(vectors)

fig, ax = plt.subplots()
ax.scatter(pca_result[:, 0], pca_result[:, 1])
ax.set(title="Word2Vec embeddings (PCA)", xlabel="Component 1", ylabel="Component 2")
plt.show()

fig, ax = plt.subplots()
ax.scatter(tsne_result[:, 0], tsne_result[:, 1])
ax.set(title="Word2Vec embeddings (t-SNE)", xlabel="Dimension 1", ylabel="Dimension 2")
plt.show()