In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file under the Kaggle input directory, printing its full path.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv


## 1. Extract the corpus: Movie Plots

In [2]:
# Load the Wikipedia movie plots CSV into a DataFrame.
df=pd.read_csv("/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")

In [3]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [4]:
# Number of plot summaries (taken from the top of the table) used as the corpus.
N_PLOTS=10000
corpus=df["Plot"][:N_PLOTS].to_list()
# Show one sample plot; a notebook cell displays only its last expression.
corpus[3]

'Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading "His Photographer" and "His Press Agent" respectively, follow him into the shot; the photographer sets up his camera. "Teddy" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. "Teddy" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. "Teddy" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.'

## 2. Next up is Preprocessing/Cleaning

### Import the tokenizer

In [5]:
from nltk.tokenize import word_tokenize

### Preprocess text (lowercase, remove punctuation, and tokenize)

In [6]:
import string

# Build the punctuation-deletion table once; str.translate then strips the
# characters in one pass per plot instead of filtering character by character.
strip_punctuation=str.maketrans('', '', string.punctuation)

tokens=[]
for i, plot in enumerate(corpus):
    # Lowercase first, then delete every character listed in string.punctuation.
    # The cleaned text is written back into `corpus`, as before.
    corpus[i]=plot.lower().translate(strip_punctuation)
    # Tokens of all plots are appended to one flat list.
    tokens.extend(word_tokenize(corpus[i]))

In [7]:
# Total number of tokens collected from all processed plots.
print(len(tokens))

3306739


### Record word frequency and create word index

In [8]:
# word_freq: token -> how many times it occurs in `tokens`
# word2idx:  token -> integer id, handed out in order of first appearance
word_freq={}
word2idx={}
count_idx=0 # next free id; ids start at 0

for token in tokens:
    seen=word_freq.get(token,0)
    if seen==0:
        # First occurrence of this token: assign it the next free id.
        word2idx[token]=count_idx
        count_idx+=1
    word_freq[token]=seen+1

In [9]:
# Number of distinct tokens, i.e. the vocabulary size.
print(len(word2idx))

74561


In [10]:
# Reverse lookup table: integer id -> token.
idx2word=dict(zip(word2idx.values(),word2idx.keys()))

### Debugging the bad context index error

In [11]:
# Check that id 0 maps to a token (a KeyError here would mean ids do not start at 0).
print(idx2word[0])

a


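**Debugging note:** the problem was a wrong vocabulary size. I had started the indices in `word2idx` from 1 instead of 0, while other parts of the code required indices starting at 0. With 0-based indices, a vocabulary of 17767 words uses the range 0 to 17766. (These figures do not match the vocabulary size printed above, so they presumably come from an earlier run.)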

In [12]:
# Spot-check the id assigned to one specific token.
print(word2idx["mounties"])

17765


## 3. Next up is Generating Training Pairs (word-context pairs)

In [13]:
# Build (centre word, context word) pairs: every token is paired with each
# neighbour that lies at most `window_size` positions to its left or right.
# NOTE(review): `tokens` is a single flat list over all plots, so a window can
# span the boundary between two consecutive plots.
window_size=2
training_pairs=[]
n_tokens=len(tokens)
for centre_pos,centre_word in enumerate(tokens):
    # Clamp the window to valid positions rather than skipping bad indices.
    first=max(0,centre_pos-window_size)
    last=min(n_tokens,centre_pos+window_size+1)
    for context_pos in range(first,last):
        if context_pos!=centre_pos:
            training_pairs.append((centre_word,tokens[context_pos]))

In [14]:
# Peek at the first 50 (centre, context) word pairs.
print(training_pairs[:50])

[('a', 'bartender'), ('a', 'is'), ('bartender', 'a'), ('bartender', 'is'), ('bartender', 'working'), ('is', 'a'), ('is', 'bartender'), ('is', 'working'), ('is', 'at'), ('working', 'bartender'), ('working', 'is'), ('working', 'at'), ('working', 'a'), ('at', 'is'), ('at', 'working'), ('at', 'a'), ('at', 'saloon'), ('a', 'working'), ('a', 'at'), ('a', 'saloon'), ('a', 'serving'), ('saloon', 'at'), ('saloon', 'a'), ('saloon', 'serving'), ('saloon', 'drinks'), ('serving', 'a'), ('serving', 'saloon'), ('serving', 'drinks'), ('serving', 'to'), ('drinks', 'saloon'), ('drinks', 'serving'), ('drinks', 'to'), ('drinks', 'customers'), ('to', 'serving'), ('to', 'drinks'), ('to', 'customers'), ('to', 'after'), ('customers', 'drinks'), ('customers', 'to'), ('customers', 'after'), ('customers', 'he'), ('after', 'to'), ('after', 'customers'), ('after', 'he'), ('after', 'fills'), ('he', 'customers'), ('he', 'after'), ('he', 'fills'), ('he', 'a'), ('fills', 'after')]


## 4. Next up is Converting Words to Indices

In [15]:
# Translate every (centre, context) word pair into the matching pair of ids.
training_data = [
    (word2idx[centre_word], word2idx[context_word])
    for centre_word, context_word in training_pairs
]

In [16]:
# The first 50 training pairs, now as (centre id, context id) tuples.
training_data[:50]

[(0, 1),
 (0, 2),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 0),
 (4, 2),
 (4, 3),
 (4, 0),
 (4, 5),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (5, 4),
 (5, 0),
 (5, 6),
 (5, 7),
 (6, 0),
 (6, 5),
 (6, 7),
 (6, 8),
 (7, 5),
 (7, 6),
 (7, 8),
 (7, 9),
 (8, 6),
 (8, 7),
 (8, 9),
 (8, 10),
 (9, 7),
 (9, 8),
 (9, 10),
 (9, 11),
 (10, 8),
 (10, 9),
 (10, 11),
 (10, 12),
 (11, 9),
 (11, 10),
 (11, 12),
 (11, 0),
 (12, 10)]

In [17]:
# Look up the id of a known word to cross-check against the pairs listed above.
word2idx["after"]

10

In [18]:
# Reverse lookup: which word was assigned id 121?
idx2word[121]

'hidden'

In [19]:
# Model dimensions used by the embedding tables defined below.
vocab_size=len(word2idx) # number of unique words in the corpus (one embedding row each)
print(vocab_size)
embedding_dim=100 # length of each word vector; a usual value for Word2Vec

74561


In [20]:
import torch

### Define the Embedding Layers and Word2Vec Class

In [21]:
# Stand-alone input and output embedding tables with one row per vocabulary id.
# NOTE(review): the Word2Vec class defined next creates its own embeddings and
# these two are not referenced in the later cells shown here; confirm they are
# still needed.
W_in=torch.nn.Embedding(vocab_size,embedding_dim)
W_out=torch.nn.Embedding(vocab_size,embedding_dim)

**Quick notes:** 1. I just learnt that the indices must be fed to the model in batches; looping through them one at a time is far too slow. 2. I also learnt the purpose of `forward()` in a PyTorch class: it defines how the module turns its inputs into outputs, and it is what runs when the model is called.

In [22]:
class Word2Vec(torch.nn.Module):
    """Skip-gram model with separate centre-word and context-word embeddings.

    Args:
        vocab_size: number of rows in each embedding table (one per word id).
        embedding_dim: length of every embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # in_embed holds the vectors used for centre words, out_embed the
        # vectors used for context words.
        self.in_embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = torch.nn.Embedding(vocab_size, embedding_dim)

    def forward(self,centre_idx, context_idx):
        """Score (centre, context) id pairs by the dot product of their vectors.

        The dot product is the measure of how close the model considers the
        two words. The embeddings start from random values, so the scores are
        poor at first and are adjusted through training.

        Args:
            centre_idx: integer tensor of centre-word ids.
            context_idx: integer tensor of context-word ids, same shape as
                centre_idx.

        Returns:
            Tensor of raw scores (no sigmoid applied) with the same shape as
            the index tensors.
        """
        centre_vector=self.in_embed(centre_idx)
        context_vector=self.out_embed(context_idx)
        # torch.dot only works for 1-D vectors, not batches, so multiply
        # element-wise and sum over the embedding axis. Reducing over the last
        # axis keeps the usual (batch,) result for 1-D index batches and also
        # accepts scalar or multi-dimensional index tensors.
        return torch.sum(centre_vector * context_vector, dim=-1)

### Adding negative sampling to the training data

In [23]:
import random
# Draw one random (centre id, context id) pair to eyeball the training data.
random.sample(training_data,1)[0]

(175, 122)

In [24]:
k=5 # number of negative samples drawn for each positive pair

def prepare_batch_with_negatives(batch_data, k, vocab_size):
    """Expand positive (centre, context) pairs into a labelled batch with negatives.

    Each positive pair yields one example labelled 1, followed by `k` examples
    labelled 0 that pair the same centre id with a context id drawn uniformly
    from 0 .. vocab_size - 1 (redrawn until it differs from the true context).

    Args:
        batch_data: iterable of (centre_idx, context_idx) integer pairs.
        k: number of negative samples per positive pair.
        vocab_size: number of ids in the vocabulary; negatives are drawn from
            0 .. vocab_size - 1.

    Returns:
        (centre_tensor, context_tensor, label_tensor): three 1-D tensors of
        equal length. The id tensors are torch.long, the labels torch.float.

    Raises:
        ValueError: if negatives are requested (k > 0) for a vocabulary with
            fewer than two ids, where no id other than the true context
            exists and the redraw loop could never finish.
    """
    centre_indices=[]
    context_indices=[]
    labels=[]
    for centre_idx, context_idx in batch_data:
        if k > 0 and vocab_size < 2:
            raise ValueError(
                f"cannot draw negative samples from a vocabulary of size {vocab_size}"
            )

        # Positive example.
        centre_indices.append(centre_idx)
        context_indices.append(context_idx)
        labels.append(1)

        # k negative examples sharing the same centre word.
        for _ in range(k):
            negative_idx = random.randint(0, vocab_size - 1)
            while negative_idx == context_idx:
                negative_idx = random.randint(0, vocab_size - 1)
            centre_indices.append(centre_idx)
            context_indices.append(negative_idx)
            labels.append(0)

    centre_tensor = torch.tensor(centre_indices, dtype=torch.long)
    context_tensor = torch.tensor(context_indices, dtype=torch.long)
    label_tensor   = torch.tensor(labels, dtype=torch.float)
    return centre_tensor, context_tensor, label_tensor

In [25]:
# Instantiate the model with one embedding row per vocabulary id.
model = Word2Vec(vocab_size, embedding_dim)

In [26]:
# Plain SGD over both embedding tables of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# Binary cross-entropy on probabilities: the training loop applies a sigmoid
# to the model's raw dot-product scores before passing them to this loss.
loss_fn = torch.nn.BCELoss()

In [27]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model's parameters to the chosen device.
model.to(device)

Word2Vec(
  (in_embed): Embedding(74561, 100)
  (out_embed): Embedding(74561, 100)
)

### The loss is tracked at two levels: first the loss per batch, and then the loss per epoch, which averages the batch losses across an epoch.

In [28]:
num_epochs=25
batch_size=100

for epoch in range(num_epochs):
    # Reshuffle the positive pairs in place so each epoch sees different batches.
    random.shuffle(training_data)
    total_loss=0.0
    num_batches=0

    for i in range(0, len(training_data), batch_size):
        batch_data = training_data[i : i + batch_size]

        # Fail early with a readable message if any id falls outside the
        # embedding tables (ids must satisfy 0 <= id < vocab_size).
        for center_idx, context_idx in batch_data:
            assert 0 <= center_idx < vocab_size, f"Bad center_idx: {center_idx}"
            assert 0 <= context_idx < vocab_size, f"Bad context_idx: {context_idx}"

        # Add k negative samples per positive pair and move the batch to the device.
        center_tensor, context_tensor, label_tensor = prepare_batch_with_negatives(batch_data, k, vocab_size)
        center_tensor = center_tensor.to(device)
        context_tensor = context_tensor.to(device)
        label_tensor = label_tensor.to(device)

        # Forward pass: raw dot-product scores -> probabilities.
        dot_products = model(center_tensor, context_tensor)
        predictions = torch.sigmoid(dot_products)

        # Compute loss
        loss = loss_fn(predictions, label_tensor)
        total_loss += loss.item()
        num_batches += 1

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if num_batches == 0:
        # Nothing was trained on, so there is no loss to report.
        print(f"Epoch {epoch+1}: no training data")
        continue

    # Report the loss of the final batch and the mean over all batches of the
    # epoch. Dividing by the number of batches actually processed also counts
    # a final partial batch and cannot divide by zero.
    print(f"Epoch {epoch+1}, Last-batch loss: {loss.item():.4f}")
    print(f"Epoch {epoch+1}, Mean loss: {total_loss / num_batches:.4f}")

Epoch 1, Loss: 6.1327
Epoch 1, Loss: 7.0067
Epoch 2, Loss: 5.6022
Epoch 2, Loss: 6.4946
Epoch 3, Loss: 5.1511
Epoch 3, Loss: 6.1165
Epoch 4, Loss: 5.1230
Epoch 4, Loss: 5.8583
Epoch 5, Loss: 4.9864
Epoch 5, Loss: 5.6634
Epoch 6, Loss: 5.7705
Epoch 6, Loss: 5.5045
Epoch 7, Loss: 5.4661
Epoch 7, Loss: 5.3807
Epoch 8, Loss: 3.7133
Epoch 8, Loss: 5.2822
Epoch 9, Loss: 5.4758
Epoch 9, Loss: 5.1956
Epoch 10, Loss: 7.0391
Epoch 10, Loss: 5.1223
Epoch 11, Loss: 4.3136
Epoch 11, Loss: 5.0490
Epoch 12, Loss: 5.0255
Epoch 12, Loss: 4.9874
Epoch 13, Loss: 5.0535
Epoch 13, Loss: 4.9259
Epoch 14, Loss: 4.1857
Epoch 14, Loss: 4.8735
Epoch 15, Loss: 6.2504
Epoch 15, Loss: 4.8269
Epoch 16, Loss: 3.6495
Epoch 16, Loss: 4.7871
Epoch 17, Loss: 3.5511
Epoch 17, Loss: 4.7450
Epoch 18, Loss: 4.1976
Epoch 18, Loss: 4.7122
Epoch 19, Loss: 4.6436
Epoch 19, Loss: 4.6767
Epoch 20, Loss: 6.6524
Epoch 20, Loss: 4.6491
Epoch 21, Loss: 5.2884
Epoch 21, Loss: 4.6155
Epoch 22, Loss: 3.9520
Epoch 22, Loss: 4.5838
Epoch 

In [29]:
# Use the trained centre-word ("input") embedding table as the word vectors.
# .detach() returns the weights without gradient tracking and is the
# recommended replacement for the legacy `.data` attribute.
word_vectors = model.in_embed.weight.detach()  # Tensor: [vocab_size, embedding_dim]
word = "add"  # Example query word
idx = word2idx[word]  # raises KeyError if the word is not in the vocabulary
target_vector = word_vectors[idx]

In [30]:
# Cosine similarity between the query vector and every row of the embedding
# matrix. unsqueeze(0) adds a leading axis to the query so that it is compared
# against each row of word_vectors along dim=1.
cosine_similarities = torch.nn.functional.cosine_similarity(
    target_vector.unsqueeze(0), word_vectors, dim=1
)

In [31]:
# Take the ids of the ten highest similarities and map them back to words.
# The query word's own row is part of word_vectors, so the query word itself
# appears in the ranking.
top_indices = torch.topk(cosine_similarities, k=10).indices.tolist()
similar_words = [idx2word[i] for i in top_indices]
print(similar_words)

['add', 'bigamist', 'contested', 'prokoszny', 'lots', 'tada', 'summer', 'stokowskis', 'hussars', 'görings']


In [32]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

# tsne = TSNE(n_components=2, random_state=0)
# word_vectors_cpu = word_vectors.cpu().numpy()
# embeddings_2d = tsne.fit_transform(word_vectors_cpu)

# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
# plt.show()