In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv


## 1. Extract the corpus: Movie Plots

In [2]:
df=pd.read_csv("/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [4]:
# Restrict the corpus to the first N_PLOTS plot summaries
# (presumably to keep runtime and memory manageable — adjust as needed).
N_PLOTS = 15000
corpus = df["Plot"].iloc[:N_PLOTS].to_list()

# A bare `len(corpus)` in the middle of a cell is evaluated and thrown away
# (only the last expression is displayed), so print it explicitly.
print(len(corpus))
corpus[3]

'Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading "His Photographer" and "His Press Agent" respectively, follow him into the shot; the photographer sets up his camera. "Teddy" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. "Teddy" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. "Teddy" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.'

## 2. Next up is Preprocessing/Cleaning

### Import the tokenizer

In [5]:
from nltk.tokenize import word_tokenize

### Preprocess text (lowercase, remove punctuation, and tokenize)

In [6]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once and applied with str.translate, instead of testing each character
# of every plot against string.punctuation in a Python-level loop.
# Note: string.punctuation is ASCII-only, so typographic quotes/dashes are kept.
_strip_punct = str.maketrans('', '', string.punctuation)

tokens = []
for i, plot in enumerate(corpus):
    # Lower-case and strip punctuation; the cleaned text is written back into
    # `corpus` (as before) and its tokens are appended to one flat token list.
    cleaned = plot.lower().translate(_strip_punct)
    corpus[i] = cleaned
    tokens.extend(word_tokenize(cleaned))

In [7]:
print(len(tokens))

5887100


### Record word frequency and create word index

In [8]:
# One pass over the token stream: tally occurrences and hand out 1-based ids
# in order of first appearance.
word_freq = {}
word2idx = {}

for tok in tokens:
    word_freq[tok] = word_freq.get(tok, 0) + 1
    if tok not in word2idx:
        word2idx[tok] = len(word2idx) + 1

# Next id that would be assigned (one past the largest id in use).
count_idx = len(word2idx) + 1

In [9]:
print(len(word2idx))

102299


In [10]:
# Reverse lookup: integer id -> word (ids are unique, so no entries collide).
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [11]:
print(idx2word[100])

left


In [12]:
print(word2idx["left"])

100


## 3. Next up is Generating Training Pairs (word-context pairs)

In [13]:
window_size = 2  # number of context words taken on each side of the centre word
training_pairs = []
n_tokens = len(tokens)

# NOTE(review): `tokens` is one flat stream built from all plots, so a window near
# the end of one plot reaches into the start of the next — confirm this is intended.
for centre_pos, centre_word in enumerate(tokens):
    # Clamp the window to the valid index range instead of skipping out-of-range positions.
    first = max(centre_pos - window_size, 0)
    last = min(centre_pos + window_size, n_tokens - 1)
    for ctx_pos in range(first, last + 1):
        if ctx_pos != centre_pos:
            training_pairs.append((centre_word, tokens[ctx_pos]))

In [14]:
print(training_pairs[:50])

[('a', 'bartender'), ('a', 'is'), ('bartender', 'a'), ('bartender', 'is'), ('bartender', 'working'), ('is', 'a'), ('is', 'bartender'), ('is', 'working'), ('is', 'at'), ('working', 'bartender'), ('working', 'is'), ('working', 'at'), ('working', 'a'), ('at', 'is'), ('at', 'working'), ('at', 'a'), ('at', 'saloon'), ('a', 'working'), ('a', 'at'), ('a', 'saloon'), ('a', 'serving'), ('saloon', 'at'), ('saloon', 'a'), ('saloon', 'serving'), ('saloon', 'drinks'), ('serving', 'a'), ('serving', 'saloon'), ('serving', 'drinks'), ('serving', 'to'), ('drinks', 'saloon'), ('drinks', 'serving'), ('drinks', 'to'), ('drinks', 'customers'), ('to', 'serving'), ('to', 'drinks'), ('to', 'customers'), ('to', 'after'), ('customers', 'drinks'), ('customers', 'to'), ('customers', 'after'), ('customers', 'he'), ('after', 'to'), ('after', 'customers'), ('after', 'he'), ('after', 'fills'), ('he', 'customers'), ('he', 'after'), ('he', 'fills'), ('he', 'a'), ('fills', 'after')]


## 4. Next up is Converting Words to Indices

In [15]:
# Map each (centre word, context word) pair to its (centre id, context id) pair.
train_data = [
    (word2idx[centre_word], word2idx[context_word])
    for centre_word, context_word in training_pairs
]

In [16]:
train_data[:50]

[(1, 2),
 (1, 3),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 5),
 (4, 2),
 (4, 3),
 (4, 5),
 (4, 1),
 (5, 3),
 (5, 4),
 (5, 1),
 (5, 6),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (6, 5),
 (6, 1),
 (6, 7),
 (6, 8),
 (7, 1),
 (7, 6),
 (7, 8),
 (7, 9),
 (8, 6),
 (8, 7),
 (8, 9),
 (8, 10),
 (9, 7),
 (9, 8),
 (9, 10),
 (9, 11),
 (10, 8),
 (10, 9),
 (10, 11),
 (10, 12),
 (11, 9),
 (11, 10),
 (11, 12),
 (11, 13),
 (12, 10),
 (12, 11),
 (12, 13),
 (12, 1),
 (13, 11)]

In [17]:
word2idx["after"]

11

## Now on to training with PyTorch

In [18]:
# word2idx hands out ids starting at 1, so the largest id equals len(word2idx).
# An embedding table with R rows only accepts ids 0..R-1, so it needs
# len(word2idx) + 1 rows; without the +1 the last word's id is out of range.
# Row 0 is simply never looked up.
vocab_size = len(word2idx) + 1  # unique words in the corpus, plus the unused id 0
embedding_dim = 100  # a usual value for Word2Vec

In [19]:
import torch

### Define the Embedding Layers and Word2Vec Class

In [20]:
# Stand-alone centre ("in") and context ("out") embedding tables:
# vocab_size rows of embedding_dim values each.
W_in = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
W_out = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

**Quick notes:** (1) Indices must be fed to the model in batches; looping over them one at a time is far too slow. (2) `forward()` in a PyTorch module defines what the model computes from its inputs — it is the method that runs when the model is called.

In [21]:
class Word2Vec(torch.nn.Module):
    """Word2Vec-style scorer with separate centre ("in") and context ("out") embeddings.

    Args:
        num_embeddings: Number of rows in each embedding table. When omitted,
            the notebook-level ``vocab_size`` is used.
        embedding_size: Length of each embedding vector. When omitted, the
            notebook-level ``embedding_dim`` is used.
    """

    def __init__(self, num_embeddings=None, embedding_size=None):
        super().__init__()
        # Fall back to the notebook globals so a plain `Word2Vec()` keeps working,
        # while still allowing the sizes to be passed explicitly.
        if num_embeddings is None:
            num_embeddings = vocab_size
        if embedding_size is None:
            embedding_size = embedding_dim
        self.in_embed = torch.nn.Embedding(num_embeddings, embedding_size)
        self.out_embed = torch.nn.Embedding(num_embeddings, embedding_size)

    def forward(self, centre_idx, context_idx):
        """Score each (centre, context) id pair by the dot product of its embeddings.

        Args:
            centre_idx: Integer tensor of centre-word ids.
            context_idx: Integer tensor of context-word ids, same shape as ``centre_idx``.

        Returns:
            Tensor with the same shape as the inputs holding one dot product per pair.
            A larger value means the two vectors are more aligned. The embeddings
            start from random values, so early scores are poor and are adjusted
            through training.
        """
        centre_vector = self.in_embed(centre_idx)
        context_vector = self.out_embed(context_idx)
        # torch.dot only works on 1-D vectors, not batches, so multiply element-wise
        # and sum over the embedding axis. Summing over the last axis (rather than
        # axis 1) also handles a single, un-batched pair of ids.
        return torch.sum(centre_vector * context_vector, dim=-1)

### Adding negative sampling to the training data

In [22]:
import random
random.sample(train_data,1)[0]

(1866, 1435)

In [23]:
k = 5  # negative samples drawn for every positive (centre, context) pair


def prepare_batch_with_negatives(batch_data, k, vocab_size):
    """Expand a batch of positive pairs with randomly drawn negative pairs.

    Each ``(centre_idx, context_idx)`` pair in ``batch_data`` is emitted once with
    label 1, followed by ``k`` pairs that keep the same centre id but use a random
    id from ``[0, vocab_size - 1]`` (never equal to the true context id) with label 0.

    Args:
        batch_data: Iterable of ``(centre_idx, context_idx)`` integer pairs.
        k: Number of negative samples to draw per positive pair.
        vocab_size: Exclusive upper bound for the sampled negative ids.

    Returns:
        Tuple ``(centre_tensor, context_tensor, label_tensor)`` of 1-D tensors with
        ``len(batch_data) * (1 + k)`` entries each; the first two are ``torch.long``
        and the labels are ``torch.float``.

    Raises:
        ValueError: If negatives are requested but ``vocab_size < 2``, in which case
            the rejection loop below might never find an id different from the context.
    """
    if k > 0 and vocab_size < 2:
        raise ValueError("vocab_size must be at least 2 to draw negative samples")

    # Fresh lists on every call: module-level accumulators would make each batch
    # also contain every previously prepared batch.
    centre_indices = []
    context_indices = []
    labels = []

    for centre_idx, context_idx in batch_data:
        # Positive (observed) pair
        centre_indices.append(centre_idx)
        context_indices.append(context_idx)
        labels.append(1)

        # k negative pairs: same centre word, random "context" word
        for _ in range(k):
            negative_idx = random.randint(0, vocab_size - 1)
            # Re-draw until the negative differs from the true context id
            while negative_idx == context_idx:
                negative_idx = random.randint(0, vocab_size - 1)
            centre_indices.append(centre_idx)
            context_indices.append(negative_idx)
            labels.append(0)

    centre_tensor = torch.tensor(centre_indices, dtype=torch.long)
    context_tensor = torch.tensor(context_indices, dtype=torch.long)
    label_tensor = torch.tensor(labels, dtype=torch.float)
    return centre_tensor, context_tensor, label_tensor