In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv


## 1. Extract the corpus: Movie Plots

In [2]:
# Load the Wikipedia movie-plots CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")

In [3]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [4]:
# Working corpus: the first 15,000 entries of the "Plot" column, as a plain list of strings.
# `.iloc` makes it explicit that the slice is positional.
corpus = df["Plot"].iloc[:15000].to_list()

# Only the last expression of a cell is displayed automatically, so the corpus
# size has to be printed explicitly or it is silently discarded.
print(len(corpus))

# Show one raw plot as a sample of the text before any cleaning.
corpus[3]

'Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading "His Photographer" and "His Press Agent" respectively, follow him into the shot; the photographer sets up his camera. "Teddy" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. "Teddy" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. "Teddy" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.'

## 2. Next up is Preprocessing/Cleaning

### Import the tokenizer

In [5]:
from nltk.tokenize import word_tokenize

### Preprocess text (lowercase, remove punctuation, and tokenize)

In [6]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once and applied per document with str.translate, which gives the same
# result as filtering character by character but without a Python-level loop.
punctuation_table = str.maketrans('', '', string.punctuation)

# Flat list of tokens from all plots, in corpus order.
tokens = []
for doc_idx, raw_plot in enumerate(corpus):
    # Lowercase and strip punctuation. The cleaned text is written back into
    # `corpus`, so after this cell `corpus` holds cleaned text, not raw plots.
    # NOTE: punctuation is deleted rather than replaced by a space, so words
    # joined by a hyphen or apostrophe are fused into one token.
    corpus[doc_idx] = raw_plot.lower().translate(punctuation_table)
    tokens.extend(word_tokenize(corpus[doc_idx]))

In [7]:
# Total number of tokens collected across all processed plots.
print(len(tokens))

5887100


### Record word frequency and create word index

In [8]:
# Single pass over the token stream that fills two lookups:
#   word2idx  : token -> integer id, assigned in order of first appearance (ids start at 1)
#   word_freq : token -> number of occurrences in `tokens`
word2idx = {}
word_freq = {}
count_idx = 1  # next unused id; ends up at len(word2idx) + 1

for tok in tokens:
    seen = word_freq.get(tok)
    if seen is None:
        # First occurrence: hand out the next id and start the count at one.
        word2idx[tok] = count_idx
        word_freq[tok] = 1
        count_idx += 1
    else:
        word_freq[tok] = seen + 1

In [9]:
# Vocabulary size: number of distinct tokens that were assigned an index.
print(len(word2idx))

102299


In [10]:
# Reverse lookup table: integer id -> token (the inverse of word2idx).
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [11]:
# Spot check: show which token is stored under id 100.
print(idx2word[100])

left


In [12]:
# Spot check in the other direction: show the id assigned to the token "left".
print(word2idx["left"])

100


## 3. Next up is Generating Training Pairs (word-context pairs)

In [13]:
# Build (center, context) word pairs: every token is paired with each neighbour
# at most `window_size` positions to its left or right.
# NOTE: `tokens` is walked as one continuous sequence, so windows near the end
# of one plot reach into the beginning of the next.
window_size = 2
training_pairs = []
n_tokens = len(tokens)

for center_pos, center_word in enumerate(tokens):
    # Clamp the window to the valid index range instead of skipping
    # out-of-range positions one at a time.
    lo = max(0, center_pos - window_size)
    hi = min(n_tokens, center_pos + window_size + 1)
    training_pairs.extend(
        (center_word, tokens[ctx_pos])
        for ctx_pos in range(lo, hi)
        if ctx_pos != center_pos  # a word is never its own context
    )

In [14]:
# Preview the first 50 (center, context) pairs.
print(training_pairs[:50])

[('a', 'bartender'), ('a', 'is'), ('bartender', 'a'), ('bartender', 'is'), ('bartender', 'working'), ('is', 'a'), ('is', 'bartender'), ('is', 'working'), ('is', 'at'), ('working', 'bartender'), ('working', 'is'), ('working', 'at'), ('working', 'a'), ('at', 'is'), ('at', 'working'), ('at', 'a'), ('at', 'saloon'), ('a', 'working'), ('a', 'at'), ('a', 'saloon'), ('a', 'serving'), ('saloon', 'at'), ('saloon', 'a'), ('saloon', 'serving'), ('saloon', 'drinks'), ('serving', 'a'), ('serving', 'saloon'), ('serving', 'drinks'), ('serving', 'to'), ('drinks', 'saloon'), ('drinks', 'serving'), ('drinks', 'to'), ('drinks', 'customers'), ('to', 'serving'), ('to', 'drinks'), ('to', 'customers'), ('to', 'after'), ('customers', 'drinks'), ('customers', 'to'), ('customers', 'after'), ('customers', 'he'), ('after', 'to'), ('after', 'customers'), ('after', 'he'), ('after', 'fills'), ('he', 'customers'), ('he', 'after'), ('he', 'fills'), ('he', 'a'), ('fills', 'after')]
