In [16]:
import string
from functools import lru_cache

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Load the raw movie-plot CSV (path is relative to the current working directory).
df = pd.read_csv('wiki_movie_plots_deduped.csv')

In [3]:
# Preview the first rows to see which columns the dataset provides.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
# Give the two columns used downstream lower-case names. Rebinding the result
# (instead of inplace=True) avoids mutating the loaded frame in place.
df = df.rename(columns={'Title': 'title', 'Plot': 'plot'})

In [5]:
# Keep only the title and plot columns. The explicit .copy() makes the result an
# independent frame, so the later row drops and column assignments on `df` do
# not trigger pandas' SettingWithCopyWarning.
df = df[['title', 'plot']].copy()

In [6]:
# Discard rows that lack a title or a plot. Rebinding the returned frame
# (instead of inplace=True) avoids an in-place drop on a column slice.
df = df.dropna(subset=['title', 'plot'])

In [7]:
# Lower-case titles and trim surrounding whitespace. assign() returns a new
# frame, so this does not write into a slice of the original data.
df = df.assign(title=df['title'].str.lower().str.strip())

In [8]:
# Individual punctuation characters, for per-character membership tests.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


@lru_cache(maxsize=None)
def _stop_words_for(language):
    """Return the NLTK stop-word list for ``language`` as a frozenset, loaded once."""
    return frozenset(stopwords.words(language))


def _is_punctuation(token):
    """Return True when ``token`` is empty or consists solely of punctuation characters."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def preprocess_text(text):
    """Tokenize, POS-tag and filter one piece of text.

    The input is coerced with ``str()`` and lower-cased before tokenization.
    Tagging runs on the full token list, i.e. before any filtering.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        list[tuple[str, str]]: ``(word, tag)`` pairs for every token that is not
        an English stop word, not punctuation-only and not an all-digit string.
    """
    tokens = word_tokenize(str(text).lower())
    tagged_tokens = pos_tag(tokens)
    # Cached: the stop-word list is not reloaded and rebuilt for every document.
    stop_words = _stop_words_for('english')
    return [
        (word, tag) for word, tag in tagged_tokens
        # Test punctuation character by character: multi-character tokens such
        # as "``", "''", "..." or "--" are not substrings of string.punctuation
        # and would slip through a plain `word not in string.punctuation` check.
        if word not in stop_words and not _is_punctuation(word) and not word.isdigit()
    ]

In [9]:
# Run the tokenizer/tagger/filter over every plot; each entry of
# 'plot_cleaned' is the list of (word, tag) pairs returned for that row.
cleaned_plots = df['plot'].apply(preprocess_text)
df['plot_cleaned'] = cleaned_plots
print("Data has been cleaned.")

Data has been cleaned.


In [10]:
# Word2Vec is fed plain token lists: strip the POS tags and skip plots that
# have no tokens left after cleaning.
corpus = []
for tagged_plot in df['plot_cleaned']:
    if tagged_plot:
        corpus.append([token for token, _tag in tagged_plot])

word2vec_model = Word2Vec(
    sentences=corpus,
    vector_size=150,
    window=5,
    min_count=2,
    workers=4,
)

In [11]:
# Dimensionality of the trained embeddings; also used below to size the Annoy index.
vector_size = word2vec_model.wv.vector_size
def create_document_vector(tokens, model):
    """Average the word vectors of one tokenized document.

    Args:
        tokens: Iterable of ``(word, tag)`` pairs; only the word is used.
        model: Object exposing ``model.wv`` with membership tests, item lookup
            and a ``vector_size`` attribute (e.g. the trained Word2Vec model).

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all in-vocabulary
        words, or an all-zero vector of length ``model.wv.vector_size`` when
        no word of the document is in the vocabulary.
    """
    keyed_vectors = model.wv
    word_vectors = [keyed_vectors[word] for word, _tag in tokens if word in keyed_vectors]
    if not word_vectors:
        # Size the fallback from the model that was actually passed in rather
        # than from a notebook-level global, so the function is self-contained
        # and stays correct for any model.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(word_vectors, axis=0)

In [12]:
# Collapse each cleaned plot into a single averaged embedding; the trained
# model is handed to the helper as an extra positional argument.
df['vector'] = df['plot_cleaned'].apply(create_document_vector, args=(word2vec_model,))
print("Word2Vec model trained and vectors created.")


Word2Vec model trained and vectors created.


In [13]:
# Renumber rows 0..n-1 so the frame's index lines up with the Annoy item ids
# assigned by enumerate() below.
df.reset_index(drop=True, inplace=True)

# Angular-distance index over the plot vectors, built with 10 trees and
# written to disk.
search_index = AnnoyIndex(vector_size, 'angular')
for item_id, plot_vector in enumerate(df['vector']):
    search_index.add_item(item_id, plot_vector)
search_index.build(10)
search_index.save('movie_index.ann')

True

In [14]:
# Persist the trained Word2Vec model; it is loaded back later with Word2Vec.load.
word2vec_model.save("movie_word2vec.model")

In [15]:
# Export just the title and plot columns; rows are written in the same order
# that was used for the Annoy item ids.
final_df = df.loc[:, ['title', 'plot']]
final_df.to_csv('final_movie_data.csv', index=False)

In [17]:
# Reload the Word2Vec model that was saved to disk earlier in this notebook.
custom_model = Word2Vec.load("movie_word2vec.model")
print("- Your custom model loaded successfully.")

- Your custom model loaded successfully.


In [21]:
# Load the pretrained GoogleNews word2vec vectors from the binary file in the
# working directory. Per the load log below this is a 3,000,000 x 300 float32
# matrix and took roughly 45 seconds in the recorded run, so expect this cell
# to be slow and memory-hungry.
google_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

2025-08-30 04:50:48,309 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin
2025-08-30 04:51:33,267 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-08-30T04:51:33.267029', 'gensim': '4.3.3', 'python': '3.12.4 (tags/v3.12.4:8e8a4ba, Jun  6 2024, 19:30:16) [MSC v.1940 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'load_word2vec_format'}


In [22]:
# Reload the exported title/plot table. This rebinds `df`, so the earlier
# 'plot_cleaned' and 'vector' columns are no longer present.
df = pd.read_csv('final_movie_data.csv')

In [23]:
# Length of the combined plot vectors; equals the 300-column width of the
# GoogleNews matrix reported in the load log above.
VECTOR_SIZE = 300

In [24]:
def create_combined_vector(plot_text, google_model, custom_model):
    """Embed a plot by averaging per-word vectors from two models.

    The plot is cleaned with ``preprocess_text`` (defined in an earlier cell).
    For each remaining word the ``google_model`` vector is used when available.
    Otherwise the ``custom_model.wv`` vector is used, right-padded with zeros
    up to ``VECTOR_SIZE``. Words known to neither model are ignored.

    Args:
        plot_text: Raw plot text.
        google_model: Mapping-like word-vector store (``in`` / ``[]``).
        custom_model: Model whose ``wv`` attribute supports ``in`` / ``[]``.

    Returns:
        numpy.ndarray: Mean of the collected vectors, or ``np.zeros(VECTOR_SIZE)``
        when no word was found in either model.
    """
    collected = []
    for token, _tag in preprocess_text(plot_text):
        if token in google_model:
            collected.append(google_model[token])
            continue
        if token in custom_model.wv:
            # Zero-fill the tail so the shorter custom vector reaches VECTOR_SIZE.
            own_vector = custom_model.wv[token]
            missing = VECTOR_SIZE - len(own_vector)
            collected.append(np.pad(own_vector, (0, missing), 'constant'))

    if collected:
        return np.mean(collected, axis=0)
    return np.zeros(VECTOR_SIZE)

In [25]:
# Embed every plot with the combined lookup; both models are forwarded to the
# helper as extra positional arguments.
df['vector'] = df['plot'].apply(create_combined_vector, args=(google_model, custom_model))

In [26]:
# Angular-distance index over the combined plot vectors; item ids are the row
# positions in `df`. Built with 10 trees and written to disk.
search_index = AnnoyIndex(VECTOR_SIZE, 'angular')
for item_id, plot_vector in enumerate(df['vector']):
    search_index.add_item(item_id, plot_vector)
search_index.build(10)
search_index.save('movie_index_combined.ann')

True

In [27]:
# Export just the title and plot columns; rows are written in the same order
# that was used for the combined index's item ids.
final_df = df.loc[:, ['title', 'plot']]
final_df.to_csv('final_movie_data_combined.csv', index=False)