In [1]:
import pandas as pd
import json
import os
import transformers
import sklearn
import csv

In [None]:
# Read data/movies.csv into a DataFrame and preview its first rows.
movies = pd.read_csv(filepath_or_buffer='data/movies.csv')
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Extract json movie descriptions

all_data = []

# Visit every .json file in the extraction directory. The listing is sorted
# so the load order does not depend on the filesystem.
for filename in sorted(os.listdir('extracted_content_ml-latest')):
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join('extracted_content_ml-latest', filename)
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Unreadable file: report it and move on to the next one.
            print(f"Error decoding JSON from file: {filepath}")
            continue

    # Load only Movielens data: the document must carry a 'movielensId'.
    if not isinstance(data, dict) or 'movielensId' not in data:
        continue

    # Guard the key that is actually read, so a document that has an id but
    # no 'movielens' section is reported instead of raising KeyError.
    if 'movielens' not in data:
        print(f"Missing 'movielens' section in file: {filepath}")
        continue

    all_data.append(data['movielens'])
                

## Data preprocessing

In [4]:
# Cleaning up the data

# Columns the analysis does not use.
columns_to_drop = {'dvdReleaseDate', 'imdbMovieId',
                   'youtubeTrailerIds', 'posterPath', 'numRatings'}

# Build the table in one pass: index by movieId, order by that index,
# then discard the unused columns.
movie_content_df = (
    pd.DataFrame(all_data)
    .set_index('movieId')
    .sort_index()
    .drop(columns=columns_to_drop)
)

movie_content_df.head()

Unnamed: 0_level_0,languages,releaseDate,directors,runtime,title,mpaa,actors,originalTitle,genres,plotSummary,tmdbMovieId,avgRating,releaseYear
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,[English],1995-10-30,[John Lasseter],81.0,Toy Story,G,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",862,3.88694,1995
2,"[English, Français]",1995-12-15,[Joe Johnston],104.0,Jumanji,PG,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,8844,3.24265,1995
3,[English],1995-12-22,[Howard Deutch],101.0,Grumpier Old Men,PG-13,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,15602,3.17358,1995
4,[English],1995-12-22,[Forest Whitaker],127.0,Waiting to Exhale,R,"[Whitney Houston, Angela Bassett, Loretta Devi...",Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",31357,2.87513,1995
5,[English],1995-02-10,[Charles Shyer],106.0,Father of the Bride Part II,PG,"[Steve Martin, Diane Keaton, Martin Short, Kim...",Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,11862,3.07819,1995


In [10]:
# Persist the cleaned metadata table; QUOTE_NONNUMERIC quotes every
# non-numeric field in the written file.
movie_content_df.to_csv(
    path_or_buf='movie_content_df.csv',
    quoting=csv.QUOTE_NONNUMERIC,
)

### Extract embeddings from the plotSummary column

In [5]:
# Plot summary of every movie, with missing summaries replaced by an
# empty string.
plot_summary = movie_content_df['plotSummary'].fillna('')

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling of the token embeddings (used instead of the [CLS] token vector)
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the mask.

    Args:
        model_output: Indexable model result; element 0 holds the token
            embeddings.
        attention_mask: Tensor that is given a trailing axis, expanded to
            the size of the token embeddings and used as a multiplicative
            weight (presumably 1 for real tokens and 0 for padding).

    Returns:
        The mask-weighted sum over dimension 1 divided by the summed mask.
        The divisor is clamped to at least 1e-9, so a fully masked row
        yields zeros rather than a division by zero.
    """
    per_token = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()

    weighted_sum = (per_token * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total


# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the model from the HuggingFace Hub, then move the
# model to the selected device.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = model.to(device)



In [17]:
# Texts to embed: one plot summary per movie
sentences = plot_summary.to_list()
batch_size = 16
all_sentences_embeddings = []

# Embed the summaries one fixed-size slice at a time
for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i+batch_size]

    # Tokenize the slice (padded and truncated), then move each tensor to
    # the same device as the model
    tokenized = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = {name: tensor.to(device) for name, tensor in tokenized.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool the token embeddings into one vector per summary and scale each
    # vector to unit L2 norm
    pooled = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(pooled, p=2, dim=1).cpu()

    # Keep the CPU copy for concatenation after the loop
    all_sentences_embeddings.append(sentence_embeddings)





In [18]:
# Stack the per-batch tensors along dimension 0 and convert to NumPy.
all_embeddings = torch.cat(all_sentences_embeddings, dim=0)
embeddings_np = all_embeddings.numpy()

# One row per movie, labelled with the movie IDs from the metadata table.
embedding_df = pd.DataFrame(data=embeddings_np, index=movie_content_df.index)

In [None]:
# Write the embedding table to disk so later cells can reload it
# without re-running the model.
embedding_df.to_csv(path_or_buf='movie_plot_embeddings.csv')

### Compute the similarity between the movie plot embeddings

In [6]:
# Reload the saved plot embeddings and key them by movieId.
plot_embeddings_df = pd.read_csv('movie_plot_embeddings.csv').set_index('movieId')

In [None]:
# Lookup table from the index labels of movie_content_df to the value in
# its 'tmdbMovieId' column.
id_mapping = movie_content_df['tmdbMovieId'].to_dict()

# DataFrame.copy must be *called*. Without the parentheses the name is bound
# to the method object, and the index assignment below raises AttributeError.
tmdb_embeddings_df = plot_embeddings_df.copy()

# Re-key the copied embeddings by TMDB id. Labels absent from id_mapping are
# looked up with dict.get and therefore map to None rather than raising.
tmdb_embeddings_df.index = tmdb_embeddings_df.index.map(id_mapping.get)

{1: 862,
 2: 8844,
 3: 15602,
 4: 31357,
 5: 11862,
 6: 949,
 7: 11860,
 8: 45325,
 9: 9091,
 10: 710,
 11: 9087,
 12: 12110,
 13: 21032,
 14: 10858,
 15: 1408,
 16: 524,
 17: 4584,
 18: 5,
 19: 9273,
 20: 11517,
 21: 8012,
 22: 1710,
 23: 9691,
 24: 12665,
 25: 451,
 26: 16420,
 27: 9263,
 28: 17015,
 29: 902,
 30: 37557,
 31: 9909,
 32: 63,
 33: 78802,
 34: 9598,
 35: 47018,
 36: 687,
 37: 139405,
 38: 33689,
 39: 9603,
 40: 34615,
 41: 31174,
 42: 11443,
 43: 35196,
 44: 9312,
 45: 577,
 46: 11861,
 47: 807,
 48: 10530,
 49: 8391,
 50: 629,
 51: 117164,
 52: 11448,
 53: 49133,
 54: 26441,
 55: 97406,
 56: 124057,
 57: 9089,
 58: 11010,
 59: 99040,
 60: 11359,
 61: 17182,
 62: 2054,
 63: 10607,
 64: 19760,
 65: 9536,
 66: 11525,
 67: 40628,
 68: 4482,
 69: 10634,
 70: 755,
 71: 11859,
 72: 28387,
 73: 48750,
 74: 20927,
 75: 36929,
 76: 9102,
 77: 124626,
 78: 27526,
 79: 9623,
 80: 46785,
 81: 400,
 82: 880,
 83: 146599,
 84: 188588,
 85: 8447,
 86: 10534,
 87: 17414,
 88: 13997,
 8

In [12]:
# Preview the first five rows of the reloaded embedding table.
plot_embeddings_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.063439,0.001027,0.09321,-0.014942,-0.0064,0.015289,0.123085,-0.030446,-0.03718,0.021572,...,0.025831,-0.101288,-0.070295,-0.001306,-0.022086,-0.007595,0.038765,0.015355,0.044632,0.022023
2,0.086306,0.044615,-0.040496,-0.052533,0.002639,0.07536,0.046538,-0.056587,0.000593,0.045341,...,0.066789,-0.037893,-0.061075,0.062395,-0.01608,0.021236,0.008645,-0.003295,-0.029277,-0.02655
3,-0.100876,0.037442,-0.000925,-0.046489,-0.131993,0.026747,0.016162,-0.015781,-0.022503,-0.101355,...,-0.038098,-0.04149,0.074784,0.088424,0.049767,0.013877,0.022851,0.056832,-0.02618,0.018336
4,-0.055419,-0.014512,0.031433,0.042474,0.051593,-0.005846,0.046883,-0.10101,-0.028232,-0.006597,...,0.012452,-0.066199,-0.058257,0.031873,-0.007548,-0.002243,-0.000803,-0.093868,0.026814,-0.015709
5,-0.031386,-0.069306,0.064619,0.024486,0.021227,-0.016395,0.060222,0.003836,-0.009017,0.00612,...,0.052819,-0.032482,-0.071496,-0.049393,0.040337,0.028984,-0.010484,-0.001167,0.037698,-0.067942


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gc

embedding_matrix = plot_embeddings_df.values
n_samples = embedding_matrix.shape[0]
batch_size = 100  # Adjust this based on your available memory

# Pre-allocate the full (n_samples x n_samples) result, filled block by block
similarity_matrix = np.zeros((n_samples, n_samples))

# (start, stop) row bounds of every block; the last block may be shorter
block_bounds = [
    (start, min(start + batch_size, n_samples))
    for start in range(0, n_samples, batch_size)
]

# Fill the matrix one (row block, column block) tile at a time
for i, end_i in block_bounds:
    batch_i = embedding_matrix[i:end_i]

    for j, end_j in block_bounds:
        batch_j = embedding_matrix[j:end_j]

        # Cosine similarity of every row in batch_i against every row in batch_j
        similarity_matrix[i:end_i, j:end_j] = cosine_similarity(batch_i, batch_j)

    # Run the garbage collector after each row block, then report progress
    _ = gc.collect()
    print(f"Processed {end_i}/{n_samples} rows")

# similarity_matrix now holds the similarity of every pair of rows

Processed 100/1000 rows
Processed 200/1000 rows
Processed 300/1000 rows
Processed 400/1000 rows
Processed 500/1000 rows
Processed 600/1000 rows
Processed 700/1000 rows
Processed 800/1000 rows
Processed 900/1000 rows
Processed 1000/1000 rows
