# Developing a Movie Recommendation System using spaCy and NLP

## Getting Started
To begin, ensure you have spaCy installed in your environment:

In [2]:
# !pip install spacy
!pip3 install spacy



In [1]:
# !python -m spacy download en_core_web_md
# !python3 -m spacy download en_core_web_md
# !python3 -m spacy download en_core_web_lg

# Phase 1: Data Preparation

First, download the dataset from Kaggle, specifically the `movies_metadata.csv` file.

## Loading the Dataset

In [3]:
import pandas as pd
# Load the dataset
movies_metadata = pd.read_csv('./dataset/movies_metadata.csv', low_memory=False)


In [4]:
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [6]:
movies_metadata.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

## Initial Data Cleaning

In [7]:
# Selecting relevant columns and removing missing values
movies_metadata = movies_metadata[['title', 'overview']].dropna()

In [8]:
movies_metadata.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


# Phase 2: NLP Preprocessing

We'll use spaCy for text processing to prepare our movie descriptions.

## Preprocessing Function

In [9]:
import spacy
nlp = spacy.load('en_core_web_lg')

def preprocess(text):
    doc = nlp(text)
    return ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct])

# Function to convert preprocessed text to a SpaCy doc (which inherently contains a vector)
def get_doc(text):
    return nlp(text)

## Applying Preprocessing

In [10]:
# This takes many minutes depending on internet speed (rather run the next cell)
# movies_metadata['processed_overview'] = movies_metadata['overview'].apply(preprocess)
# movies_metadata['doc'] = movies_metadata['processed_overview'].apply(get_doc)

In [11]:
# Only use a small subset of the data

movies_metadata = movies_metadata.sample(5000, random_state=42)

movies_metadata.head()

Unnamed: 0,title,overview
18072,Undertow,An unusual ghost story set on the Peruvian sea...
5076,Burial Ground,Professor Ayres discovers a secret in an ancie...
40871,Diggers,A couple hires a professional digger (undergro...
7695,College,"To reconcile with his girlfriend, a bookish co..."
24381,Charlie Chan at Treasure Island,Charlie's investigation of a phony psychic dur...


In [12]:
# A bit quicker with less accuracy

movies_metadata['processed_overview'] = movies_metadata['overview'].apply(preprocess)
movies_metadata['doc'] = movies_metadata['processed_overview'].apply(get_doc)

In [13]:
movies_metadata.head()

Unnamed: 0,title,overview,processed_overview,doc
18072,Undertow,An unusual ghost story set on the Peruvian sea...,unusual ghost story set peruvian seaside marri...,"(unusual, ghost, story, set, peruvian, seaside..."
5076,Burial Ground,Professor Ayres discovers a secret in an ancie...,professor ayres discover secret ancient stone ...,"(professor, ayres, discover, secret, ancient, ..."
40871,Diggers,A couple hires a professional digger (undergro...,couple hire professional digger underground st...,"(couple, hire, professional, digger, undergrou..."
7695,College,"To reconcile with his girlfriend, a bookish co...",reconcile girlfriend bookish college student t...,"(reconcile, girlfriend, bookish, college, stud..."
24381,Charlie Chan at Treasure Island,Charlie's investigation of a phony psychic dur...,charlie investigation phony psychic 1939 world...,"(charlie, investigation, phony, psychic, 1939,..."


# Phase 3: Building the Recommendation System

## Recommendation Function

In [14]:
# Recommendation function using SpaCy's similarity
def recommend(input_description, n_recommendations=5):
    preprocessed_input = preprocess(input_description)
    input_doc = nlp(preprocessed_input)
    
    # Calculate similarity scores
    movies_metadata['similarity'] = movies_metadata['doc'].apply(lambda doc: input_doc.similarity(doc))
    
    # Sort movies by similarity score in descending order
    recommendations = movies_metadata[['title', 'similarity']].sort_values(by='similarity', ascending=False).head(n_recommendations)
    
    return recommendations

# Phase 4: Evaluation and Refinement

In [15]:
# Get recommendations based on a few movie descriptions
recommend("A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world.")

  movies_metadata['similarity'] = movies_metadata['doc'].apply(lambda doc: input_doc.similarity(doc))


Unnamed: 0,title,similarity
42901,Power Rangers,0.894084
28736,8 Minutes Idle,0.885651
40272,Refuge,0.884131
34981,A Few Dollars for Django,0.879359
32048,Tiny Times,0.877829


In [16]:
# Get another recommendation
recommend("A young lion prince is cast out of his pride by his cruel uncle, who claims he killed his father. While the uncle rules with an iron paw, the prince grows up beyond the Savannah, living by a philosophy: No worries for the rest of your days.")

  movies_metadata['similarity'] = movies_metadata['doc'].apply(lambda doc: input_doc.similarity(doc))


Unnamed: 0,title,similarity
22724,Samson and Delilah,0.878149
5180,Behind the Sun,0.87002
16827,Red Riding Hood,0.858906
42075,The Salt Prince,0.857916
12263,Robin Hood,0.854068


In [17]:
# Get another recommendation
recommend("A superhero from Krypton is sent to Earth to protect it from evil forces.")

  movies_metadata['similarity'] = movies_metadata['doc'].apply(lambda doc: input_doc.similarity(doc))


Unnamed: 0,title,similarity
23951,Age of Tomorrow,0.82977
15006,Clash of the Titans,0.821896
6760,Captain Kronos: Vampire Hunter,0.804156
19400,Deathstalker II,0.801183
23347,Atomic Rulers,0.79173


In [21]:
desired_overview = "A superhero from Krypton is sent to Earth to protect it from evil forces."
do_doc = nlp(preprocess(desired_overview))

overview2 = "A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world."
do_doc2 = nlp(preprocess(overview2))

In [22]:
do_doc.similarity(do_doc2)

0.701706649875278