In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from IPython.display import display, Markdown
import sqlite3
print('SQLite version: ', sqlite3.sqlite_version)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

SQLite version:  3.37.2
/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links.csv


The dataset is retrieved from <a href="https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset">this page</a>, the description provided is as follows: </br>
**Context**

These files contain metadata for all 45,000 movies listed in the Full MovieLens Dataset. The dataset consists of movies released on or before July 2017. Data points include cast, crew, plot keywords, budget, revenue, posters, release dates, languages, production companies, countries, TMDB vote counts and vote averages.

This dataset also has files containing 26 million ratings from 270,000 users for all 45,000 movies. Ratings are on a scale of 1-5 and have been obtained from the official GroupLens website.
Content

This dataset consists of the following files:
<ul>
<li>movies_metadata.csv: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.</li>

<li>keywords.csv: Contains the movie plot keywords for our MovieLens movies. Available in the form of a stringified JSON Object.</li>

<li>credits.csv: Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.</li>

<li>links.csv: The file that contains the TMDB and IMDB IDs of all the movies featured in the Full MovieLens dataset.</li>

<li>links_small.csv: Contains the TMDB and IMDB IDs of a small subset of 9,000 movies of the Full Dataset.</li>

<li>ratings_small.csv: The subset of 100,000 ratings from 700 users on 9,000 movies.</li>
</ul>

The NLTK library provides useful tools of text preprocessing, and we'll use it for eliminating stop words (commonly used word with little specificity) and stemming (reducing words to their root):

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data

In [3]:
def format_table(column_names, rows, rows_limit = 10, spacing = 15):
    print([f"{cname : ^{spacing}}" for cname in column_names])
    print('-' * (spacing + 5) * len(column_names))
    rows_num = 0
    for row in rows:
        print([f"{entry : ^{spacing}}" for entry in row])
        rows_num += 1
        if rows_num >= rows_limit: break;

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s\d]', '', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    text_result = ' '.join(words)
    return text_result

<h2>Creating the database</h2>

Having the data in CSV form, we'll just convert them to pandas DataFrame and then to SQL tables:

In [4]:
movies_df = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
keywords_df = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
credits_df = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')
links_df = pd.read_csv('/kaggle/input/the-movies-dataset/links.csv')
links_small_df = pd.read_csv('/kaggle/input/the-movies-dataset/links_small.csv')
ratings_df = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')
ratings_small_df = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')

  movies_df = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')


In [5]:
conn = sqlite3.connect('movie_db')
movies_df.to_sql('movies', conn, if_exists = 'replace', index = False)
keywords_df.to_sql('keywords', conn, if_exists = 'replace', index = False)
credits_df.to_sql('credits', conn, if_exists = 'replace', index = False)
links_df.to_sql('links', conn, if_exists = 'replace', index = False)
links_small_df.to_sql('links_small', conn, if_exists = 'replace', index = False)
ratings_df.to_sql('ratings', conn, if_exists = 'replace', index = False)
ratings_small_df.to_sql('ratings_small', conn, if_exists = 'replace', index = False)

100004

<h2>Basic Recommender</h2>

As a start we build a simple recommender that merely suggest the top rated movies in the database, according to the IMDB weigthed ratio (WR) formula:</br>
$WR = (\frac{\nu}{\nu + m})\cdot R + (\frac{m}{m + \nu})\cdot C$
where:
<ul>
    <li>$\nu$ is the number of votes for the movie.</li>
    <li>m is the minimum number of votes required to be rated.</li>
    <li>R is the average rating of the movie.</li>
    <li>C is the mean vote of all movies.</li>
</ul>
To set the value of m we require the movie vote count to be in the 95% percentile: we fetch all the vote counts sorted in ascending order, so the we just need to select the corresponding index:

In [6]:
sql_movie_count = """
SELECT vote_count
FROM movies 
WHERE vote_count IS NOT NULL
ORDER BY vote_count;
"""
cur = conn.cursor()
cur.execute(sql_movie_count)
rows = cur.fetchall()
cnames = list(map(lambda x : x[0], cur.description))
cur.close()
m = int(rows[int(0.95 * len(rows))][0])
print('Minimum number of votes required: ', m)

Minimum number of votes required:  434


In [7]:
sql_average = """
SELECT ROUND(AVG(vote_average), 2) AS avg_voting
FROM movies
WHERE vote_average IS NOT NULL;
"""
cur = conn.cursor()
cur.execute(sql_average)
rows = cur.fetchall()
cnames = list(map(lambda x : x[0], cur.description))
format_table(cnames, rows)
cur.close()
C = float(rows[0][0])

['  avg_voting   ']
--------------------
['     5.62      ']


The WR can now be easily calculated while querying in parameter, for better readibility we also:
<ul>
    <li>Select only the year of the release date as a substring.</li>
    <li>Round the popularity value to two decimal places.</li>
    <li>Extract the genres only form the *genres* column, which has a complex dictionary format cumbersome to read: to do this we take advantages of the sqlite3 possibility to create custo function to use in SQL queries and using regex. </li>
</ul>

In [8]:
def extract_genre(text):
    pattern = r"'name':\s'([a-zA-Z]+)'"
    matches = re.findall(pattern, text)
    return ', '.join(matches)
    
conn.create_function("extract_genre", 1, extract_genre)

def simple_recommender(conn, m, C, limit = 10):
    params_dict = {'minimum' : m, 'avg_voting' : C}
    cur = conn.cursor()
    sql_top_rated = """
    SELECT title AS Title, 
    extract_genre(genres) AS Genres, 
    SUBSTR(release_date, 1, 4) AS Year,
    vote_count AS Vote_Count, 
    vote_average AS Vote_Average, 
    ROUND(popularity, 2) AS Popularity,
   ROUND((vote_count / (vote_count + :minimum)) * vote_average + (:minimum / (:minimum + vote_count)) * :avg_voting, 2) AS WR
    FROM movies
    WHERE vote_count >= :minimum
    ORDER BY WR DESC;
    """
    cur.execute(sql_top_rated, params_dict)
    rows = cur.fetchall()
    cnames = list(map(lambda x : x[0], cur.description))
    format_table(cnames, rows, rows_limit = limit, spacing = 25)
    cur.close()
    return rows

rows = simple_recommender(conn = conn, m = m, C = C)

['          Title          ', '         Genres          ', '          Year           ', '       Vote_Count        ', '      Vote_Average       ', '       Popularity        ', '           WR            ']
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
['The Shawshank Redemption ', '      Drama, Crime       ', '          1994           ', '         8358.0          ', '           8.5           ', '          51.65          ', '          8.36           ']
['      The Godfather      ', '      Drama, Crime       ', '          1972           ', '         6024.0          ', '           8.5           ', '          41.11          ', '          8.31           ']
['     The Dark Knight     ', 'Drama, Action, Crime, Thriller', '          2008           ', '         12269.0         ', '           8.3           ', '         123.17          

Certainly all good movies, but vastly different in genres, our recommender would be more useul if we could at least choose the genre of movie to recommend: this is easy to accomplish by slightly modifying the simple recommender to include the genre in the conditionals used by WHERE:

In [9]:
print('check')

check


In [10]:
def recommender_with_genre(conn, m, C, genre, limit = 10):
    params_dict = {'minimum' : m, 'avg_voting' : C, 'genre' : genre}
    cur = conn.cursor()
    sql_top_rated = """
    SELECT title AS Title, 
    extract_genre(genres) AS Genres, 
    SUBSTR(release_date, 1, 4) AS Year,
    vote_count AS Vote_Count, 
    vote_average AS Vote_Average, 
    ROUND(popularity, 2) AS Popularity,
   ROUND((vote_count / (vote_count + :minimum)) * vote_average + (:minimum / (:minimum + vote_count)) * :avg_voting, 2) AS WR
    FROM movies
    WHERE (vote_count >= :minimum) AND (Genres LIKE '%'||:genre||'%')
    ORDER BY WR DESC;
    """
    cur.execute(sql_top_rated, params_dict)
    rows = cur.fetchall()
    cnames = list(map(lambda x : x[0], cur.description))
    format_table(cnames, rows, rows_limit = limit, spacing = 25)
    cur.close()
    return rows 

Let's check the top rated comedy movies:

In [11]:
rows = recommender_with_genre(conn = conn, m = m, C = C, genre = 'Comedy') 

['          Title          ', '         Genres          ', '          Year           ', '       Vote_Count        ', '      Vote_Average       ', '       Popularity        ', '           WR            ']
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
['      Forrest Gump       ', ' Comedy, Drama, Romance  ', '          1994           ', '         8147.0          ', '           8.2           ', '          48.31          ', '          8.07           ']
['    Life Is Beautiful    ', '      Comedy, Drama      ', '          1997           ', '         3643.0          ', '           8.3           ', '          39.39          ', '          8.01           ']
['    The Intouchables     ', '      Drama, Comedy      ', '          2011           ', '         5410.0          ', '           8.2           ', '          16.09          ', ' 

The recommender now is more selective!

<h2>Content-Based Recommender</h2>

Content based filtering methods use the attributes of an item to produce suggestions with similar attributes; in our case both the *tagline* and the *overview* columns provide a description/specific information for the movie, so we join them in an single *description* column in a temporary table we'll use for the recommender: this is just a smaller version of the movie table, selected with an ORDER BY clause to ensure we have a consistent order

In [12]:
sql_temp = """
DROP TABLE IF EXISTS smovies;

CREATE TEMPORARY TABLE IF NOT EXISTS smovies AS
SELECT 
m.adult, m.belongs_to_collection, m.budget, m.genres, m.homepage, m.id,
       m.imdb_id, m.original_language, m.original_title, 
       m.popularity, m.poster_path, m.production_companies,
       m.production_countries, m.release_date, m.revenue, m.runtime,
       m.spoken_languages, m.status, 
       lower(m.title) as title, 
       m.video,
       m.vote_average, m.vote_count,
m.tagline || ' ' || m.overview as description
FROM movies AS m 
INNER JOIN links_small AS sm
ON m.id = sm.tmdbId
ORDER BY m.id;
"""
cur = conn.cursor()
cur.executescript(sql_temp)
conn.commit()
cur.close()

To defined similarity between movie we'll use the TF-IDF method (Term Frequency - Inverse Document Frequency): this quantify the importance of a word in a document (in this case the description field relative to the movie ) relative to the collection of all documents.</br>
Roughly speaking, TF-IDF first defines a word importance by how often appear in a specific document, but it's then given a weight inversely proportional to the word frequency across all document: if a word is present in all descriptions is indeed not that characterizing, therefore value and its weight will be suppressed; this will already gives less importance to stop words, but we'll still make some text preprocessing with NLTK

In [13]:
cur = conn.cursor()
cur.execute("SELECT description FROM smovies;")
rows = cur.fetchall()
cur.close()
dscrptns = [row[0] for row in rows]
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')
clean_dscrptns  = np.array([preprocess_text(description) for description in dscrptns])
vector_dscrptns  = tf.fit_transform(clean_dscrptns)
print('vector: ', vector_dscrptns .shape)
simil = linear_kernel(vector_dscrptns, vector_dscrptns)
print('similarity shape: ', simil.shape)

vector:  (9099, 212271)
similarity shape:  (9099, 9099)


Our content based recommerder will just identify the rowid of the movie title, find the corresponding row in the similarity matrix and select the indexes of the most similar movies, we just need to take care of the difference in Python and SQL indexing (Python starts with 0, SQL with 1):

In [14]:
def content_based_recommender(conn, title, num_recommendations = 10):
    cur = conn.cursor()
    sql = "SELECT rowid FROM smovies WHERE title = ?"
    cur.execute(sql, (title.lower(),))
    rows = cur.fetchall()
    cur.close()
    if len(rows) < 1:
        print(f'Movie "{title}" not in database."')
        return 
    movie_idx = int(rows[0][0]) - 1
    distance = sorted(list(enumerate(simil[movie_idx])), reverse = True, key = lambda vector_descriptions : vector_descriptions[1])
    recommendations = []
    for i in range(1,num_recommendations + 1):
        cur = conn.cursor()
        sql = "SELECT original_title FROM smovies WHERE rowid = ?"
        cur.execute(sql, (distance[i][0] + 1,))
        row = cur.fetchall()
        recommendations.append(row[0])
        cur.close()
    format_table(column_names = [f'Recommendations for "{title}"'], rows = recommendations, spacing = 40)

As a test, let's check with a favourite of mine:

In [15]:
content_based_recommender(conn, 'the matrix')

['    Recommendations for "the matrix"    ']
---------------------------------------------
['                Hackers                 ']
['                 Pulse                  ']
['            The Zero Theorem            ']
['            Electric Dreams             ']
['               Supernova                ']
['                 Avatar                 ']
['                Sneakers                ']
['          The Thirteenth Floor          ']
['   Lawnmower Man 2: Beyond Cyberspace   ']
['             The Animatrix              ']


In [16]:
?sorted