# Problem: IMDB Dataset 
### Problem class: Knowledge based recommender system
### Problem dataset link: https://bit.ly/33IAohl
### Problem description:
     Create a knowledge-based recommendation system using the IMDB dataset.

### Problem Task:
     Recommend top 5 movies based on user defined criteria.

# Importing libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# Load the dataset into a pandas dataframe

In [3]:
# """movies_metadata.csv: The main Movies Metadata file. 
# Contains information on 45,000 movies featured in the 
# Full MovieLens dataset. Features include posters, backdrops, 
# budget, revenue, release dates, languages, production countries 
# and companies.
# """

In [4]:
# Read the movies metadata CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/movies_metadata.csv")

# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# Show all the features (columns) of the dataset
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
# Only keep the features that we need.
# The explicit .copy() makes df an independent frame rather than a slice of the
# loaded data, so the column assignments in the following cells do not trigger
# pandas' SettingWithCopyWarning (which the global warnings filter would hide).
df = df[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']].copy()

df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [7]:
# Convert release_date into pandas datetime format; values that cannot be parsed become NaT
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Extract the year from the datetime. The .dt accessor leaves missing dates as NaN;
# an element-wise `x != np.nan` test cannot detect them because NaN never compares
# equal to anything, so such a check is always True.
df['year'] = df['release_date'].dt.year

In [8]:
# Helper function to convert NaT/NaN (or any other non-numeric value) to 0 and all other years to integers
def convert_int(x):
    """Return ``x`` converted to ``int``, or ``0`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a year as a string or float, or a missing marker.

    Returns
    -------
    int
        ``int(x)`` on success, otherwise ``0``.
    """
    try:
        return int(x)
    # Catch only conversion failures: ValueError (e.g. 'NaT', float NaN),
    # TypeError (e.g. None) and OverflowError (infinity). A bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (ValueError, TypeError, OverflowError):
        return 0

# Run every entry of the year feature through convert_int
df['year'] = df['year'].map(convert_int)

In [9]:
# Only the year column is needed from here on, so release_date can be dropped from the df
df = df.drop(columns=['release_date'])

# Display the df
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [10]:
# Look at the raw genres value stored for the first movie
df['genres'].iloc[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [11]:
# Import the literal_eval function from ast
from ast import literal_eval

# Define a stringified list and output its type
a = "[1,2,3]"
print(type(a))

# Apply literal_eval and output the type of the result
b = literal_eval(a)
type(b)

<class 'str'>


list

In [12]:
# Turn the stringified genres into plain lists of genre names, in three steps:
#   1. replace NaN with a stringified empty list,
#   2. parse each string into a Python object with literal_eval,
#   3. keep only the 'name' of every dictionary (anything that is not a list becomes []).
df['genres'] = (
    df['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else [])
)

df.head()


Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [13]:
# We need to create redundant rows so that every genre of a movie is shown individually.
# Series.explode emits one entry per list element and repeats the index of the source
# row, which avoids building a separate pd.Series for every row of df.
# Empty genre lists explode to NaN; dropping those keeps only real genre entries in s.
s = df['genres'].explode().dropna()

# Name the new feature "genre"
s.name = 'genre'

# Create a new df, gen_df, by dropping the old 'genres' feature and adding the new 'genre'.
# join() is a left join on the index, so movies without any genre are kept with a missing 'genre'.
gen_df = df.drop('genres', axis=1).join(s)

# Print the head of the new gen_df
gen_df.head()

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


In [14]:
# (rows, columns) of the exploded frame
gen_df.shape

(93548, 6)

In [15]:
# TODO: Ask the user for the genres of movies he/she is looking for
# TODO: Ask the user for the duration
# TODO: Ask the user for the timeline of the movies recommended
# TODO: Using the information collected, recommend movies to the user that have a high weighted rating (according to the IMDB formula) and that satisfy the preceding conditions

In [16]:
# Creating the recommender
def _rank_movies(gen_df, genre, low_time, high_time, low_year, high_year, percentile):
    """Filter gen_df by the given criteria and rank the matches by weighted rating."""
    # Compare genres case-insensitively: title-casing the user's text would turn a
    # genre spelled like 'TV Movie' into 'Tv Movie', which can never match.
    wanted = genre.strip().lower()

    # Both range checks are inclusive; rows with a missing genre, runtime or year
    # evaluate to False and are filtered out.
    selected = (
        (gen_df['genre'].str.lower() == wanted)
        & gen_df['runtime'].between(low_time, high_time)
        & gen_df['year'].between(low_year, high_year)
    )
    movies = gen_df[selected].copy()

    # C: mean rating of the filtered movies, m: vote-count cutoff at the given percentile
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    # Only consider movies that have at least m votes
    q_movies = movies[movies['vote_count'] >= m].copy()

    # Weighted rating, computed column-wise. Unlike a row-wise apply, this also works
    # when nothing matched: the result is an empty frame that still has a 'score' column.
    v = q_movies['vote_count']
    R = q_movies['vote_average']
    q_movies['score'] = (v / (v + m) * R) + (m / (m + v) * C)

    # Sort movies in descending order of their scores
    return q_movies.sort_values('score', ascending=False)


def build_chart(gen_df: pd.DataFrame, percentile: float=0.8) -> pd.DataFrame:
    """Ask the user for their criteria and return the matching movies ranked by score.

    The user is prompted for a genre, a runtime range and a release-year range
    (all bounds inclusive). Matching movies whose vote count reaches the
    ``percentile`` quantile of the matches are scored with the weighted rating
    ``v/(v+m) * R + m/(v+m) * C`` (v = vote_count, R = vote_average,
    m = vote-count cutoff, C = mean vote_average of the matches).

    Parameters
    ----------
    gen_df : pd.DataFrame
        Frame with one row per (movie, genre); the columns 'genre', 'runtime',
        'year', 'vote_average' and 'vote_count' are read.
    percentile : float, default 0.8
        Quantile of 'vote_count' (among the matches) used as the cutoff m.

    Returns
    -------
    pd.DataFrame
        The qualifying rows with an added 'score' column, sorted by score in
        descending order. Empty (but still with a 'score' column) if nothing matches.

    Raises
    ------
    ValueError
        If one of the numeric answers cannot be parsed as an integer.
    """
    # Ask for preferred genre
    genre = input("Input preferred genre: ")

    # Ask for lower and upper limit of duration
    low_time = int(input("Input shortest duration: "))
    high_time = int(input("Input longest duration: "))

    # Ask for lower and upper limit of timeline
    low_year = int(input("Input earliest year: "))
    high_year = int(input("Input latest year: "))

    return _rank_movies(gen_df, genre, low_time, high_time, low_year, high_year, percentile)

In [None]:
# Prompt for the user's criteria, build the chart and display the top 5 matches.
build_chart(gen_df).head()

### Export cleaned data

In [17]:
# Save the cleaned (non-exploded) dataframe df as a csv file in the data folder.
# index=False because the index of the DataFrame has no inherent meaning.
# NOTE: the 'genres' column holds Python lists at this point, so it is written in
# stringified form and has to be parsed again (e.g. with literal_eval) after reloading.
df.to_csv("data/metadata_clean.csv", index=False)