In [1]:
import os
import urllib.request
import zipfile
import pandas as pd

# About the data
This dataset (ml-latest) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 22884377 ratings and 586994 tag applications across 34208 movies. These data were created by 247753 users between January 09, 1995 and January 29, 2016. This dataset was generated on January 29, 2016.

Source: https://grouplens.org/datasets/movielens/

### Download

In [2]:
DOWNLOAD_ROOT = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/"
DATASETS_PATH = os.path.join("Datasets")
FILENAME = "moviedataset.zip"
DOWNLOAD_URL = DOWNLOAD_ROOT + FILENAME

def fetch_data():
    if not os.path.isdir(DATASETS_PATH):
        os.makedirs(DATASETS_PATH)
    zip_path = os.path.join(DATASETS_PATH, FILENAME)
    urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
    with zipfile.ZipFile(zip_path,"r") as zip_ref:
        zip_ref.extractall(DATASETS_PATH)
        
def load_data():
    movies_path = os.path.join(DATASETS_PATH, "ml-latest", "movies.csv")
    ratings_path = os.path.join(DATASETS_PATH, "ml-latest", "ratings.csv")
    movies = pd.read_csv(movies_path)
    ratings = pd.read_csv(ratings_path)
    return movies

In [3]:
# Uncomment to fetch the data:
# fetch_data()

In [4]:
movies_df = load_data()

### Quick review

In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34208 entries, 0 to 34207
Data columns (total 3 columns):
movieId    34208 non-null int64
title      34208 non-null object
genres     34208 non-null object
dtypes: int64(1), object(2)
memory usage: 801.9+ KB


# Preprocessing

In [7]:
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)', expand=False)
movies_df['title'] = movies_df.title.str.replace(r'(\(\d\d\d\d\))', '').apply(lambda x: x.strip())
movies_df['genres'] = movies_df.genres.str.split('|')

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


Every genre should have its own column with 0 or 1 values if film belongs to that genre

In [8]:
moviesWithGenres_df = movies_df.copy()

for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1

moviesWithGenres_df.fillna(0, inplace=True)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Recommendation system

### User's input

Suppose the user has rated these movies:

In [9]:
userInput = [
            {'title':'Toy Story', 'rating':3.5}, # Max rating is 5.0
            {'title':'Jumanji', 'rating':2},
            {'title':'Howl\'s Moving Castle (Hauru no ugoku shiro)', 'rating':4.5},
            {'title':'Spirited Away (Sen to Chihiro no kamikakushi)', 'rating':5},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Toy Story,3.5
1,Jumanji,2.0
2,Howl's Moving Castle (Hauru no ugoku shiro),4.5
3,Spirited Away (Sen to Chihiro no kamikakushi),5.0
4,Pulp Fiction,5.0
5,Akira,4.5


In [10]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'])]

inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('genres', axis=1).drop('year', axis=1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,5618,Spirited Away (Sen to Chihiro no kamikakushi),5.0
5,31658,Howl's Moving Castle (Hauru no ugoku shiro),4.5


In [11]:
movies_df[movies_df['title'].str.contains('Howl\'')]

Unnamed: 0,movieId,title,genres,year
9752,31658,Howl's Moving Castle (Hauru no ugoku shiro),"[Adventure, Animation, Fantasy, Romance]",2004


### User Pprofile

In [12]:
userGenreTable = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'])]
userGenreTable = userGenreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1).reset_index(drop=True)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
inputMovies['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
5    4.5
Name: rating, dtype: float64

In [14]:
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])  # Product of genres and user ratings 
userProfile

Adventure             19.5
Animation             17.5
Children               5.5
Comedy                 8.5
Fantasy               15.0
Romance                4.5
Drama                  5.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [15]:
genreTable = moviesWithGenres_df.set_index('movieId')
genreTable = genreTable.drop('title', 1).drop('genres', 1).drop('year', 1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Recommendation table

In [16]:
recommendations = (genreTable*userProfile).sum(axis=1) / userProfile.sum()

top_20 = recommendations.sort_values(ascending=False)[:20]
top_20

movieId
26093     0.798942
27344     0.788360
2987      0.751323
56152     0.746032
51939     0.746032
32031     0.746032
92348     0.746032
84637     0.746032
108932    0.746032
4306      0.746032
51632     0.746032
673       0.746032
26340     0.746032
130520    0.746032
62956     0.735450
52462     0.735450
47404     0.735450
27155     0.703704
136618    0.703704
6350      0.703704
dtype: float64

In [17]:
movies_df[movies_df['movieId'].isin(top_20.keys())]

Unnamed: 0,movieId,title,genres,year
664,673,Space Jam,"[Adventure, Animation, Children, Comedy, Fanta...",1996
2902,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988
4212,4306,Shrek,"[Adventure, Animation, Children, Comedy, Fanta...",2001
6252,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,"[Action, Adventure, Animation, Children, Fanta...",1986
8605,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama...",1962
8783,26340,"Twelve Tasks of Asterix, The (Les douze travau...","[Action, Adventure, Animation, Children, Comed...",1976
9218,27155,"Batman/Superman Movie, The","[Action, Adventure, Animation, Children, Fanta...",1998
9296,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
9825,32031,Robots,"[Adventure, Animation, Children, Comedy, Fanta...",2005
11230,47404,Mind Game,"[Adventure, Animation, Comedy, Fantasy, Romanc...",2004


# Results review

Recommended films look similar to what I've seen. 

Still, this basic algorithm doesn't guarantee good quality of recommended movies. Because it takes into account only genres.

How it can be improved:
- Count the year of movies. Animation in 2000 is different than animation in 1980.
- Add IMDb ratings to datasets and count it too.
- Add a country of production. Japanese anime and USA animation are slightly different.