# Data Exploration

## Importing the necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the four CSV tables from the local data/ directory into DataFrames.
# The tuple order below fixes which file ends up in which variable.
movies, ratings, links, tags = map(
    pd.read_csv,
    ('data/movies.csv', 'data/ratings.csv', 'data/links.csv', 'data/tags.csv'),
)

In [4]:
# Preview the first five rows of the movies table (movieId, title, genres).
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Preview the first five rows of the links table (movieId, imdbId, tmdbId).
links.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
# Preview the first five rows of the tags table (userId, movieId, tag, timestamp).
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


After taking a look at the datasets above, I will use the 'movies' and 'ratings' datasets in this notebook to build the recommendation system model.

In [8]:
# How many distinct users and distinct movies appear in the ratings table?
# dropna=False keeps the count identical to len(Series.unique()), which would
# also count a missing value as one distinct entry.
print(ratings['userId'].nunique(dropna=False), 'Number of user ids')
print(ratings['movieId'].nunique(dropna=False), 'Number of movie ids')

610 Number of user ids
9724 Number of movie ids


## Merging the movies and ratings dataframes

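First, I'll merge the 'movies' and 'ratings' dataframes on their shared movie id so that every rating carries its movie title and genres. I'll then pivot the ratings so that the movie ids form the index and the user ids make up the columns, with the movie ratings as the values. I'll also replace the NaN values with 0.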

In [9]:
# Join every rating to its movie's title and genres. movieId is the only column
# the two tables have in common, so it is named explicitly as the join key.
# The timestamp column is not needed afterwards and is dropped.
movie_ratings = (
    movies
    .merge(ratings, on='movieId')
    .drop(columns='timestamp')
)
movie_ratings.head(5)

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [55]:
# Average rating per title, ordered from highest to lowest, with the first
# 1000 entries of that ranking skipped (positional slice).
(
    movie_ratings
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .iloc[1000:]
)

title
Game of Death (1978)                             4.166667
High Sierra (1941)                               4.166667
Duellists, The (1977)                            4.166667
Dangerous Beauty (1998)                          4.166667
Sacrifice, The (Offret - Sacraficatio) (1986)    4.166667
                                                   ...   
The Beast of Hollow Mountain (1956)              0.500000
Follow Me, Boys! (1966)                          0.500000
The Butterfly Effect 3: Revelations (2009)       0.500000
The Emoji Movie (2017)                           0.500000
Rust and Bone (De rouille et d'os) (2012)        0.500000
Name: rating, Length: 8719, dtype: float64

In [40]:
# Show the last five rows of the merged frame.
movie_ratings.tail(5)

Unnamed: 0,movieId,title,genres,userId,rating
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100833,193585,Flint (2017),Drama,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5
100835,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,331,4.0


In [59]:
# Remove the trailing release year from each title, e.g. "Toy Story (1995)" -> "Toy Story".
#
# An end-anchored regex is used instead of slicing off the last six characters because:
#   * it also removes the whitespace before the parenthesis (slicing left "Toy Story "),
#   * any title that does not end in a "(YYYY)" suffix is left untouched rather than truncated,
#   * it is idempotent - re-running this cell no longer chops further characters off
#     the already-cleaned titles.
# Bracket assignment is used because it is the documented way to set a column.
movie_ratings['title'] = movie_ratings['title'].str.replace(
    r'\s*\(\d{4}\)\s*$', '', regex=True
)

In [60]:
# Inspect the merged frame after the release year was removed from the titles
# (pandas truncates the display to the first and last rows).
movie_ratings

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,184,4.0
100832,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,184,3.5
100833,193585,Flint,Drama,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,184,3.5
