# Weekend Movie Trip

Data from https://grouplens.org/datasets/movielens/ 

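Using the MovieLens dataset. Note: the files loaded below contain 9,742 movies and 100,836 ratings, which matches the small `ml-latest-small` release rather than the full 20M dataset.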

In [70]:
import pandas as pd
import matplotlib.pyplot as plt

### Load data

In [71]:
# Movie table (movieId, title, genres); preview the first rows.
movies = pd.read_csv(filepath_or_buffer='../data/movies.csv')
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [72]:
# Print column dtypes and non-null counts for the movie table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 152.3+ KB


In [73]:
# One row per (userId, movieId) rating event; preview the first rows.
ratings = pd.read_csv(filepath_or_buffer='../data/ratings.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [74]:
# Total number of rating rows.
ratings.shape[0]

100836

In [75]:
# Free-text tags that users attached to movies; preview the first rows.
tags = pd.read_csv(filepath_or_buffer="../data/tags.csv")
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [76]:
# Number of distinct tag strings (a missing tag, if any, counts as one value).
tags["tag"].nunique(dropna=False)

1589

### Merge movies and ratings

In [77]:
# Inner join on movieId: each rating row gains its movie's title and genres.
movieratings = movies.merge(ratings, how='inner', on='movieId')
movieratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


### Get the average rating of each movie

In [78]:
# Mean of `rating` per movieId, returned as a two-column frame
# (movieId, 'average rating') with a fresh 0..n-1 index.
avg_ratings = (
    movieratings
    .groupby('movieId', as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'average rating'})
)
avg_ratings.head()

Unnamed: 0,movieId,average rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


Merge the per-movie average rating back onto the merged movie/ratings table

In [79]:
# Attach each movie's mean rating to every one of its rating rows.
movieratings = movieratings.merge(avg_ratings, how='inner', on='movieId')
movieratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,average rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,3.92093
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,3.92093
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,3.92093
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,3.92093
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,3.92093


Drop the unused columns

In [80]:
# Keep only the per-movie columns. Selecting the columns to keep (instead of
# dropping the others in place) makes this cell safe to re-run: a second run
# is a no-op rather than a KeyError on already-removed columns, and the frame
# shown by earlier cells is not mutated behind the reader's back.
movieratings = movieratings[["movieId", "title", "average rating"]].copy()
movieratings.head(5)

Unnamed: 0,movieId,title,average rating
0,1,Toy Story (1995),3.92093
1,1,Toy Story (1995),3.92093
2,1,Toy Story (1995),3.92093
3,1,Toy Story (1995),3.92093
4,1,Toy Story (1995),3.92093


Remove duplicate rows

In [81]:
# Collapse fully identical rows, keeping the first occurrence of each,
# then report the resulting row count and dtypes.
movieratings = movieratings.drop_duplicates(keep='first')
movieratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9724 entries, 0 to 100835
Data columns (total 3 columns):
movieId           9724 non-null int64
title             9724 non-null object
average rating    9724 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 265.9+ KB


In [82]:
# Order movies from lowest to highest average rating and show the bottom five.
movieratings = movieratings.sort_values(by='average rating', ascending=True)
movieratings.head(5)

Unnamed: 0,movieId,title,average rating
78215,26696,Lionheart (1990),0.5
55750,3604,Gypsy (1962),0.5
74447,7312,"Follow Me, Boys! (1966)",0.5
99496,145724,Idaho Transfer (1973),0.5
90885,76030,Case 39 (2009),0.5


### Deal with genres

In [90]:
# Explode the pipe-delimited genre string into one entry per (movie row, genre).
# `str.split(expand=True)` builds the padded wide table in one vectorised call
# (the previous `.apply(pd.Series, 1)` did it row by row, and its positional
# `1` was bound to Series.apply's `convert_dtype`, not an axis). `stack()` then
# turns it into a Series indexed by (movie row, position in the split list);
# the explicit `dropna()` guarantees the padding cells never survive.
genres = movies['genres'].str.split('|', expand=True).stack().dropna()

In [94]:
# Preview the first entries of the stacked genre Series.
genres.head()

0  0    Adventure
   1    Animation
   2     Children
   3       Comedy
   4      Fantasy
dtype: object

In [101]:
# Build a long table with one row per (movie, genre).
genres.name = 'genres'  # becomes the column name in the joined frame

# `genres` is indexed by (movie row, position in the split list). Drop the
# inner level so its index lines up with the row index of `movies`; joining
# the two-level index directly fails on a fresh kernel because the two
# indexes share no level names.
if genres.index.nlevels > 1:
    genre_per_row = genres.reset_index(level=-1, drop=True)
else:
    genre_per_row = genres

# Drop the pipe-delimited column on a copy rather than in place, so `movies`
# keeps its 'genres' column and both this cell and the split cell above can be
# re-run without a KeyError.
moviegenres = movies.drop(columns=["genres"]).join(genre_per_row)

In [102]:
# Preview the long movie/genre table (a movie's row index repeats once per genre).
moviegenres.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


#### Number of movies in each genre

In [104]:
# Count rows per genre label, most frequent first.
moviegenres.loc[:, "genres"].value_counts()

Drama                 4361
Comedy                3756
Thriller              1894
Action                1828
Romance               1596
Adventure             1263
Crime                 1199
Sci-Fi                 980
Horror                 978
Fantasy                779
Children               664
Animation              611
Mystery                573
Documentary            440
War                    382
Musical                334
Western                167
IMAX                   158
Film-Noir               87
(no genres listed)      34
Name: genres, dtype: int64

### Deal with tags

In [106]:
# Keep only the movie id and the tag text. Selecting the wanted columns
# (instead of dropping the others in place) keeps this cell re-runnable:
# a second run is a no-op rather than a KeyError on already-removed columns.
tags = tags[["movieId", "tag"]].copy()
tags.head(3)

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell


In [None]:
# `moviegenres` only carries movieId/title/genres, so aggregating
# "average rating" on it directly raises. Bring the per-movie average in from
# `movieratings` first; `validate` asserts that table has one row per movieId.
# Movies without any rating are absent from `movieratings` and drop out of the
# inner join.
# NOTE(review): "sum" is kept from the original cell, but it adds up per-movie
# averages and so mostly tracks genre size; "mean" is probably what is wanted
# for comparing genres -- confirm intent.
(
    moviegenres
    .merge(
        movieratings[["movieId", "average rating"]],
        on="movieId",
        validate="many_to_one",
    )
    .groupby("genres")
    .agg({"average rating": "sum"})
)