In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
movies = pd.read_csv('Data/movie.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [4]:
movies.describe()

Unnamed: 0,movieId
count,27278.0
mean,59855.48057
std,44429.314697
min,1.0
25%,6931.25
50%,68068.0
75%,100293.25
max,131262.0


In [5]:
movies.isna()

Unnamed: 0,movieId,title,genres
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
27273,False,False,False
27274,False,False,False
27275,False,False,False
27276,False,False,False


In [6]:
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [7]:
movies['genres'].value_counts()

genres
[Drama]                                                        4520
[Comedy]                                                       2294
[Documentary]                                                  1942
[Comedy, Drama]                                                1264
[Drama, Romance]                                               1075
                                                               ... 
[Action, Comedy, Crime, Western]                                  1
[Action, Fantasy, Sci-Fi, Thriller]                               1
[Action, Drama, Fantasy, Horror, Mystery, Sci-Fi, Thriller]       1
[Action, Drama, Fantasy, Romance]                                 1
[Animation, Children, Comedy, Western]                            1
Name: count, Length: 1342, dtype: int64

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

In [9]:
genres_df = mlb.fit_transform(movies['genres'])

In [10]:
mlb.classes_

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype=object)

In [11]:
genres_df = pd.DataFrame(genres_df, columns=mlb.classes_)
genres_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
genres_df.describe()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
count,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0
mean,0.009018,0.129042,0.08538,0.037649,0.041755,0.306987,0.107743,0.090586,0.489185,0.051763,0.012098,0.095718,0.007185,0.037979,0.055503,0.151294,0.063898,0.153164,0.043772,0.024782
std,0.094537,0.335252,0.279452,0.19035,0.200033,0.461253,0.310061,0.287024,0.499892,0.221553,0.109324,0.29421,0.084462,0.19115,0.228963,0.358342,0.244575,0.360152,0.20459,0.155463
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
genres_df[genres_df['(no genres listed)'] == 1].count()

(no genres listed)    246
Action                246
Adventure             246
Animation             246
Children              246
Comedy                246
Crime                 246
Documentary           246
Drama                 246
Fantasy               246
Film-Noir             246
Horror                246
IMAX                  246
Musical               246
Mystery               246
Romance               246
Sci-Fi                246
Thriller              246
War                   246
Western               246
dtype: int64

In [14]:
movies = pd.concat([movies, genres_df], axis=1)

In [15]:
movies.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
movies[movies['Action'] == 1]

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
5,6,Heat (1995),"[Action, Crime, Thriller]",0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
8,9,Sudden Death (1995),[Action],0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]",0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14,15,Cutthroat Island (1995),"[Action, Adventure, Romance]",0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
19,20,Money Train (1995),"[Action, Comedy, Crime, Drama, Thriller]",0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27168,130842,Power/Rangers (2015),"[Action, Adventure, Sci-Fi]",0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
27187,130984,Santo vs. las lobas (1976),"[Action, Fantasy, Horror]",0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
27198,131025,The Brass Legend (1956),[Action],0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27236,131122,Love Exposure (2007),"[Action, Comedy, Drama, Romance]",0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [17]:
ratings = pd.read_csv('Data/rating.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [18]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [19]:
movies.describe()

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
count,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,...,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0,27278.0
mean,59855.48057,0.009018,0.129042,0.08538,0.037649,0.041755,0.306987,0.107743,0.090586,0.489185,...,0.012098,0.095718,0.007185,0.037979,0.055503,0.151294,0.063898,0.153164,0.043772,0.024782
std,44429.314697,0.094537,0.335252,0.279452,0.19035,0.200033,0.461253,0.310061,0.287024,0.499892,...,0.109324,0.29421,0.084462,0.19115,0.228963,0.358342,0.244575,0.360152,0.20459,0.155463
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6931.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,68068.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,100293.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,131262.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
movies_and_ratings = pd.concat([movies, ratings], axis=1).reindex(movies.index)
movies_and_ratings.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,movieId.1,rating,timestamp
0,1.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,2,3.5,2005-04-02 23:53:47
1,2.0,Jumanji (1995),"[Adventure, Children, Fantasy]",0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,29,3.5,2005-04-02 23:31:16
2,3.0,Grumpier Old Men (1995),"[Comedy, Romance]",0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1,32,3.5,2005-04-02 23:33:39
3,4.0,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1,47,3.5,2005-04-02 23:32:07
4,5.0,Father of the Bride Part II (1995),[Comedy],0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,50,3.5,2005-04-02 23:29:40


In [29]:
movies_and_ratings[(movies_and_ratings['rating'] == 5.0) & (movies_and_ratings['Action'] == 1)]

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,movieId.1,rating,timestamp
158,160.0,Congo (1995),"[Action, Adventure, Mystery, Sci-Fi]",0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1,7153,5.0,2005-04-02 23:30:33
170,172.0,Johnny Mnemonic (1995),"[Action, Sci-Fi, Thriller]",0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,8507,5.0,2004-09-10 03:13:47
183,185.0,"Net, The (1995)","[Action, Crime, Thriller]",0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2,480,5.0,2000-11-21 15:32:00
196,198.0,Strange Days (1995),"[Action, Crime, Drama, Mystery, Sci-Fi, Thriller]",0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,2,1327,5.0,2000-11-21 15:34:06
233,236.0,French Kiss (1995),"[Action, Comedy, Romance]",0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2,3930,5.0,2000-11-21 15:35:43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26356,126723.0,Kenny Begins (2009),"[Action, Comedy, Sci-Fi]",0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,208,2336,5.0,1999-11-21 21:03:03
26551,127264.0,The Hunchback of Paris (1959),"[Action, Adventure]",0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,208,3359,5.0,2000-07-22 13:35:34
27161,130768.0,Chain of Command (2000),"[Action, Thriller]",0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,209,246,5.0,2011-03-20 03:53:21
27168,130842.0,Power/Rangers (2015),"[Action, Adventure, Sci-Fi]",0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,209,628,5.0,2011-03-20 04:02:39
