In [1]:
import pandas as pd 
import numpy as np 
import random
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [2]:
# Load the movies table; the path is relative to the notebook's working directory.
data = pd.read_csv('movies.csv')

In [3]:
data.shape

(27278, 3)

In [4]:
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [8]:
data.describe()

Unnamed: 0,movieId
count,27278.0
mean,59855.48057
std,44429.314697
min,1.0
25%,6931.25
50%,68068.0
75%,100293.25
max,131262.0


In [9]:
data.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
# Distinct movie ids in first-seen order, as a plain Python list.
movies = data['movieId'].drop_duplicates().tolist()
len(movies)

27278

In [13]:
# Load the user ratings table and show its (rows, columns) size.
rating = pd.read_csv('ratings.csv')
rating.shape

(1048575, 4)

In [14]:
rating.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,1048575.0,1048575.0,1048575.0,1048575.0
mean,3527.086,8648.988,3.529272,1096036000.0
std,2018.424,19100.14,1.051919,159489900.0
min,1.0,1.0,0.5,825499900.0
25%,1813.0,903.0,3.0,965838200.0
50%,3540.0,2143.0,4.0,1099263000.0
75%,5233.0,4641.0,4.0,1217407000.0
max,7120.0,130642.0,5.0,1427764000.0


In [15]:
rating.rating.min()

0.5

In [16]:
rating.rating.max()

5.0

In [17]:
rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [18]:
# Load the user-supplied tags table and show its (rows, columns) size.
tags = pd.read_csv('tags.csv')
tags.shape

(465564, 4)

In [19]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,465564.0,465564.0,465564.0
mean,68712.354263,32627.76292,1298711000.0
std,41877.674053,36080.241157,79208910.0
min,18.0,1.0,1135429000.0
25%,28780.0,2571.0,1245007000.0
50%,70201.0,7373.0,1302291000.0
75%,107322.0,62235.0,1366218000.0
max,138472.0,131258.0,1427771000.0


In [20]:
tags.isna().sum()

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

In [21]:
# Discard every row that has a missing value in any column.
tags = tags.dropna(axis=0, how='any')

In [22]:
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [23]:
# Distinct tag strings in first-seen order (the comparison is case-sensitive).
unique_tags = tags['tag'].drop_duplicates().tolist()
len(unique_tags)

38643

In [24]:
# Boolean mask (one entry per movie): True where the genre string mentions 'Drama'.
drama_movies = data.genres.str.contains('Drama')
data.loc[drama_movies].head()

Unnamed: 0,movieId,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
10,11,"American President, The (1995)",Comedy|Drama|Romance
13,14,Nixon (1995),Drama
15,16,Casino (1995),Crime|Drama
16,17,Sense and Sensibility (1995),Drama|Romance


In [25]:
drama_movies.shape

(27278,)

In [26]:
# Boolean mask (one entry per movie): True where the genre string mentions 'Comedy'.
comedy_movies = data.genres.str.contains('Comedy')
data.loc[comedy_movies].head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
6,7,Sabrina (1995),Comedy|Romance


In [27]:
comedy_movies.shape

(27278,)

In [29]:
# Boolean mask over the tags table: True where the tag text contains the
# substring 'dark' (str.contains is case-sensitive by default).
tag_search = tags.tag.str.contains('dark')
tags.loc[tag_search].head()

Unnamed: 0,userId,movieId,tag,timestamp
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
4,65,592,dark hero,1368150078
21,65,6874,dark hero,1368150079
51,121,778,dark comedy,1300852846


In [31]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [33]:
# Drop the timestamp column, which the analysis below does not use.
# `errors='ignore'` keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError (as `del` would).
rating = rating.drop(columns=['timestamp'], errors='ignore')

In [34]:
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [35]:
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [36]:
# Attach each rating to its movie row. The inner join on movieId keeps only
# ids that appear in both tables.
movie_ratings = pd.merge(data, rating, how='inner', on='movieId')
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5


In [38]:
# Mask of individual ratings strictly above 4.0.
# NOTE(review): the comparison is strict, so ratings of exactly 4.0 are not
# selected here -- confirm that is the intended cut-off.
high_rated = movie_ratings['rating'].gt(4.0)
movie_ratings.loc[high_rated].head(10)

Unnamed: 0,movieId,title,genres,userId,rating
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,5.0
14,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,34,5.0
15,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,39,5.0
19,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0
20,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,59,4.5
24,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,82,5.0
25,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,84,5.0


In [39]:
# Mask of individual ratings strictly below 4.0.
# NOTE(review): the comparison is strict, so ratings of exactly 4.0 are not
# selected here -- confirm that is the intended cut-off.
low_rated = movie_ratings['rating'].lt(4.0)
movie_ratings.loc[low_rated].head(10)

Unnamed: 0,movieId,title,genres,userId,rating
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,16,3.0
10,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22,3.0
13,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,3.0
16,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,47,1.0
23,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,80,3.0
26,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,90,3.5
29,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,96,3.5
34,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,109,3.5
35,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114,3.0
36,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,115,1.5


In [40]:
# Distinct values of the raw `genres` column. Each value is a whole
# pipe-separated combination (e.g. 'Comedy|Romance'), so this counts genre
# combinations, not individual genres.
unique_genre = data['genres'].drop_duplicates().tolist()
len(unique_genre)

1342

In [42]:
# Number of ratings per title, largest first, truncated to the top 15.
ratings_per_title = movie_ratings.groupby('title').size()
most_rated = ratings_per_title.sort_values(ascending=False).head(15)
most_rated

title
Pulp Fiction (1994)                          3498
Forrest Gump (1994)                          3476
Silence of the Lambs, The (1991)             3247
Shawshank Redemption, The (1994)             3216
Jurassic Park (1993)                         3129
Star Wars: Episode IV - A New Hope (1977)    2874
Braveheart (1995)                            2799
Terminator 2: Judgment Day (1991)            2711
Matrix, The (1999)                           2705
Schindler's List (1993)                      2598
Toy Story (1995)                             2569
Fugitive, The (1993)                         2568
Independence Day (a.k.a. ID4) (1996)         2546
Apollo 13 (1995)                             2512
Usual Suspects, The (1995)                   2490
dtype: int64

In [43]:
data[['title','genres']].head()

Unnamed: 0,title,genres
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy


In [44]:
# Extract the release year from titles such as 'Toy Story (1995)'.
# - The pattern is a raw string: '\(' is not a valid escape in a normal string
#   literal and triggers an invalid-escape warning on current Python versions.
# - Only a parenthesised 4-digit number is captured, so a trailing non-year
#   parenthetical can no longer end up in the `year` column.
# - The greedy '.*' prefix makes the LAST such group win, e.g.
#   'Independence Day (a.k.a. ID4) (1996)' -> '1996'.
# Titles without a parenthesised 4-digit group get NaN; values remain strings.
data['year'] = data['title'].str.extract(r'.*\((\d{4})\)', expand=False)
data.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [45]:
def count_word(df, ref_col, liste):
    """Count how often each keyword of `liste` occurs in a pipe-separated column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to scan.
    ref_col : str
        Name of a string column whose cells look like ``'A|B|C'``.
    liste : iterable of str
        Keywords to count. Every keyword appears in the result, with a count
        of 0 if it never occurs.

    Returns
    -------
    keyword_occurences : list of [keyword, count]
        Pairs sorted by descending count (ties keep the order of `liste`).
    keyword_count : dict
        Mapping keyword -> count.

    Notes
    -----
    Keywords found in the column but absent from `liste` are ignored rather
    than raising ``KeyError``. Missing cells (NaN / None / NA) are skipped.
    """
    keyword_count = {keyword: 0 for keyword in liste}

    for keywords in df[ref_col].str.split('|'):
        # A missing cell comes back from .str.split as a scalar null, not a
        # list of tokens, so there is nothing to iterate over.
        if not pd.api.types.is_list_like(keywords):
            continue
        for keyword in keywords:
            # Only count keywords the caller asked for.
            if pd.notnull(keyword) and keyword in keyword_count:
                keyword_count[keyword] += 1

    # sorted() is stable, so equal counts keep their insertion order.
    keyword_occurences = sorted(
        ([keyword, count] for keyword, count in keyword_count.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return keyword_occurences, keyword_count

In [46]:
# Collect the distinct individual genre names by splitting every
# pipe-separated genre string and flattening the pieces into one set.
genre_labels = {
    genre
    for genre_list in data['genres'].str.split('|')
    for genre in genre_list
}