# Data Loading

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
ratings = pd.read_csv("The movies dataset/ratings_small.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
ratings.shape

(100004, 4)

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [7]:
ratings['movieId'].value_counts()

356      341
296      324
318      311
593      304
260      291
        ... 
95473      1
60674      1
27922      1
1311       1
2047       1
Name: movieId, Length: 9066, dtype: int64

In [8]:
user_distrib = ratings['userId'].value_counts()
print(user_distrib)

547    2391
564    1868
624    1735
15     1700
73     1610
       ... 
444      20
438      20
583      20
249      20
399      20
Name: userId, Length: 671, dtype: int64


In [9]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [10]:
ratings_distrib = ratings['rating'].value_counts()
print(ratings_distrib)

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64


In [11]:
px.bar(ratings_distrib, x=ratings_distrib.index, y="rating")

In [12]:
ratings_per_user = ratings.groupby('userId').agg({'movieId': 'count', 'rating': 'mean'})
ratings_per_user = ratings_per_user.rename(columns={'movieId': 'movieId count', 'rating': 'average rating'}).sort_values("movieId count", ascending=False)
ratings_per_user.head()

Unnamed: 0_level_0,movieId count,average rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
547,2391,3.366792
564,1868,3.552463
624,1735,2.894236
15,1700,2.621765
73,1610,3.374224


In [13]:
px.bar(ratings_per_user, x=ratings_per_user.index, y='movieId count')

In [14]:
px.histogram(ratings_per_user, x='average rating')

In [15]:
rating_sorted_avg = ratings_per_user.sort_values('average rating')
display(rating_sorted_avg.head())
display(rating_sorted_avg.tail()) 

Unnamed: 0_level_0,movieId count,average rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
579,21,1.333333
581,49,1.459184
207,46,1.804348
609,140,1.985714
429,27,2.240741


Unnamed: 0_level_0,movieId count,average rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
622,31,4.725806
448,20,4.75
298,75,4.8
443,40,4.85
46,39,4.948718


## Remove all users with average ratings <2.5 and >4.5

In [16]:
easy_raters = rating_sorted_avg['average rating'] >4.5 
index = rating_sorted_avg.index
user_to_remove = list(index[easy_raters])

tough_raters = rating_sorted_avg['average rating'] <2.5 
user_to_remove.extend(index[tough_raters])
print(user_to_remove)

[40, 656, 287, 89, 446, 113, 622, 448, 298, 443, 46, 579, 581, 207, 609, 429, 133, 35, 315]


In [17]:
cleaned_df = ratings[~ratings['userId'].isin(user_to_remove)]
cleaned_df.shape

(98720, 4)

## Manage users with huge numbers of reviews

In [18]:
avg_nb_ratings = user_distrib.mean()
print('Average number of ratings per user', avg_nb_ratings)
x = 3
binge_watchers = user_distrib.index[user_distrib> x*avg_nb_ratings]
print(f'Number of Users who rated more than {x} time the average', len(binge_watchers))


Average number of ratings per user 149.03725782414307
Number of Users who rated more than 3 time the average 42


In [19]:
binge_watchers_ratings = cleaned_df[cleaned_df['userId'].isin(binge_watchers)]
normal_users_ratings = cleaned_df[~cleaned_df['userId'].isin(binge_watchers)]
print(binge_watchers_ratings.shape)
normal_users_ratings.shape

(35972, 4)


(62748, 4)

In [20]:
ratings_keep = binge_watchers_ratings.groupby("userId").sample(n=int(x*avg_nb_ratings))
ratings_keep.shape    

(18774, 4)

In [21]:
ratings_keep['userId'].value_counts()[:5]

48     447
457    447
119    447
311    447
56     447
Name: userId, dtype: int64

In [22]:
cleaned_df = pd.concat([normal_users_ratings, ratings_keep])
print(cleaned_df.shape)
cleaned_df.head()

(81522, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Checking that we don't have any of the previous behaviours

In [23]:
ratings_per_user = cleaned_df.groupby('userId').agg({'movieId': 'count', 'rating': 'mean'})
ratings_per_user = ratings_per_user.rename(columns={'movieId': 'movieId count', 'rating': 'average rating'}).sort_values("movieId count", ascending=False)
px.bar(ratings_per_user, x=ratings_per_user.index, y='movieId count')

In [24]:
px.histogram(ratings_per_user, x='average rating')

In [25]:
cleaned_df.to_csv('The movies dataset/cleaned_dataset.csv')

# Add Genre

In [26]:
movies_meta = pd.read_csv("The movies dataset/ml-latest/movies.csv")
movies_meta.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
ratings_with_name = pd.merge(cleaned_df, movies_meta, how='left', on='movieId')
ratings_with_name.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama


In [28]:
# use link dataset to get the proper movie ID
# OneHotEncoding the genres

ratings_with_name.isnull().value_counts()

userId  movieId  rating  timestamp  title  genres
False   False    False   False      False  False     81485
                                    True   True         37
dtype: int64

In [29]:
ratings_with_name.shape

(81522, 6)

## Check and remove duplicates

In [35]:
duplicates = movies_meta[movies_meta.duplicated(subset=['title'],keep=False)].sort_values('title')
# duplicates_unique = movies_meta[movies_meta.duplicated(subset=['title'], keep='first')].sort_values('title')
duplicates_unique = duplicates.drop_duplicates(subset=['title'])
duplicates_title = duplicates['title'].unique()
ratings_with_name_backup = ratings_with_name.copy(deep=True)

In [48]:
for title in duplicates_title:
    ratings_with_name[ratings_with_name['title']==title]['movieId'] = duplicates_unique[duplicates_unique['title'] == ratings_with_name[ratings_with_name['title']==title]]['movieId']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version.  Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version.  Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`



A value is trying to be set on a copy of a slice from a 

## Process Genres (OneHotEncoding)

In [30]:
genre = ratings_with_name['genres'].unique()

In [31]:
import numpy as np
genre_list = []
for g in genre:
    if isinstance(g, str):
        list_g = g.split('|')
        for s in list_g:
            if s not in genre_list:
                genre_list.append(s)

print(genre_list)

['Drama', 'Animation', 'Children', 'Musical', 'Thriller', 'Action', 'Adventure', 'Sci-Fi', 'War', 'Fantasy', 'Horror', 'Romance', 'Comedy', 'Crime', 'Western', 'Mystery', 'IMAX', 'Documentary', 'Film-Noir', '(no genres listed)']


In [50]:
for genre in genre_list:
    ratings_with_name[genre] = ratings_with_name.apply(lambda x: genre in str(x['genres']), axis=1)

In [51]:
ratings_with_name.head(6)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Animation,Children,Musical,...,Horror,Romance,Comedy,Crime,Western,Mystery,IMAX,Documentary,Film-Noir,(no genres listed)
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1,1263,2.0,1260759151,"Deer Hunter, The (1978)",Drama|War,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [53]:
ratings_with_name[ratings_with_name['(no genres listed)']]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Animation,Children,Musical,...,Horror,Romance,Comedy,Crime,Western,Mystery,IMAX,Documentary,Film-Noir,(no genres listed)
18406,200,136592,1.5,1438020227,Freaky Friday (1995),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
28189,299,83829,4.5,1344180332,Scorpio Rising (1964),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
30157,324,149532,3.0,1451519751,Marco Polo: One Hundred Eyes (2015),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
33927,371,122888,5.0,1473624419,Ben-hur (2016),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
53642,572,132952,4.0,1436466718,Sarfarosh (1999),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
61006,648,128616,4.0,1426357951,As We Were Dreaming (2015),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
61402,652,140753,4.0,1439587070,The Men Next Door (2012),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
61407,652,140763,5.0,1439587331,Boy Crazy (2009),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
64803,56,160590,5.0,1467095789,Survive and Advance (2013),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
77517,547,134025,3.0,1432654721,Open Secret (2013),(no genres listed),False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [52]:
ratings_with_name.to_csv('cleaned_movie_ratings.csv')

In [54]:
ratings_with_name

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Animation,Children,Musical,...,Horror,Romance,Comedy,Crime,Western,Mystery,IMAX,Documentary,Film-Noir,(no genres listed)
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81517,664,34405,4.0,1343732038,Serenity (2005),Action|Adventure|Sci-Fi,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
81518,664,71530,4.0,1393891216,Surrogates (2009),Action|Sci-Fi|Thriller,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
81519,664,80846,4.0,1344436046,Devil (2010),Horror|Mystery|Thriller,False,False,False,False,...,True,False,False,False,False,True,False,False,False,False
81520,664,8950,4.5,1343747045,The Machinist (2004),Drama|Mystery|Thriller,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [63]:
movies_meta[movies_meta['title']=='20,000 Leagues Under the Sea (1997)']

Unnamed: 0,movieId,title,genres
21121,102190,"20,000 Leagues Under the Sea (1997)",Adventure|Romance|Sci-Fi
24626,114130,"20,000 Leagues Under the Sea (1997)",Romance|Sci-Fi


In [68]:
ratings_with_name[ratings_with_name['title']=='Weekend (2011)']

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Animation,Children,Musical,...,Horror,Romance,Comedy,Crime,Western,Mystery,IMAX,Documentary,Film-Noir,(no genres listed)
8576,56,91199,4.0,1467092472,Weekend (2011),Drama|Romance,True,False,False,False,...,False,True,False,False,False,False,False,False,False,False
96184,652,91199,3.5,1439487912,Weekend (2011),Drama|Romance,True,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [80]:
# pd.set_option('display.max_rows', None) # Display full dataframe in console
# ratings[ratings['movieId'].isin(duplicate_id)]
ratings[ratings['movieId'].isin([114240, 588])]

Unnamed: 0,userId,movieId,rating,timestamp
88,2,588,3.0,835355441
106,3,588,3.0,1298922100
173,4,588,5.0,949949486
366,5,588,3.5,1163373551
528,7,588,4.0,851868044
1101,15,588,0.5,1093028161
3289,19,588,3.0,855195077
3551,20,588,3.5,1238729785
3816,22,588,2.0,1131662084
4072,23,588,4.0,1166728178


In [72]:
duplicate_id = duplicates['movieId'].unique()