In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request
import json
import pickle

In [2]:
# Load the movie catalogue, reading only the id and title columns.
movies_df = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'object'},
)
movies_df

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
62418,209157,We (2018)
62419,209159,Window of the Soul (2001)
62420,209163,Bad Poems (2018)
62421,209169,A Girl Thing (2001)


In [3]:
# Load the user ratings; 32-bit dtypes are requested for the three columns read.
rating_df = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)
rating_df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [4]:
# Attach the movie title to every rating row by joining on movieId
# (merge's default inner join).
data = rating_df.merge(movies_df, on='movieId')
data

Unnamed: 0,userId,movieId,rating,title
0,1,296,5.0,Pulp Fiction (1994)
1,3,296,5.0,Pulp Fiction (1994)
2,4,296,4.0,Pulp Fiction (1994)
3,5,296,4.0,Pulp Fiction (1994)
4,7,296,4.0,Pulp Fiction (1994)
...,...,...,...,...
25000090,162358,200192,2.0,Den frusna leoparden (1986)
25000091,162358,200194,2.0,Tough Luck (2004)
25000092,162386,139970,3.5,I Don't Speak English (1995)
25000093,162386,200726,4.0,The Graduates (1995)


In [5]:
# Print the merged frame's row count, column dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
 3   title    object 
dtypes: float32(1), int32(2), object(1)
memory usage: 667.6+ MB


In [6]:
# Discard any rating row that has no title, since it cannot be grouped.
combine_movie_rating = data.dropna(subset=['title'])

# Count how many ratings each title received: one row per title, with the
# count stored in 'totalratingcount'.
ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()
movie_ratingcount = ratings_per_title.rename('totalratingcount').reset_index()
movie_ratingcount

Unnamed: 0,title,totalratingcount
0,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",1
1,"""Great Performances"" Cats (1998)",179
2,#1 Cheerleader Camp (2010),9
3,#Captured (2017),2
4,#Female Pleasure (2018),3
...,...,...
58953,…And the Fifth Horseman Is Fear (1965),2
58954,キサラギ (2007),2
58955,チェブラーシカ (2010),9
58956,貞子3D (2012),12


In [7]:
# Attach each title's total rating count to every individual rating row.
# how='left' keeps every row of combine_movie_rating (the left frame) and
# looks up its count in movie_ratingcount through the shared 'title' column.
final_data_rating = pd.merge(
    combine_movie_rating,
    movie_ratingcount,
    on='title',
    how='left',
)
final_data_rating

Unnamed: 0,userId,movieId,rating,title,totalratingcount
0,1,296,5.0,Pulp Fiction (1994),79672
1,3,296,5.0,Pulp Fiction (1994),79672
2,4,296,4.0,Pulp Fiction (1994),79672
3,5,296,4.0,Pulp Fiction (1994),79672
4,7,296,4.0,Pulp Fiction (1994),79672
...,...,...,...,...,...
25000090,162358,200192,2.0,Den frusna leoparden (1986),1
25000091,162358,200194,2.0,Tough Luck (2004),1
25000092,162386,139970,3.5,I Don't Speak English (1995),1
25000093,162386,200726,4.0,The Graduates (1995),1


In [9]:
# Print row count, column dtypes and memory usage after adding totalratingcount.
final_data_rating.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000095 entries, 0 to 25000094
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   userId            int32  
 1   movieId           int32  
 2   rating            float32
 3   title             object 
 4   totalratingcount  int64  
dtypes: float32(1), int32(2), int64(1), object(1)
memory usage: 858.3+ MB


In [11]:
# Downcast the count column from int64 to int32 to reduce the frame's memory use.
final_data_rating['totalratingcount'] = final_data_rating['totalratingcount'].astype(np.int32)

In [12]:
# Summary statistics of the numeric columns. userId and movieId are
# identifiers, so only the rating and totalratingcount columns are informative.
# NOTE(review): rating is stored as float32; mean/std accumulated over ~25M
# float32 values may lose precision. Consider describing a float64 cast of the
# column instead. TODO confirm.
final_data_rating.describe()

Unnamed: 0,userId,movieId,rating,totalratingcount
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.185093,14925.36
std,46791.72,39198.86,1.094043,16439.34
min,1.0,1.0,0.5,1.0
25%,40510.0,1196.0,3.0,2986.0
50%,80914.0,2947.0,3.5,9152.0
75%,121557.0,8623.0,4.0,20757.0
max,162541.0,209171.0,5.0,81491.0


In [13]:
# Keep only the ratings of movies that have at least `popularity_threshold`
# ratings in total.
popularity_threshold = 1000
rating_popular_movie = final_data_rating.loc[
    final_data_rating['totalratingcount'] >= popularity_threshold
]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalratingcount
0,1,296,5.0,Pulp Fiction (1994),79672
1,3,296,5.0,Pulp Fiction (1994),79672
2,4,296,4.0,Pulp Fiction (1994),79672
3,5,296,4.0,Pulp Fiction (1994),79672
4,7,296,4.0,Pulp Fiction (1994),79672


In [14]:
# Number of distinct (non-null) titles that passed the popularity filter.
# nunique() counts them directly instead of building a full frequency table
# with value_counts() only to count its rows.
rating_popular_movie['title'].nunique()

3794

In [15]:
# Ratings per title among the popular movies, most-rated first.
rating_popular_movie['title'].value_counts()

Forrest Gump (1994)                 81491
Shawshank Redemption, The (1994)    81482
Pulp Fiction (1994)                 79672
Silence of the Lambs, The (1991)    74127
Matrix, The (1999)                  72674
                                    ...  
Pet Sematary II (1992)               1001
Stakeout (1987)                      1000
Darkest Hour (2017)                  1000
Black Mass (2015)                    1000
Farinelli: il castrato (1994)        1000
Name: title, Length: 3794, dtype: int64

In [16]:
# Derive the movie name by stripping the trailing "(YYYY)" year from the title.
#
# Running str.extract over every rating row allocates new strings per row and
# raised MemoryError on this frame. The regex is therefore applied once per
# distinct title and the result is mapped back onto the rows.
#
# Titles that do not end in a four-digit year in parentheses do not match the
# pattern and get NaN as movie_name, exactly as with a row-wise str.extract.
unique_titles = pd.Series(rating_popular_movie['title'].unique())
title_to_name = dict(
    zip(
        unique_titles,
        unique_titles.str.extract(r'^(.*?)\s*\(\d{4}\)$', expand=False),
    )
)
rating_popular_movie['movie_name'] = rating_popular_movie['title'].map(title_to_name)

MemoryError: 

In [None]:
# Display the frame to inspect the newly added movie_name column.
rating_popular_movie

In [None]:
# The full title is no longer needed once movie_name exists; remove it in place.
rating_popular_movie.drop(columns=['title'], inplace=True)

In [None]:
# Normalise the movie names: lower-case them, and add a variant with all
# spaces removed.
#
# The string work is done once per distinct name and mapped back onto the
# rows, because row-wise .str operations on this frame allocate a new string
# for every rating row. regex=False makes the space removal a literal
# replacement regardless of the pandas version's default.
distinct_names = pd.Series(rating_popular_movie['movie_name'].dropna().unique())
lowered_names = distinct_names.str.lower()
name_to_lower = dict(zip(distinct_names, lowered_names))
name_to_nospace = dict(
    zip(distinct_names, lowered_names.str.replace(" ", "", regex=False))
)

# Build the no-space column first, while movie_name still holds the original
# names that key both lookup tables; missing names stay NaN in both columns.
rating_popular_movie['movie_name_nospace'] = rating_popular_movie['movie_name'].map(name_to_nospace)
rating_popular_movie['movie_name'] = rating_popular_movie['movie_name'].map(name_to_lower)
rating_popular_movie

In [None]:
# Persist the processed ratings to a CSV file; index=False leaves the row
# index out of the file.
rating_popular_movie.to_csv('rating_popular_movie.csv',index=False)