In [1]:

import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
# `IPython.display` is the public import location for these helpers; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML, clear_output
# Widen the notebook's main container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import datetime
import pickle

In [3]:
class MovieFilter(object):
    """Narrow a movies table step by step with a set of filters.

    Every ``filter_*`` method replaces ``self.movies`` with the rows that pass
    the filter and prints how many rows were removed, so successive calls
    compound on the already-filtered table.
    """

    def __init__(self, movies, title_col='title'):
        """
        :param movies: DataFrame of movies; must contain ``title_col``.
        :param title_col: name of the column that holds the movie title.
        """
        self.movies = movies
        self.title_col = title_col

    def _keep(self, mask, filter_name):
        """Keep the rows of ``self.movies`` selected by ``mask`` and report the change."""
        len0 = len(self.movies)
        # Force a boolean dtype so that an empty mask is still used as a row
        # selector rather than being interpreted as a list of column labels.
        self.movies = self.movies[mask.astype(bool)]
        len1 = len(self.movies)
        self.print_filter_results(filter_name, len0, len1)

    def filter_string_length(self, length=60):
        """Keep movies whose title is strictly shorter than ``length`` characters.

        Rows with a missing title are dropped.
        """
        title_lens = self.movies[self.title_col].str.len()
        self._keep(title_lens < length, 'filter_string_length')

    def filter_english_words(self, num_allow=2, words_path=None):
        """Keep movies whose title is made up (almost) entirely of known words.

        The title is lower-cased, punctuation and digits are removed, and the
        remaining whitespace-separated words are looked up in the word list.
        A title is rejected as soon as ``num_allow`` unknown words have been
        seen, i.e. at most ``num_allow - 1`` unknown words are tolerated.
        Non-string titles (e.g. missing values) are rejected.

        :param num_allow: number of unknown words at which a title is rejected.
        :param words_path: path of a text file with one word per line.
            Defaults to ``../data/wordsEn.txt`` relative to the current
            working directory.
        """
        import string
        if words_path is None:
            words_path = os.path.join(os.getcwd(), "..", "data", "wordsEn.txt")
        # `with` guarantees the file handle is closed once the set is built.
        with open(words_path) as words_file:
            words_en = set(line.strip() for line in words_file)
        # Built once instead of once per character.
        punctuation = set(string.punctuation)

        def check_if_english(title):
            if not isinstance(title, str):
                return False
            cleaned = ''.join(
                c for c in title.lower()
                if c not in punctuation and not c.isdigit()
            )
            count = 0
            # split() without arguments collapses the runs of spaces left
            # behind by removed digits/punctuation, so no empty "words" are
            # counted as unknown.
            for word in cleaned.split():
                if word not in words_en:
                    count += 1
                    if count >= num_allow:
                        return False
            return True

        mask = self.movies[self.title_col].map(check_if_english)
        self._keep(mask, 'filter_english_words')

    def filter_release_year(self, min_year=1990):
        """Keep movies released strictly after ``min_year``.

        The release year is the first ``(dddd)`` group found in the title;
        movies without such a group (or without a string title) are dropped.
        """
        import re
        year_pattern = re.compile(r'\((\d{4})\)')

        def get_release_year(title):
            found = year_pattern.search(title) if isinstance(title, str) else None
            # NaN (rather than None) keeps the column numeric, so the
            # comparison below never sees a None.
            return float(found.group(1)) if found else float('nan')

        release_year = self.movies[self.title_col].map(get_release_year).astype(float)
        # Comparisons against NaN are False, which drops the year-less titles.
        self._keep(release_year > min_year, 'filter_release_year')

    def filter_rating_freq(self, freq, threshold=200, movieId_col='movieId'):
        """Keep movies that have at least ``threshold`` ratings.

        :param freq: Series of rating counts indexed by movie id.
        :param threshold: minimum count (inclusive) for a movie to be kept.
        :param movieId_col: name of the movie id column in ``self.movies``.
        """
        frequent_ids = freq[freq >= threshold].index
        mask = self.movies[movieId_col].isin(frequent_ids)
        self._keep(mask, 'filter_rating_freq')

    @staticmethod
    def print_filter_results(filter_name, len0, len1):
        """Print how many movies a filter removed, given the row counts before and after."""
        print('{} filtered out {} movies. Num before: {}. Num after: {}'.format(filter_name, len0 - len1, len0, len1))

    def reduce_ratings_dataset(self, ratings, movieId_col='movieId'):
        """Return the ratings that belong to movies still present in ``self.movies``.

        The input frame is not modified; the reduced frame is returned (the
        original implementation computed it but never handed it back).

        :param ratings: DataFrame of ratings containing ``movieId_col``.
        :param movieId_col: name of the movie id column in both frames.
        :return: the rows of ``ratings`` whose movie id is in ``self.movies``.
        """
        mask = ratings[movieId_col].isin(self.movies[movieId_col])
        len0 = len(ratings)
        ratings = ratings[mask]
        len1 = len(ratings)
        print('Filtered out {} ratings. Num before: {}. Num after: {}'.format(len0 - len1, len0, len1))
        return ratings

In [5]:
# Directory that holds the MovieLens 20M CSV files. Set the ML20M_DIR
# environment variable to point somewhere else; otherwise the original
# machine-specific location is used.
path = os.environ.get('ML20M_DIR') or 'C:\\Users\\jz3f19\\Downloads\\ml-20m\\ml-20m\\'
# Later cells build file names with `path + 'name.csv'`, so the value must
# end with a path separator.
if not path.endswith(('\\', '/')):
    path += os.sep

In [6]:
# Load the movie catalogue from the data directory.
movies = pd.read_csv(filepath_or_buffer=path + 'movies.csv')

In [7]:
# Preview the first rows to check which columns were loaded.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Load the full ratings table from the data directory.
ratings = pd.read_csv(filepath_or_buffer=path + 'ratings.csv')

In [9]:
# Number of (non-null) ratings per movie, indexed by movieId.
frequences = ratings['rating'].groupby(ratings['movieId']).count()
frequences.head()

movieId
1    49695
2    22243
3    12735
4     2756
5    12161
Name: rating, dtype: int64

In [10]:
# Wrap the movies table in a MovieFilter; title_col keeps its default ('title').
mf = MovieFilter(movies)

In [13]:
# Keep only movies with at least 100 ratings. This replaces mf.movies, so
# re-running the cell filters the already-reduced table again.
mf.filter_rating_freq(frequences, threshold=100)

filter_rating_freq filtered out 18732 movies. Num before: 27278. Num after: 8546


In [14]:
# Movies that are left after the filter calls made on `mf` so far.
red_movies = mf.movies

In [16]:
# Inner-join the ratings with the ids of the remaining movies, which drops
# every rating whose movie was filtered out.
red_rating_data = ratings.merge(red_movies[['movieId']], on='movieId')

In [17]:
# Display the filtered ratings (the rendered table is truncated by pandas).
# NOTE(review): red_rating_data.head() would keep the stored output smaller.
red_rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,5,2,3.0,851527569
2,13,2,3.0,849082742
3,29,2,3.0,835562174
4,34,2,3.0,846509384
...,...,...,...,...
19706276,130767,79590,2.5,1281482514
19706277,131900,79590,3.5,1322985471
19706278,133511,79590,4.0,1286208539
19706279,134510,79590,5.0,1315887279


In [19]:
# Persist the filtered ratings next to the source CSV files, without the
# DataFrame index column.
red_rating_data.to_csv(path_or_buf=path + 'ratings_filtered.csv', index=False)