In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the movie metadata. A few rows of the CSV have more fields than the
# header declares (the parser reports them as "expected 8 fields, saw N");
# those rows are skipped with a warning instead of aborting the load.
moviesCsvPath = '../../data/serendipity-sac2018/movies.csv'
try:
    # Spelling used by pandas >= 1.3: 'warn' skips each malformed row and
    # reports it, matching the legacy error_bad_lines=False behaviour.
    movies = pd.read_csv(moviesCsvPath, on_bad_lines='warn')
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword; fall back to
    # the legacy flag (removed in pandas 2.0).
    movies = pd.read_csv(moviesCsvPath, error_bad_lines=False)

b'Skipping line 19833: expected 8 fields, saw 10\nSkipping line 34143: expected 8 fields, saw 9\nSkipping line 36015: expected 8 fields, saw 10\nSkipping line 37260: expected 8 fields, saw 12\nSkipping line 44379: expected 8 fields, saw 10\nSkipping line 47551: expected 8 fields, saw 10\n'


In [3]:
# Preview the first rows of the freshly loaded table.
movies.head()

Unnamed: 0,movieId,title,releaseDate,directedBy,starring,imdbId,tmdbId,genres
0,1,Toy Story (1995),1995-11-19,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",114709,862.0,"Adventure,Animation,Children,Comedy,Fantasy"
1,2,Jumanji (1995),1995-12-15,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",113497,8844.0,"Adventure,Children,Fantasy"
2,3,Grumpier Old Men (1995),1995-01-01,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",113228,15602.0,"Comedy,Romance"
3,4,Waiting to Exhale (1995),1996-01-15,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",114885,31357.0,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),1995-12-08,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",113041,11862.0,Comedy


In [4]:
# Number of distinct movie ids and of distinct raw genre strings
# (a missing value counts as one distinct entry).
movies["movieId"].nunique(dropna=False), movies["genres"].nunique(dropna=False)

(49168, 2506)

In [5]:
# Turn the comma-separated `genres` string into a list per movie and add one
# 0/1 indicator column per genre label.
# Missing genres (NaN) become the single literal label 'nan' via str(), so
# the binarizer also produces a 'nan' column for movies without genres.
# The isinstance guard leaves values that are already lists untouched, so
# executing this cell again cannot mangle the column by splitting a list repr.
genreLists = movies['genres'].apply(
    lambda g: g if isinstance(g, list) else str(g).split(',')
)
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(genreLists)
# Build the indicator frame on movies' own index so the join pairs each row
# with its own flags rather than relying on both sides having a default index.
genreFlags = pd.DataFrame(X, columns=mlb.classes_, index=movies.index)
# Rebind `movies` only once everything has been computed: if the join raises
# (e.g. the indicator columns already exist), `movies` is left unchanged.
movies = movies.assign(genres=genreLists).join(genreFlags)

In [6]:
# Preview the table again; `genres` now holds lists and there is one 0/1
# column per genre label.
movies.head()

Unnamed: 0,movieId,title,releaseDate,directedBy,starring,imdbId,tmdbId,genres,Action,Adventure,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,nan
0,1,Toy Story (1995),1995-11-19,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",114709,862.0,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),1995-12-15,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",113497,8844.0,"[Adventure, Children, Fantasy]",0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),1995-01-01,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",113228,15602.0,"[Comedy, Romance]",0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),1996-01-15,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",114885,31357.0,"[Comedy, Drama, Romance]",0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),1995-12-08,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",113041,11862.0,[Comedy],0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Sample a fixed fraction of the movies of every genre label, then merge the
# per-genre samples into one table with each movie appearing once.
#
# filterRatio is applied per genre, not to the table as a whole: a movie with
# several genres gets a chance in each of them, so the share of the catalogue
# that survives de-duplication is larger than filterRatio.
filterRatio = 0.30  ## Fraction of each genre's movies to sample
samplingSeed = 42   # fixed seed so re-running the cell selects the same movies
rng = np.random.RandomState(samplingSeed)

# Collect the per-genre samples in a list and concatenate once; growing a
# DataFrame inside the loop copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
sampledPerGenre = []
for genre in mlb.classes_:
    genreMovies = movies[movies[genre] == 1]
    sampleSize = int(filterRatio * len(genreMovies))
    # Sampling without replacement; the shared RandomState advances between
    # genres, so each genre gets a different but reproducible draw.
    sampledPerGenre.append(genreMovies.sample(n=sampleSize, random_state=rng))

filteredMovies = pd.concat(sampledPerGenre, ignore_index=True)

#Drop duplicate movies (a movie drawn for several genres is kept once; the
#index keeps gaps where the repeated rows were removed)
filteredMovies = filteredMovies.drop_duplicates('movieId')

In [42]:
# Share of the full catalogue that ended up in the sample.
len(filteredMovies) / len(movies)

0.45458428246013666

In [43]:
# Count how often each exact genre combination occurs in the sample
# (`genres` holds one list of labels per movie).
filteredMovies["genres"].value_counts()

[Drama]                                                 2125
[Comedy]                                                1346
[Documentary]                                           1073
[nan]                                                    993
[Comedy, Drama]                                          826
                                                        ... 
[Comedy, Animation, Action, Adventure, Children]           1
[Action, Animation, Mystery, Sci-Fi]                       1
[Action, Adventure, Comedy, Drama, Romance, Western]       1
[Drama, Sci-Fi, Adventure, Romance]                        1
[Romance, Fantasy, Thriller]                               1
Name: genres, Length: 2141, dtype: int64

In [33]:
# List the column names of the sampled table.
filteredMovies.columns

Index(['movieId', 'title', 'releaseDate', 'directedBy', 'starring', 'imdbId',
       'tmdbId', 'genres', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western', 'nan'],
      dtype='object')

In [34]:
# Names of the genre indicator columns used for the per-genre statistics;
# the 'nan' column (movies without genre information) is left out.
genres_lst = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

In [35]:
# Number of sampled movies tagged with each genre (column sums of the 0/1 flags).
filteredMovies[genres_lst].sum()

Action          5054
Adventure       2999
Animation       1855
Children        1972
Comedy          9851
Crime           3732
Documentary     2421
Drama          14811
Fantasy         2033
Film-Noir        320
Horror          3621
IMAX             177
Musical          886
Mystery         2058
Romance         5109
Sci-Fi          2473
Thriller        5803
War             1291
Western          714
dtype: int64

In [36]:
# Per genre: sampled movie count as a percentage of the count in the full table.
(
    filteredMovies[genres_lst].sum()
    .mul(100)
    .div(movies[genres_lst].sum())
)

Action         82.893226
Adventure      85.029770
Animation      78.969774
Children       82.822344
Comedy         71.596773
Crime          83.827493
Documentary    54.985237
Drama          71.117833
Fantasy        85.456074
Film-Noir      87.912088
Horror         74.922408
IMAX           89.393939
Musical        82.037037
Mystery        85.678601
Romance        80.723653
Sci-Fi         81.617162
Thriller       81.514258
War            79.938080
Western        70.137525
dtype: float64

In [26]:
# Inspect the index labels of the sampled table.
filteredMovies.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            45715, 45717, 45721, 45722, 45725, 45726, 45727, 45728, 45729,
            45730],
           dtype='int64', length=27716)

In [None]:
# Write the sampled table to disk in CSV format.
# NOTE(review): the target name has no '.csv' extension, and to_csv is called
# with its defaults, which presumably also writes the index as an unnamed
# first column -- confirm that downstream readers expect both.
filteredMovies.to_csv('filtered_movies')