In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv("../data/movie_metadata.csv")

In [3]:
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [4]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [5]:
drop_col = ['color', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'aspect_ratio', 
            'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes']
imdb = data.drop(drop_col, axis=1)

In [6]:
imdb.columns

Index(['director_name', 'num_critic_for_reviews', 'duration', 'actor_2_name',
       'gross', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name', 'num_user_for_reviews',
       'language', 'country', 'content_rating', 'budget', 'title_year',
       'imdb_score', 'movie_facebook_likes'],
      dtype='object')

## Convert string to non-string

In [7]:
imdb.select_dtypes(exclude=['float', 'int']).columns.values

array(['director_name', 'actor_2_name', 'genres', 'actor_1_name',
       'movie_title', 'actor_3_name', 'language', 'country',
       'content_rating'], dtype=object)

In [18]:
imdb['movie_title'] = imdb['movie_title'].str.strip() # fix movie names

### For names

In [19]:
def convert_to_unique_id(col_name, new_col_name, fill_na_as='unknown'):
    global imdb
    # fill all Na values 
    imdb[col_name].fillna(fill_na_as, inplace=True)
    
    # get unique values and create an array with values as index
    elements = imdb[col_name].str.strip().unique()
    _ids = pd.Series(np.arange(len(elements)), index=elements)
    
    # map 
    imdb[new_col_name] = imdb[col_name].apply(lambda x: _ids[x])
    imdb.drop(col_name, axis=1, inplace=True)

In [20]:
imdb.head()

Unnamed: 0,director_name,num_critic_for_reviews,duration,actor_2_name,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes
0,James Cameron,723.0,178.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000
1,Gore Verbinski,302.0,169.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0
2,Sam Mendes,602.0,148.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000
3,Christopher Nolan,813.0,164.0,Christian Bale,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000
4,Doug Walker,,,Rob Walker,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,,,,,,7.1,0


In [21]:
# convert names to ids
str_cols = ['director_name', 'actor_2_name', 'actor_1_name', 'actor_3_name']
for col in str_cols:
    new_name = '_'.join(col.split('_')[:-1]) + '_id'
    convert_to_unique_id(col, new_name)

In [22]:
imdb.head()

Unnamed: 0,num_critic_for_reviews,duration,gross,genres,movie_title,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,director_id,actor_2_id,actor_1_id,actor_3_id
0,723.0,178.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,Avatar,886204,4834,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,0,0,0
1,302.0,169.0,309404152.0,Action|Adventure|Fantasy,Pirates of the Caribbean: At World's End,471220,48350,1238.0,English,USA,PG-13,300000000.0,2007.0,7.1,0,1,1,1,1
2,602.0,148.0,200074175.0,Action|Adventure|Thriller,Spectre,275868,11700,994.0,English,UK,PG-13,245000000.0,2015.0,6.8,85000,2,2,2,2
3,813.0,164.0,448130642.0,Action|Thriller,The Dark Knight Rises,1144337,106759,2701.0,English,USA,PG-13,250000000.0,2012.0,8.5,164000,3,3,3,3
4,,,,Documentary,Star Wars: Episode VII - The Force Awakens,8,143,,,,,,,7.1,0,4,4,4,4


In [23]:
imdb[imdb.director_id == 0]

Unnamed: 0,num_critic_for_reviews,duration,gross,genres,movie_title,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,language,country,content_rating,budget,title_year,imdb_score,movie_facebook_likes,director_id,actor_2_id,actor_1_id,actor_3_id
0,723.0,178.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,Avatar,886204,4834,3054.0,English,USA,PG-13,237000000.0,2009.0,7.9,33000,0,0,0,0
26,315.0,194.0,658672302.0,Drama|Romance,Titanic,793059,45223,2528.0,English,USA,PG-13,200000000.0,1997.0,7.7,26000,0,23,20,23
288,210.0,153.0,204843350.0,Action|Sci-Fi,Terminator 2: Judgment Day,744891,2829,983.0,English,USA,R,102000000.0,1991.0,8.5,13000,0,224,155,251
291,94.0,141.0,146282411.0,Action|Comedy|Thriller,True Lies,190439,4528,351.0,English,USA,R,115000000.0,1994.0,7.2,0,0,226,157,253
606,82.0,171.0,54222000.0,Adventure|Drama|Sci-Fi|Thriller,The Abyss,131217,4074,380.0,English,USA,PG-13,69500000.0,1989.0,7.6,0,0,446,280,511
2486,250.0,154.0,85200000.0,Action|Adventure|Sci-Fi,Aliens,488537,4228,1076.0,English,USA,R,18500000.0,1986.0,8.4,18000,0,1511,280,1804
3575,204.0,107.0,38400000.0,Action|Sci-Fi,The Terminator,600266,3582,692.0,English,UK,R,6500000.0,1984.0,8.1,13000,0,588,280,1834


### country and language
All movies that have null as thier 'country' or 'language', are actually US movie. We fill them with accordingly

In [24]:
imdb[(imdb.country.isnull() | imdb.language.isnull())][['movie_title', 'country', 'language']]

Unnamed: 0,movie_title,country,language
4,Star Wars: Episode VII - The Force Awakens,,
279,"10,000 B.C.",,
2370,"Gone, Baby, Gone",,English
2459,Unforgettable,USA,
3086,September Dawn,USA,
3397,Preacher,,English
3539,Alpha and Omega 4: The Legend of the Saw Tooth...,USA,
3869,Silent Movie,USA,
4021,Dawn Patrol,,English
4110,Love's Abiding Joy,USA,


In [25]:
imdb.country.fillna('USA', inplace=True)
imdb.language.fillna('English', inplace=True)

### content rating

Movies - PG-13, PG, G, R, NC-17<br>
TV - TV-14, TV-PG, TV-MA, TV-G, TV-Y, TV-Y7<br>
others: Approved, Passed<br>
nulls - nan, Not Rated, Unrated, X<br>

map as:<br>

|new-rating|includes|
|----------|--------|
|G|G, TV-Y, TV-Y7, TV-G
|PG|PG, TV-PG, Approved, Passed, GP
|PG-13|PG-13, TV-14
|R|R, X
|NC-17|NC-17, TV-MA, M
|unrated| nan, Not Rated, Unrated
    

In [26]:
imdb.content_rating.unique()

array(['PG-13', nan, 'PG', 'G', 'R', 'TV-14', 'TV-PG', 'TV-MA', 'TV-G',
       'Not Rated', 'Unrated', 'Approved', 'TV-Y', 'NC-17', 'X', 'TV-Y7',
       'GP', 'Passed', 'M'], dtype=object)

In [27]:
imdb['content_rating'] = imdb.content_rating.replace({
    'TV-Y' : 'G',
    'TV-Y7' : 'G',
    'TV-G' : 'G',
    'TV-PG' : 'PG',
    'Approved' : 'PG',
    'Passed' : 'PG',
    'GP' : 'PG',
    'TV-14' : 'PG-13',
    'X' : 'R',
    'TV-MA' : 'NC-17',
    'M' : 'NC-17',
    'Not Rated' : 'Unrated'
})

In [28]:
imdb.content_rating.fillna('Unrated', inplace=True)

In [29]:
imdb.content_rating.unique()

array(['PG-13', 'Unrated', 'PG', 'G', 'R', 'NC-17'], dtype=object)

### genre

In [31]:
imdb.iloc[2]

num_critic_for_reviews                             602
duration                                           148
gross                                      2.00074e+08
genres                       Action|Adventure|Thriller
movie_title                                    Spectre
num_voted_users                                 275868
cast_total_facebook_likes                        11700
num_user_for_reviews                               994
language                                       English
country                                             UK
content_rating                                   PG-13
budget                                        2.45e+08
title_year                                        2015
imdb_score                                         6.8
movie_facebook_likes                             85000
director_id                                          2
actor_2_id                                           2
actor_1_id                                           2
actor_3_id

In [32]:
def genre_parser(row):
    genres = row.genres.split('|')
    for genre in genres:
        row.set_value(genre.strip(), 1)
    return row

In [33]:
genres = set()
imdb['genres'].apply(lambda x: genres.update(x.split('|')))

print('Total number of genres: {}'.format(len(genres)))
print('Genres are: ', [g for g in genres], sep=' ')

Total number of genres: 26
Genres are:  ['Short', 'News', 'Documentary', 'History', 'Biography', 'Game-Show', 'Musical', 'Animation', 'Mystery', 'Comedy', 'Music', 'Romance', 'Adventure', 'Fantasy', 'Drama', 'Thriller', 'Sport', 'Horror', 'Action', 'Reality-TV', 'Sci-Fi', 'Crime', 'War', 'Western', 'Family', 'Film-Noir']


We add new columns to the data frame with the column names are the genres. Then use binary encoding to specify the genres of the movie

In [34]:
for genre in genres:
    imdb[genre.strip()] = 0

In [35]:
imdb.columns

Index(['num_critic_for_reviews', 'duration', 'gross', 'genres', 'movie_title',
       'num_voted_users', 'cast_total_facebook_likes', 'num_user_for_reviews',
       'language', 'country', 'content_rating', 'budget', 'title_year',
       'imdb_score', 'movie_facebook_likes', 'director_id', 'actor_2_id',
       'actor_1_id', 'actor_3_id', 'Short', 'News', 'Documentary', 'History',
       'Biography', 'Game-Show', 'Musical', 'Animation', 'Mystery', 'Comedy',
       'Music', 'Romance', 'Adventure', 'Fantasy', 'Drama', 'Thriller',
       'Sport', 'Horror', 'Action', 'Reality-TV', 'Sci-Fi', 'Crime', 'War',
       'Western', 'Family', 'Film-Noir'],
      dtype='object')

In [36]:
imdb = imdb.apply(genre_parser, axis=1)

In [37]:
print(imdb.iloc[0].genres)
imdb.iloc[0][19:]

Action|Adventure|Fantasy|Sci-Fi


Short          0
News           0
Documentary    0
History        0
Biography      0
Game-Show      0
Musical        0
Animation      0
Mystery        0
Comedy         0
Music          0
Romance        0
Adventure      1
Fantasy        1
Drama          0
Thriller       0
Sport          0
Horror         0
Action         1
Reality-TV     0
Sci-Fi         1
Crime          0
War            0
Western        0
Family         0
Film-Noir      0
Name: 0, dtype: object

In [38]:
imdb = imdb.drop('genres', axis=1)

In [39]:
imdb.columns

Index(['num_critic_for_reviews', 'duration', 'gross', 'movie_title',
       'num_voted_users', 'cast_total_facebook_likes', 'num_user_for_reviews',
       'language', 'country', 'content_rating', 'budget', 'title_year',
       'imdb_score', 'movie_facebook_likes', 'director_id', 'actor_2_id',
       'actor_1_id', 'actor_3_id', 'Short', 'News', 'Documentary', 'History',
       'Biography', 'Game-Show', 'Musical', 'Animation', 'Mystery', 'Comedy',
       'Music', 'Romance', 'Adventure', 'Fantasy', 'Drama', 'Thriller',
       'Sport', 'Horror', 'Action', 'Reality-TV', 'Sci-Fi', 'Crime', 'War',
       'Western', 'Family', 'Film-Noir'],
      dtype='object')

## Fix numeric values

In [43]:
num_cols = imdb.select_dtypes(include=['float', 'int']).columns[:-30]
num_cols

Index(['num_critic_for_reviews', 'duration', 'gross', 'num_voted_users',
       'cast_total_facebook_likes', 'num_user_for_reviews', 'budget',
       'title_year', 'imdb_score', 'movie_facebook_likes'],
      dtype='object')

We will drop <br>
    1. 'gross',
    2. 'num_critic_for_reviews'
    3. 'num_voted_users'
    4. 'num_user_for_reviews'
    5. 'budget'
as these do not affect user liking/disliking a movie<br>

In [44]:
imdb.drop(
    ['gross', 'num_critic_for_reviews', 'num_voted_users', 'num_user_for_reviews', 'budget'],
    axis=1,
    inplace=True
)

In [47]:
imdb.columns

Index(['duration', 'movie_title', 'cast_total_facebook_likes', 'language',
       'country', 'content_rating', 'title_year', 'imdb_score',
       'movie_facebook_likes', 'director_id', 'actor_2_id', 'actor_1_id',
       'actor_3_id', 'Short', 'News', 'Documentary', 'History', 'Biography',
       'Game-Show', 'Musical', 'Animation', 'Mystery', 'Comedy', 'Music',
       'Romance', 'Adventure', 'Fantasy', 'Drama', 'Thriller', 'Sport',
       'Horror', 'Action', 'Reality-TV', 'Sci-Fi', 'Crime', 'War', 'Western',
       'Family', 'Film-Noir'],
      dtype='object')

In [48]:
imdb.isnull().sum()[:-30]

duration                      15
movie_title                    0
cast_total_facebook_likes      0
language                       0
country                        0
content_rating                 0
title_year                   108
imdb_score                     0
movie_facebook_likes           0
dtype: int64

### fix duration and title_year

In [49]:
imdb[imdb.duration.isnull()]['movie_title']

4          Star Wars: Episode VII - The Force Awakens
199     Harry Potter and the Deathly Hallows: Part II
206      Harry Potter and the Deathly Hallows: Part I
1510                              Black Water Transit
3604                                      War & Peace
3815                             Should've Been Romeo
3834                                            Barfi
4299                           Hum To Mohabbat Karega
4392                                         N-Secure
4397                              Dil Jo Bhi Kahey...
4517                                       Wolf Creek
4609                                Karachi se Lahore
4690                                          Destiny
4948                                 Romantic Schemer
4989                                    The Naked Ape
Name: movie_title, dtype: object

Fill with mean!

In [50]:
imdb.duration.fillna(imdb.duration.mean(), inplace=True)

IDK what to do with title year, fill with 0 for now!

In [51]:
imdb.title_year.fillna(0, inplace=True)

## Final checks!

In [52]:
imdb.isnull().sum().sum() 

0

In [54]:
imdb.select_dtypes(exclude=['float', 'int']).columns

Index(['movie_title', 'language', 'country', 'content_rating'], dtype='object')

In [55]:
# rename language, country, and content_rating with unique ids
for col in ['language', 'country', 'content_rating']:
    convert_to_unique_id(col_name=col, new_col_name=col+'_id')

In [56]:
imdb.select_dtypes(exclude=['float', 'int']).columns

Index(['movie_title'], dtype='object')

In [57]:
movies = imdb.movie_title
imdb.drop('movie_title', axis=1, inplace=True)

In [58]:
imdb.set_index(movies)

Unnamed: 0_level_0,duration,cast_total_facebook_likes,title_year,imdb_score,movie_facebook_likes,director_id,actor_2_id,actor_1_id,actor_3_id,Short,...,Reality-TV,Sci-Fi,Crime,War,Western,Family,Film-Noir,language_id,country_id,content_rating_id
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,178.000000,4834,2009.0,7.9,33000,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,169.000000,48350,2007.0,7.1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Spectre,148.000000,11700,2015.0,6.8,85000,2,2,2,2,0,...,0,0,0,0,0,0,0,0,1,0
The Dark Knight Rises,164.000000,106759,2012.0,8.5,164000,3,3,3,3,0,...,0,0,0,0,0,0,0,0,0,0
Star Wars: Episode VII - The Force Awakens,107.201074,143,0.0,7.1,0,4,4,4,4,0,...,0,0,0,0,0,0,0,0,0,1
John Carter,132.000000,1873,2012.0,6.6,24000,5,5,5,5,0,...,0,1,0,0,0,0,0,0,0,0
Spider-Man 3,156.000000,46055,2007.0,6.2,0,6,6,6,6,0,...,0,0,0,0,0,0,0,0,0,0
Tangled,100.000000,2036,2010.0,7.8,29000,7,7,7,7,0,...,0,0,0,0,0,1,0,0,0,2
Avengers: Age of Ultron,141.000000,92000,2015.0,7.5,118000,8,8,8,8,0,...,0,1,0,0,0,0,0,0,0,0
Harry Potter and the Half-Blood Prince,153.000000,58753,2009.0,7.5,10000,9,9,9,9,0,...,0,0,0,0,0,1,0,0,1,2


Convert unnecessary float dtypes to int

In [59]:
imdb.select_dtypes(include=['float']).columns

Index(['duration', 'title_year', 'imdb_score'], dtype='object')

In [60]:
imdb['duration'] = imdb['duration'].astype('int16')
imdb['title_year'] = imdb['title_year'].astype('int16')

In [61]:
imdb.duration.dtype, imdb.title_year.dtype

(dtype('int16'), dtype('int16'))

In [62]:
imdb.head()

Unnamed: 0,duration,cast_total_facebook_likes,title_year,imdb_score,movie_facebook_likes,director_id,actor_2_id,actor_1_id,actor_3_id,Short,...,Reality-TV,Sci-Fi,Crime,War,Western,Family,Film-Noir,language_id,country_id,content_rating_id
0,178,4834,2009,7.9,33000,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,169,48350,2007,7.1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,148,11700,2015,6.8,85000,2,2,2,2,0,...,0,0,0,0,0,0,0,0,1,0
3,164,106759,2012,8.5,164000,3,3,3,3,0,...,0,0,0,0,0,0,0,0,0,0
4,107,143,0,7.1,0,4,4,4,4,0,...,0,0,0,0,0,0,0,0,0,1


## Save dataset for later use

In [102]:
imdb['movie_title'] = movies

In [103]:
imdb.to_csv('../data/imdb_dataset.csv', index=False)

## Helper to load data

In [104]:
def load_imdb(path):
    data = pd.read_csv(path)
    movies = data['movie_title']
    data.drop('movie_title', axis=1, inplace=True)
    data.set_index(movies, inplace=True)
    return movies, data

In [105]:
movies, imdb = load_imdb('../data/imdb_dataset.csv')

In [106]:
imdb.head()

Unnamed: 0_level_0,duration,cast_total_facebook_likes,title_year,imdb_score,movie_facebook_likes,director_id,actor_2_id,actor_1_id,actor_3_id,Action,...,Horror,Family,Mystery,Biography,Crime,History,Romance,language_id,country_id,content_rating_id
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,178,4834,2009,7.9,33000,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,169,48350,2007,7.1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Spectre,148,11700,2015,6.8,85000,2,2,2,2,1,...,0,0,0,0,0,0,0,0,1,0
The Dark Knight Rises,164,106759,2012,8.5,164000,3,3,3,3,1,...,0,0,0,0,0,0,0,0,0,0
Star Wars: Episode VII - The Force Awakens,107,143,0,7.1,0,4,4,4,4,0,...,0,0,0,0,0,0,0,0,0,1
