# Milestone 2: Prepping the training data

In [53]:
%matplotlib inline

import os
import sys
import nltk
import numpy as np
import pandas as pd
from imdb import IMDb
from unidecode import unidecode
from nltk.corpus import stopwords
from collections import OrderedDict
from matplotlib import pyplot as plt
from IPython.display import Image, display

src_dir = os.path.join(os.getcwd(), os.pardir, os.pardir, 'src')
sys.path.append(src_dir)

# Importing the tmdb code from src/data
from data import tmdb

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
df = pd.read_csv('../../data/movie_metadata.csv')
print(df.shape)
df.head()

(5043, 28)


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## $y$: Just picking the top X genres
In order of frequency:
- Drama
- Comedy
- Thriller
- Action
- Romance
- Adventure
- Crime

Furthermore, Action/Thriller, Action/Adventure, Drama/Romance, Drama/Thriller, and a few others could be combined.

In [4]:
pop_genres = OrderedDict()
pop_genres[1] = 'Drama'
pop_genres[2] = 'Comedy'
pop_genres[3] = 'Thriller'
pop_genres[4] = 'Action'
pop_genres[5] = 'Romance'
pop_genres[6] = 'Adventure'
pop_genres[7] = 'Crime'
pop_genres

OrderedDict([(1, 'Drama'),
             (2, 'Comedy'),
             (3, 'Thriller'),
             (4, 'Action'),
             (5, 'Romance'),
             (6, 'Adventure'),
             (7, 'Crime')])

In [7]:
def most_popular_genre(row):
    genres = row['genres'].split('|')
    for genre in genres:
        for key, value in pop_genres.items():
            if genre in value:
                return value

df['top_genre'] = df.apply(lambda x: most_popular_genre(x), axis = 1)
df['top_genre'].fillna(value = 'Other', inplace = True)
df[['genres', 'top_genre']].head()

Unnamed: 0,genres,top_genre
0,Action|Adventure|Fantasy|Sci-Fi,Action
1,Action|Adventure|Fantasy,Action
2,Action|Adventure|Thriller,Action
3,Action|Thriller,Action
4,Documentary,Other


In [10]:
df['top_genre'].value_counts()

Comedy       1400
Drama        1163
Action       1153
Adventure     453
Crime         386
Other         265
Thriller      198
Romance        25
Name: top_genre, dtype: int64

## $X$: Combining TMDb data with Kaggle data
I think grabbing the TMDb summaries for all the movies will be very valuable. We can then use a naive Bayes classifier on these summaries to determine whether a movie is of a certain genre or not. (Here's the scikit-learn documentation for the method we could use: [`sklearn.naive_bayes.MultinomialNB`](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html))

In [37]:
def grab_tmdb_summary(row):
    results = tmdb.find_movie_by_name(row['movie_title'])
    if 'results' in results and len(results['results']) >= 1 \
        and 'overview' in results['results'][0]:
            return results['results'][0]['overview']

In [105]:
small = df.sample(n = 20)
small.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,top_genre
1051,Color,Francis Ford Coppola,36.0,123.0,0.0,886.0,Bob Hoskins,12000.0,25900000.0,Crime|Drama|Music,...,English,USA,R,58000000.0,1984.0,5000.0,6.5,1.85,828,Crime
1272,Color,Christian Duguay,110.0,116.0,18.0,249.0,Cary-Hiroyuki Tagawa,2000.0,30199105.0,Action|Adventure|Crime|Thriller,...,English,USA,R,40000000.0,2000.0,1000.0,5.7,2.35,470,Action
3640,Color,Peter Cattaneo,47.0,107.0,11.0,419.0,Olivia Williams,773.0,54606.0,Comedy|Crime|Romance,...,English,UK,PG-13,6000000.0,2001.0,766.0,6.2,2.35,30,Comedy
4937,Color,Bill Melendez,43.0,25.0,36.0,27.0,Bill Melendez,39.0,,Animation|Comedy|Family,...,English,USA,TV-G,150000.0,1965.0,36.0,8.4,1.33,0,Comedy
3456,Color,Vincent Paronnaud,242.0,89.0,10.0,106.0,Gena Rowlands,963.0,4443403.0,Animation|Biography|Drama|War,...,French,France,PG-13,7300000.0,2007.0,545.0,8.0,1.85,14000,Drama


In [106]:
small['summary'] = small.apply(lambda x: grab_tmdb_summary(x), axis = 1)
small[['movie_title', 'summary']].head()

Unnamed: 0,movie_title,summary
1051,The Cotton Club,The story of the people that frequented Harlem...
1272,The Art of War,When ruthless terrorists threaten to bring dow...
3640,Lucky Break,Half-way through his 12-year prison sentence f...
4937,A Charlie Brown Christmas,When Charlie Brown complains about the overwhe...
3456,Persepolis,"In 1970s Iran, Marjane 'Marji' Statrapi watche..."


In [107]:
stops = set(nltk.corpus.stopwords.words('english'))
all_words_ever = []

def add_cols_for_all_words(row):
    if row['summary'] is not None:
        words = nltk.word_tokenize(row['summary'])
        words = [word.lower() for word in words if len(word) > 1 and word not in stops]
        all_words_ever.extend(words)
    return row

def parse_words(row):
    words = nltk.word_tokenize(row['summary'])
    words = [word.lower() for word in words if len(word) > 1 and word not in stops]
    for word in words:
        row[unidecode(word)] += 1
    return row

In [108]:
small = small.apply(lambda x: add_cols_for_all_words(x), axis = 1)

for word in all_words_ever:
    small[unidecode(word)] = 0

small = small.apply(lambda x: parse_words(x), axis = 1)
    
print(small.shape)
small.head()

(20, 547)


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,called,sisterhood,quiet,suburban,town,backdrop,modern-day,salem,witch,trial
1051,Color,Francis Ford Coppola,36.0,123.0,0.0,886.0,Bob Hoskins,12000.0,25900000.0,Crime|Drama|Music,...,0,0,0,0,0,0,0,0,0,0
1272,Color,Christian Duguay,110.0,116.0,18.0,249.0,Cary-Hiroyuki Tagawa,2000.0,30199105.0,Action|Adventure|Crime|Thriller,...,0,0,0,0,0,0,0,0,0,0
3640,Color,Peter Cattaneo,47.0,107.0,11.0,419.0,Olivia Williams,773.0,54606.0,Comedy|Crime|Romance,...,0,0,0,0,0,0,0,0,0,0
4937,Color,Bill Melendez,43.0,25.0,36.0,27.0,Bill Melendez,39.0,,Animation|Comedy|Family,...,0,0,0,0,0,0,0,0,0,0
3456,Color,Vincent Paronnaud,242.0,89.0,10.0,106.0,Gena Rowlands,963.0,4443403.0,Animation|Biography|Drama|War,...,0,0,0,0,0,0,0,0,0,0
