# Milestone 3: Text analysis


In [25]:
%matplotlib inline

import os
import sys
import nltk
import json
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from unidecode import unidecode
from nltk.corpus import stopwords
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

plt.style.use('ggplot')

## Grabbing the data

In [26]:
df = pd.read_json('../../data/themoviedb-5k-sample-4-15-2017.json')
print(df.shape)
df.head()

(5000, 29)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,cast,crew,genre_ids,genres,homepage,id,...,revenue,reviews,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,,,0,[],"[{u'name': u'Bobby Sheehan', u'department': u'...",[99],"[{u'name': u'Documentary', u'id': 99}]",https://www.boughtmovie.net,328380,...,0,[],93.0,"[{u'iso_639_1': u'en', u'name': u'English'}]",Released,,Bought,False,0.0,0
1,False,,,0,[],"[{u'name': u'Nancy Donnelly', u'department': u...",[99],"[{u'id': 99, u'name': u'Documentary'}]",,112052,...,0,[],60.0,"[{u'iso_639_1': u'en', u'name': u'English'}]",Released,,Egypt Underworld,False,3.8,2
2,False,/jWgtb7CRy3jWoXiz0IK352MG8b5.jpg,,0,"[{u'name': u'Zhang Jin', u'character': u'Chau ...","[{u'name': u'Douglas Kung Cheung-Tak', u'depar...",[28],"[{u'id': 28, u'name': u'Action'}]",http://www.hkcinemagic.com/en/movie.asp?id=6627,201706,...,0,[],1.0,"[{u'name': u'广州话 / 廣州話', u'iso_639_1': u'cn'},...",Released,,Chinese Heroes,False,0.0,0
3,False,,,0,"[{u'name': u'Erika Eleniak', u'character': u'J...","[{u'name': u'Brian Katkin', u'department': u'D...","[28, 18, 53]","[{u'id': 28, u'name': u'Action'}, {u'id': 18, ...",,61803,...,0,[],92.0,"[{u'name': u'English', u'iso_639_1': u'en'}]",Released,he quake of the century... get ready to rumble...,Shakedown,False,3.0,2
4,False,/sHH63HMAQsKLl0vpJ6AxulmkL92.jpg,,0,"[{u'name': u'Kay Panabaker', u'character': u'J...","[{u'name': u'Kevin Tancharoen', u'department':...","[10402, 35, 18, 10749]","[{u'id': 10402, u'name': u'Music'}, {u'id': 35...",,28665,...,0,[],107.0,"[{u'iso_639_1': u'en', u'name': u'English'}]",Released,,Fame,False,5.4,109


In [27]:
# Keeping only genres, overviews, and reviews
df = df[['genres', 'reviews', 'overview']].copy()
print(df.shape)
df.head()

(5000, 3)


Unnamed: 0,genres,reviews,overview
0,"[{u'name': u'Documentary', u'id': 99}]",[],Modern industrialization is no longer about st...
1,"[{u'id': 99, u'name': u'Documentary'}]",[],The documentary is an analysis of the fascinat...
2,"[{u'id': 28, u'name': u'Action'}]",[],A good-natured kung fu kid gets caught up in c...
3,"[{u'id': 28, u'name': u'Action'}, {u'id': 18, ...",[],In Los Angeles a deadly plague called the 'Pan...
4,"[{u'id': 10402, u'name': u'Music'}, {u'id': 35...",[],"An updated version of the 1980 musical, which ..."


## Converting `genres` to bigrams

In [28]:
all_bigrams_ever = []

def bigrams(row):
    return zip(row['genre_lists'], row['genre_lists'][1:])

def col_for_each_bigram(row):
    for bigram in row['genre_bigrams']:            
        all_bigrams_ever.append(tuple(sorted(bigram)))
    return row

def popular_bigram(row):
    for k in row['genre_bigrams']:
        if bigram_frequency[k] <= 50:
            row['popular_bigram'] = 'Other'
        elif k in OrderedDict(bigram_frequency.most_common()).keys() and row['popular_bigram'] != k:
            row['popular_bigram'] = k
    return row
                
df['genre_lists'] = df.apply(lambda x: [y['name'] for y in x['genres']], axis = 1)
df['genre_bigrams'] = df.apply(lambda x: bigrams(x), axis = 1)
df['popular_bigram'] = np.nan
df = df.apply(lambda x: col_for_each_bigram(x), axis = 1)
bigram_frequency = Counter(all_bigrams_ever)
df = df.apply(lambda x: popular_bigram(x), axis = 1)
df.head()

Unnamed: 0,genres,reviews,overview,genre_lists,genre_bigrams,popular_bigram
0,"[{u'name': u'Documentary', u'id': 99}]",[],Modern industrialization is no longer about st...,[Documentary],[],
1,"[{u'id': 99, u'name': u'Documentary'}]",[],The documentary is an analysis of the fascinat...,[Documentary],[],
2,"[{u'id': 28, u'name': u'Action'}]",[],A good-natured kung fu kid gets caught up in c...,[Action],[],
3,"[{u'id': 28, u'name': u'Action'}, {u'id': 18, ...",[],In Los Angeles a deadly plague called the 'Pan...,"[Action, Drama, Thriller]","[(Action, Drama), (Drama, Thriller)]","(Drama, Thriller)"
4,"[{u'id': 10402, u'name': u'Music'}, {u'id': 35...",[],"An updated version of the 1980 musical, which ...","[Music, Comedy, Drama, Romance]","[(Music, Comedy), (Comedy, Drama), (Drama, Rom...","(Drama, Romance)"


The final $y$ I'll be predicting is `popular_bigram`. 

## Converting `overview` and `reviews` to word-frequency vectors

In [29]:
# I realized that sometimes 'overview' was in my top words list; renaming it here 'protects' it
df.rename(columns={'overview': 'movie_overview'}, inplace=True)

stops = set(nltk.corpus.stopwords.words('english'))
all_words_ever = []

def add_cols_for_all_words(row):
    if row['movie_overview'] is not None:
        words = nltk.word_tokenize(row['movie_overview'])
        words = [word.lower() for word in words if len(word) > 2 and unidecode(word.lower()) not in stops]
        all_words_ever.extend(words)
    return row

def parse_words(row):
    if row['movie_overview'] is not None:
        words = nltk.word_tokenize(row['movie_overview'])
        words = [word.lower() for word in words if word.lower() in top_words]
        for word in words:
            row[unidecode(word)] += 1        
    return row

In [30]:
df['review_text'] = df.apply(lambda row: [x['content'] for x in row['reviews']], axis = 1)
df.head()

Unnamed: 0,genres,reviews,movie_overview,genre_lists,genre_bigrams,popular_bigram,review_text
0,"[{u'name': u'Documentary', u'id': 99}]",[],Modern industrialization is no longer about st...,[Documentary],[],,[]
1,"[{u'id': 99, u'name': u'Documentary'}]",[],The documentary is an analysis of the fascinat...,[Documentary],[],,[]
2,"[{u'id': 28, u'name': u'Action'}]",[],A good-natured kung fu kid gets caught up in c...,[Action],[],,[]
3,"[{u'id': 28, u'name': u'Action'}, {u'id': 18, ...",[],In Los Angeles a deadly plague called the 'Pan...,"[Action, Drama, Thriller]","[(Action, Drama), (Drama, Thriller)]","(Drama, Thriller)",[]
4,"[{u'id': 10402, u'name': u'Music'}, {u'id': 35...",[],"An updated version of the 1980 musical, which ...","[Music, Comedy, Drama, Romance]","[(Music, Comedy), (Comedy, Drama), (Drama, Rom...","(Drama, Romance)",[]


In [31]:
df = df.apply(lambda x: add_cols_for_all_words(x), axis = 1)
len(all_words_ever), len(set(all_words_ever))

(153604, 25820)

In [33]:
# I'll use the top N words
top_words = [word[0] for word in Counter(all_words_ever).most_common(50)]
len(top_words)

50

In [34]:
for word in top_words:
    df[unidecode(word)] = 0

df = df.apply(lambda x: parse_words(x), axis = 1)
    
print(df.shape)
df.head()

(5000, 57)


Unnamed: 0,genres,reviews,movie_overview,genre_lists,genre_bigrams,popular_bigram,review_text,one,life,film,...,make,also,help,together,wife,death,american,war,day,night
0,"[{u'name': u'Documentary', u'id': 99}]",[],Modern industrialization is no longer about st...,[Documentary],[],,[],0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[{u'id': 99, u'name': u'Documentary'}]",[],The documentary is an analysis of the fascinat...,[Documentary],[],,[],0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,"[{u'id': 28, u'name': u'Action'}]",[],A good-natured kung fu kid gets caught up in c...,[Action],[],,[],0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[{u'id': 28, u'name': u'Action'}, {u'id': 18, ...",[],In Los Angeles a deadly plague called the 'Pan...,"[Action, Drama, Thriller]","[(Action, Drama), (Drama, Thriller)]","(Drama, Thriller)",[],0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[{u'id': 10402, u'name': u'Music'}, {u'id': 35...",[],"An updated version of the 1980 musical, which ...","[Music, Comedy, Drama, Romance]","[(Music, Comedy), (Comedy, Drama), (Drama, Rom...","(Drama, Romance)",[],0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Fitting the model
I'll use a Naive Bayes multinomial classifier. We have to predict 28 unique genre bigrams. We're using 500 "top word" frequency vectors. 

In [63]:
X = df[top_words[:25]]
y = df['popular_bigram'].fillna(value = 'Other')
y = y.apply(lambda x: str(x))
X.shape, y.shape

((5000, 25), (5000,))

In [64]:
clf = MultinomialNB()

In [65]:
scores = cross_val_score(clf, X, y, cv=5)
print("Accuracy: {:0.2f} (+/- {:0.2f})%".format(scores.mean(), scores.std() * 2))

Accuracy: 0.72 (+/- 0.01)%


I believe a random classifier would be correct 1/28 (3%) of the time, given that there are 28 unique labels we are trying to predict. As such, 72% is a lot better!

In [66]:
top_words[:25]

[u'one',
 u'life',
 u'film',
 u'new',
 u'world',
 u'young',
 u'two',
 u'man',
 u'story',
 u'family',
 u'love',
 u'years',
 u'time',
 u'find',
 u'woman',
 u'...',
 u'documentary',
 u'get',
 u'way',
 u'home',
 u'must',
 u'people',
 u'back',
 u'friends',
 u'first']