In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import spacy
import neuralcoref
import multiprocessing as mp

from collections import OrderedDict, defaultdict
from openie import StanfordOpenIE
from itertools import groupby
from fuzzywuzzy import fuzz
from gender_predictor.GenderClassifier import classify_gender

In [2]:
# Read the three tab-separated inputs with header=None and explicit column names.
movie_data_df = pd.read_csv(
    'movie.metadata.tsv',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=['id', 'free_id', 'movie_name', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'],
)
character_df = pd.read_csv(
    'character.metadata.tsv',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=['id', 'free_id', 'release_date','char_name', 'dob', 'gender', 'height', 'ethnicity', 'name', 'age', 'free_char_id1', 'free_char_id2', 'free_char_id3'],
)
plot_df = pd.read_csv(
    '../MovieSummaries/plot_summaries.txt',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=['id', 'plot'],
)

# Keep only the columns that the later cells read.
character_df = character_df[['id', 'char_name', 'gender']]

# release_year is the first four characters of release_date, or None when the date is missing.
movie_data_df['release_year'] = movie_data_df['release_date'].apply(
    lambda r: None if str(r) == 'nan' else r[:4]
)

# country name -> release year -> list of movie ids, restricted to these three countries.
movie_id_by_year = {'United States of America': {}, 'India': {}, 'United Kingdom': {}}

for _, row in movie_data_df.iterrows():
    # 'countries' holds a JSON object; only its values (country names) are used.
    for country in json.loads(row['countries']).values():
        if country in movie_id_by_year:
            movie_id_by_year[country].setdefault(row['release_year'], []).append(row.id)
                


In [6]:
# Properties handed to the Stanford OpenIE client.
properties = {'openie.affinity_probability_cap': 1 / 3}

# Shared OpenIE client and spaCy pipeline used by the functions in later cells.
client = StanfordOpenIE(properties=properties)
nlp = spacy.load('en_core_web_sm')

# Register the neuralcoref component on the pipeline under the name 'neuralcoref'.
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

In [11]:
def coreference_movie_plot(nlp, plot):
    """Run ``plot`` through the ``nlp`` callable and return the result's ``_.coref_resolved`` value."""
    return nlp(plot)._.coref_resolved

def key_func(k):
    """Sort/group key for a triple: the value stored under its 'subject' key."""
    subject = k['subject']
    return subject

def get_triplets(plot, client):
    """Extract triples from a plot and group them by subject.

    The plot is split on "." and every non-blank fragment is passed to
    ``client.annotate``. Within a fragment the triples are sorted by subject
    (stable sort) and only the first triple of each subject is kept.

    Args:
        plot: Plot text.
        client: Object exposing ``annotate(text)`` that returns an iterable of
            triple dicts; each dict must have a 'subject' key.

    Returns:
        dict mapping subject -> list of triple dicts, one entry for every
        fragment in which that subject appeared.
    """
    sent_dict = {}
    for sentence in plot.split("."):
        # Splitting on "." leaves empty or whitespace-only fragments (for
        # example after the final period); there is nothing to annotate there.
        if not sentence.strip():
            continue

        triples = sorted(client.annotate(sentence), key=key_func)
        for subject, group in groupby(triples, key_func):
            # Keep only the first triple of each subject within this fragment.
            sent_dict.setdefault(subject, []).append(next(group))
    return sent_dict
    
def get_event_chain(triplets, threshold=50):
    """Collect the relations of each subject, merging similarly named subjects.

    Subjects are visited in the iteration order of ``triplets``. A subject
    whose ``fuzz.ratio`` against one or more already-registered characters is
    at least ``threshold`` contributes its relations to every such character;
    otherwise it is registered as a new character.

    Args:
        triplets: Mapping subject -> list of triple dicts, each with a
            'relation' key.
        threshold: Minimum ``fuzz.ratio`` score for two subject strings to be
            treated as the same character. Defaults to 50.

    Returns:
        defaultdict(set) mapping character -> set of relation strings. Being a
        set, duplicate relations collapse and their order is not preserved.
    """
    characters = defaultdict(set)
    for subject, triples in triplets.items():
        events = [triple['relation'] for triple in triples]
        # Snapshot the matching names first so the mapping is not touched
        # while it is being scanned.
        matches = [name for name in characters if fuzz.ratio(name, subject) >= threshold]
        for name in matches or [subject]:
            characters[name].update(events)

    return characters

            
def get_movies_in_decade(country, zone="before"):
    """Return the movie ids of ``country`` on one side of the year 2000.

    With ``zone == 'before'`` the ids of years comparing below '2000' are
    returned; any other ``zone`` value selects the years at or above '2000'.
    Years are compared as strings, and entries whose year is None are ignored.
    """
    wants_before = zone == 'before'
    movie_ids = []
    for year, ids in movie_id_by_year[country].items():
        if year is None:
            continue
        if (year < '2000') == wants_before:
            movie_ids.extend(ids)
    return movie_ids

def get_bigrams(event_chain):
    """Tokenize ``event_chain`` with NLTK and return the bigrams of its tokens as a list."""
    return list(nltk.bigrams(nltk.word_tokenize(event_chain)))

In [9]:
# Keywords that mark a subject as female.
female_words = [
    'she', 'her', 'woman', 'women',
    'ladies', 'girls', 'lady', 'aunt',
    'grandmother', 'female', 'girl', 'damsel',
    'maiden', 'daughter', 'sister', 'mother',
]
# Keywords that mark a subject as male.
male_words = [
    'he', 'his', 'him', 'man', 'male',
    'men', 'boys', 'gentleman', 'uncle', 'grandfather',
    'gentlemen', 'boy', 'bloke', 'brother', 'father',
]


def _pick_character_name(character):
    """Return the lower-cased token that identifies a subject phrase, or "" if none.

    The phrase is tokenized and POS-tagged; the last token that is tagged as a
    noun ('NN' in tag) or pronoun ('PRP' in tag), or that is one of the
    gendered keywords, wins.
    """
    character_name = ""
    for token, tag in nltk.pos_tag(nltk.word_tokenize(character)):
        lowered = token.lower()
        # The keyword lists are lower-case, so compare the lower-cased token.
        if 'NN' in tag or 'PRP' in tag or lowered in female_words or lowered in male_words:
            character_name = lowered
    return character_name


def _resolve_gender(character_name, char_list, gender_list):
    """Resolve the gender label for ``character_name``.

    Lookup order: the per-movie cache ``gender_list``, the gendered keyword
    lists, a fuzzy match against the movie's character metadata rows, and
    finally ``classify_gender``.
    """
    if character_name in gender_list:
        return gender_list[character_name]
    if character_name in female_words:
        return 'F'
    if character_name in male_words:
        return 'M'

    for _, char in char_list.iterrows():
        known_name, known_gender = char['char_name'], char['gender']
        # Missing values arrive as NaN, which is truthy and would otherwise be
        # compared as the string 'nan' (name) or returned as the label (gender).
        if pd.isna(known_name) or not known_name or pd.isna(known_gender):
            continue
        # fuzz.ratio is case-sensitive and character_name is already
        # lower-cased, so lower-case the metadata name as well.
        if fuzz.ratio(str(known_name).lower(), character_name) >= 50:
            return known_gender

    return classify_gender(character_name)


def get_frequency_for_movie(movie, decade_2000, verbose=True):
    """Count relation bigrams per gender for one movie.

    Pipeline: resolve coreferences in the plot, extract triples, build a
    relation set per character, decide each character's gender, then count the
    bigrams of the joined relations under that gender. A bigram whose
    ``fuzz.ratio`` against existing keys is at least 75 increments every such
    key; otherwise it is added as a new key.

    Args:
        movie: Row-like object indexed by 'id' and 'plot'.
        decade_2000: List of movie ids. Currently unused: the membership
            filter on ``movie['id']`` is not applied.
        verbose: When True (default) print the intermediate stages and the
            final result.

    Returns:
        dict with keys 'M' and 'F', each a defaultdict(int) mapping a bigram
        string to its count. Characters whose resolved gender is neither key
        are skipped.
    """
    def trace(*args):
        if verbose:
            print(*args)

    # Get triplets using OpenIE on the coreference-resolved movie plot.
    trace(movie['plot'], "\n\n")
    coref_plot = coreference_movie_plot(nlp, movie['plot'])
    trace(coref_plot, "\n\n")

    triplets = get_triplets(coref_plot, client)
    trace(triplets, "\n\n")

    # Relation set for each character.
    event_chains = get_event_chain(triplets)
    trace(event_chains, "\n\n")

    frequency_list = {'M': defaultdict(int), 'F': defaultdict(int)}
    char_list = character_df[character_df['id'] == movie['id']]
    gender_list = {}

    for character, event_chain in event_chains.items():
        try:
            character_name = _pick_character_name(character)
            # Don't process subjects that contain no noun, pronoun or keyword.
            if character_name == "":
                continue

            gender = _resolve_gender(character_name, char_list, gender_list)
            gender_list[character_name] = gender
            trace(character_name, gender)

            # Only 'M' and 'F' are counted; skip any other label explicitly
            # instead of relying on a swallowed KeyError.
            if gender not in frequency_list:
                continue
            counts = frequency_list[gender]

            bigrams = get_bigrams(" ".join(event_chain))
            trace(bigrams)

            for bigram in bigrams:
                search = " ".join(bigram)
                similar = [key for key in counts if fuzz.ratio(search, key) >= 75]
                for key in similar or [search]:
                    counts[key] += 1

        except Exception as exc:
            # Best effort per character: report which subject failed and go on.
            print("Skipping character", repr(character), "-", exc)

    trace(frequency_list)
    return frequency_list
    
def get_frequency_mapping(all_movie_plots, country, zone):
    """Aggregate the per-gender bigram counts over every row of ``all_movie_plots``.

    Args:
        all_movie_plots: DataFrame whose rows are passed one by one to
            ``get_frequency_for_movie``.
        country: Country key forwarded to ``get_movies_in_decade``.
        zone: Zone selector forwarded to ``get_movies_in_decade``.

    Returns:
        dict with keys 'M' and 'F', each a defaultdict(int) holding the sum of
        the counts returned for the individual movies.
    """
    frequency_list = {'M': defaultdict(int), 'F': defaultdict(int)}
    decade_2000 = get_movies_in_decade(country, zone)

    for _, movie in all_movie_plots.iterrows():
        result = get_frequency_for_movie(movie, decade_2000)
        # Nothing to merge when the per-movie call produced no mapping.
        if not result:
            continue
        for gender, counts in result.items():
            totals = frequency_list.setdefault(gender, defaultdict(int))
            for bigram, count in counts.items():
                totals[bigram] += count

    return frequency_list


def get_adjective_cloud(plots, country, zone='before'):
    """Forward the arguments to ``get_frequency_mapping`` and return its result unchanged."""
    return get_frequency_mapping(plots, country, zone)

In [21]:
# Run the pipeline over every plot summary in plot_df (country 'India', zone 'before').
# The run recorded below was stopped manually, hence the KeyboardInterrupt.
get_adjective_cloud(plot_df, 'India', 'before')

Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all. 


Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite Shlykov, a hard-working taxi driver and Lyosha, a saxophonist prejudices, realize Shlykov, a hard-working taxi driver and Lyosha, a saxophonist aren't so different after all. 


{'Lyosha': [{'subject': 'Lyosha', 'relation': 'realize despite', 'object': 'Shlykov'}], 'Shlykov': [{'subject': 'Shlykov', 'relation': 'develop', 'object': 'love hate relationship'}], 'saxophonist prejudices': [{'subject': 'saxophonist prejudices', 'relation': 'realize despite', 'object': 'Shlykov'}], 'taxi driver': [{'subject': 'taxi driver', 'relation': 'realize despite', 'object': 'Shlykov'}], 'working taxi driver': [{'subject': 'working taxi driver', 'relation': 'realize despite', 'object': 'Shlykov'}]} 


default

KeyboardInterrupt: 

In [12]:
# Process a single plot summary (the row at position 41 of plot_df);
# the empty list is the decade_2000 argument.
get_frequency_for_movie(plot_df.iloc[41], [])

In the desert wilderness of 1930s Manchuria, The Bad  - a bandit and hitman - is hired to acquire a treasure map from a Japanese official traveling by train. Before he can get it however, The Weird  - a thief  - steals the map and is caught up in The Bad's derailment of the train. This involves the slaughter of the Japanese and Manchurian guards, and various civilians. The Good  - an eagle-eyed bounty hunter - appears on the scene to claim the bounty on The Bad. Meanwhile The Weird escapes, eluding his Good and Bad pursuers. A third force - a group of Manchurian bandits - also want the map to sell to the Ghost Market. The Weird hopes to uncover the map's secrets and recover what he believes is gold and riches buried by the Qing Dynasty just before the collapse of their government. As the story continues, an escalating battle for the map occurs, with bounties placed on heads and the Imperial Japanese Army racing to reclaim its map as it can apparently "save the Japanese Empire". After a

{'M': defaultdict(int,
             {'recognizes in': 1,
              'begins With': 1,
              'With erupts': 1,
              'erupts with': 1,
              'with flees': 1,
              'flees across': 1,
              'across lay': 1,
              'lay as': 1,
              'as can': 1,
              'can get': 1,
              'sell to': 1,
              'does survive': 1,
              'survive along': 1,
              'along make': 1,
              'make sets': 1,
              'sets off': 1,
              'off buried': 1,
              'buried by': 1,
              'kills can': 1,
              'can save': 1}),
 'F': defaultdict(int, {'is in': 1, 'begins With': 1})}