In [None]:
import pandas as pd
import numpy as np
import json
import nltk
import spacy
import neuralcoref
import multiprocessing as mp

from collections import OrderedDict, defaultdict
from openie import StanfordOpenIE
from itertools import groupby
from fuzzywuzzy import fuzz
from gender_predictor.GenderClassifier import classify_gender
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm, notebook
from IPython.utils import io

# English stop-word list from NLTK; lemmatize_words() drops any token found in it.
# NOTE(review): this presumably requires the NLTK 'stopwords' corpus to be
# downloaded already (nltk.download('stopwords')) - confirm on a fresh environment.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# movie_data_df = pd.read_csv('movie.metadata.tsv', sep='\t', skip_blank_lines=True, header=None, names=['id', 'free_id', 'movie_name', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres'])
# character_df  = pd.read_csv('character.metadata.tsv', sep='\t', skip_blank_lines=True, header=None, names=['id', 'free_id', 'release_date','char_name', 'dob', 'gender', 'height', 'ethnicity', 'name', 'age', 'free_char_id1', 'free_char_id2', 'free_char_id3'])
# plot_df       = pd.read_csv('../MovieSummaries/plot_summaries.txt', sep='\t', skip_blank_lines=True, header=None, names=['id', 'plot'])

# character_df  = character_df[['id', 'char_name', 'gender']]

# movie_data_df['release_year'] = movie_data_df['release_date'].apply(lambda r:r[:4] if str(r)!='nan' else None)

# movie_id_by_year = {'United States of America':{}, 'India':{}, 'United Kingdom':{}}

# for index, row in movie_data_df.iterrows():
#     for key, value in json.loads(row['countries']).items():            
#         if value == 'United States of America' or value == 'India' or value == 'United Kingdom':
#             if row['release_year'] not in movie_id_by_year[value]:
#                 movie_id_by_year[value][row['release_year']] = [row.id]
#             else:
#                 movie_id_by_year[value][row['release_year']].append(row.id)
                


In [None]:
# Options passed to the Stanford OpenIE client when it is constructed.
properties = {
    'openie.affinity_probability_cap': 1 / 3,
}

# OpenIE client used by get_triplets() to extract triples from each sentence.
client = StanfordOpenIE(properties=properties)
# spaCy English pipeline; also used for NER on character names further down.
nlp    = spacy.load('en_core_web_sm')
# Register NeuralCoref as a pipeline component named 'neuralcoref';
# coreference_movie_plot() reads its result through `doc._.coref_resolved`.
coref  = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

In [None]:
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Return the lemmatized, stop-word-free tokens of ``text`` joined by spaces.

    The text is tokenized and POS-tagged with NLTK. A token is discarded when
    it is tagged ``NNP`` or starts with an uppercase character, or when it
    appears in the module-level ``stopwords`` list. Every remaining token is
    passed to the module-level ``lemmatizer`` (no POS hint is supplied).

    Args:
        text: Raw text to process.

    Returns:
        A single string with the surviving lemmas separated by one space;
        an empty string when nothing survives.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token, pos in tagged_tokens
        # Proper nouns / capitalised tokens are dropped first, then stop words.
        if not (pos == 'NNP' or token[0].isupper()) and token not in stopwords
    ]
    return ' '.join(kept_lemmas)

In [None]:
def coreference_movie_plot(nlp, plot):
    """Run ``plot`` through ``nlp`` and return its coreference-resolved text.

    Args:
        nlp: Callable pipeline whose result exposes ``_.coref_resolved``
            (the NeuralCoref extension attribute).
        plot: Plot summary text.

    Returns:
        The value of ``doc._.coref_resolved`` for the processed plot.
    """
    return nlp(plot)._.coref_resolved

def key_func(k):
    """Sort/group key for an OpenIE triple: its ``'subject'`` value.

    Raises:
        KeyError: If the triple has no ``'subject'`` entry.
    """
    return k['subject']


def get_triplets(plot, client):
    """Extract OpenIE triples from ``plot`` and group them by subject.

    The plot is split on ``"."`` and every non-blank fragment is sent to
    ``client.annotate``. Within one fragment the triples are sorted by subject
    and only the FIRST triple of each subject is kept (this matches the
    original behaviour; the other triples sharing that subject in the same
    fragment are discarded). Kept triples accumulate across fragments.

    Args:
        plot: Plot text (typically already coreference-resolved).
        client: Object with an ``annotate(text)`` method returning an iterable
            of dicts that each contain at least a ``'subject'`` key.

    Returns:
        A dict mapping each subject string to the list of triples kept for it,
        in fragment order.
    """
    sent_dict = {}
    for sentence in plot.split("."):
        # Splitting on "." yields empty / whitespace-only fragments (trailing
        # period, "...", empty plot); they carry no triples, so do not spend
        # an annotator round-trip on them.
        if not sentence.strip():
            continue

        # sorted() is stable, so the first triple of each group is the first
        # one the annotator returned for that subject.
        triples = sorted(client.annotate(sentence), key=key_func)
        for subject, group in groupby(triples, key_func):
            sent_dict.setdefault(subject, []).append(next(group))
    return sent_dict
    
def get_event_chain(triplets):
    """Merge triples into a per-character set of "relation object" events.

    Each subject in ``triplets`` is compared (``fuzz.ratio``) against every
    character name collected so far. Its events are added to EVERY existing
    character scoring at least 50; if none does, the subject becomes a new
    character of its own.

    Args:
        triplets: Mapping of subject -> list of triples, where each triple has
            ``'relation'`` and ``'object'`` entries.

    Returns:
        A ``defaultdict(set)`` mapping character name -> set of event strings.
    """
    # TODO: neighbouring identical events are only collapsed because a set is
    # used; the order of the chain is not preserved.
    characters = defaultdict(set)
    for subject, subject_triples in triplets.items():
        # Open question from the original author: use the relation alone, or
        # relation + object as done here?
        events = [t['relation'] + " " + t['object'] for t in subject_triples]

        similar_names = [
            known for known in characters.keys()
            if fuzz.ratio(known, subject) >= 50
        ]
        for known in similar_names:
            characters[known].update(events)
        if not similar_names:
            characters[subject].update(events)

    return characters

            
def get_movies_in_decade(country, zone="before", movies_by_year=None):
    """Collect the movie ids of ``country`` released in one of two year ranges.

    Years are compared as strings: ``zone == 'before'`` selects years
    ``< '2000'``; any other ``zone`` value selects years ``>= '2012'``.
    Years from 2000 through 2011 are therefore never selected, and ``None``
    year keys are always ignored.

    Args:
        country: Key into the country -> {year: [movie ids]} mapping.
        zone: ``'before'`` for pre-2000 releases; anything else for 2012+.
        movies_by_year: Optional country -> {year: [ids]} mapping. When
            omitted, the notebook-level ``movie_id_by_year`` global is used;
            that global is only built by the (currently commented-out)
            metadata-loading cell, so pass the mapping explicitly when that
            cell has not been run.

    Returns:
        A new list with the ids of all selected years, in the mapping's
        year iteration order.

    Raises:
        NameError: If ``movies_by_year`` is omitted and ``movie_id_by_year``
            has not been defined.
        KeyError: If ``country`` is not in the mapping.
    """
    if movies_by_year is None:
        movies_by_year = movie_id_by_year
    ids_by_year = movies_by_year[country]

    if zone == 'before':
        selected_years = [y for y in ids_by_year if y is not None and y < '2000']
    else:
        selected_years = [y for y in ids_by_year if y is not None and y >= '2012']

    return [movie_id for year in selected_years for movie_id in ids_by_year[year]]

def get_bigrams(event_chain):
    """Tokenize ``event_chain`` with NLTK and return its bigrams as a list.

    Args:
        event_chain: Text to tokenize.

    Returns:
        A list built from ``nltk.bigrams`` over the word tokens.
    """
    return list(nltk.bigrams(nltk.word_tokenize(event_chain)))

In [None]:
# Lower-case cue words used by get_frequency_for_movie() to assign a gender
# to a subject without calling the name-based classifier.
female_words = [
    'she', 'her', 'woman', 'women', 'ladies', 'girls', 'lady', 'aunt',
    'grandmother', 'female', 'girl', 'damsel', 'maiden', 'daughter',
    'sister', 'mother',
]
male_words = [
    'he', 'his', 'him', 'son', 'man', 'male', 'men', 'boys', 'gentleman',
    'uncle', 'grandfather', 'gentlemen', 'boy', 'bloke', 'brother', 'father',
]
# Pronouns that carry no gender; subjects matching these are skipped.
gender_neutral = [
    'they', 'them', 'it', 'theirs', 'i', 'you', 'we',
]

def _pick_character_name(character):
    """Reduce an OpenIE subject string to a single name-like token.

    Returns the last spaCy ``PERSON`` entity found in ``character``; if there
    is none, the last token that is capitalised, tagged NNP*/PRP*, or listed
    in ``female_words`` / ``male_words``. Capitalised tokens are returned
    verbatim, all others lower-cased. Returns ``""`` when nothing qualifies.
    """
    character_name = ""

    # TRY 1 - NER: prefer a PERSON entity (the last one wins).
    for entity in nlp(character).ents:
        if entity.label_ == 'PERSON':
            character_name = str(entity)
    if character_name != "":
        return character_name

    # TRY 2 - POS tagging / cue words. Very buggy according to the original
    # author; worth comparing against other taggers (spaCy / pycorenlp).
    for word, tag in nltk.pos_tag(nltk.word_tokenize(character)):
        lowered = word.lower()
        if word[0].isupper():
            character_name = word
        elif ('NNP' in tag or 'PRP' in tag
              or lowered in female_words or lowered in male_words):
            character_name = lowered
    return character_name


def get_frequency_for_movie(movie):
    """Build per-gender lists of lemmatized event words for one movie plot.

    Pipeline: coreference resolution -> OpenIE triples -> per-character event
    chains -> gender assignment -> lemmatization. Gender comes from the cue
    word lists when the extracted name matches one (case-insensitively) and
    from ``classify_gender`` otherwise; subjects that are gender-neutral
    pronouns are skipped.

    Args:
        movie: Plot summary text of a single movie.

    Returns:
        ``{'M': [...], 'F': [...]}`` where each list holds one string of
        space-joined lemmas per character assigned to that gender.

    Note:
        Failures while processing an individual character are printed and
        that character is skipped (best-effort); failures in the
        coreference / OpenIE stages propagate to the caller.
    """
    # Get triplets using OpenIE on the coreference-resolved plot.
    coref_plot = coreference_movie_plot(nlp, movie)
    triplets = get_triplets(coref_plot, client)

    # Get the event chain for each character.
    event_chains = get_event_chain(triplets)
    frequency_list = {'M': [], 'F': []}
    gender_list = {}  # cache: extracted name -> gender label

    for character, event_chain in event_chains.items():
        try:
            character_name = _pick_character_name(character)

            # Don't process subjects that yield no noun / pronoun.
            if character_name == "":
                continue

            # The extracted name may keep its capitalisation ("She", "They"),
            # while the cue lists are lower-case, so compare case-insensitively.
            lookup = character_name.lower()
            gender = None
            if character_name in gender_list:
                gender = gender_list[character_name]
            elif lookup in female_words:
                gender = 'F'
            elif lookup in male_words:
                gender = 'M'
            elif lookup in gender_neutral:
                # Test the extracted name, not the raw subject: the raw
                # subject is often a multi-word phrase or capitalised.
                continue
            if gender is None:
                # classify_gender is noisy on stdout; silence it.
                with io.capture_output():
                    gender = classify_gender(character_name)

            gender_list[character_name] = gender

            # NOTE(review): classify_gender presumably returns 'M' or 'F';
            # any other label has no bucket and is skipped explicitly rather
            # than surfacing as a KeyError in the handler below.
            if gender not in frequency_list:
                continue

            # Get unigrams.
            unigrams = lemmatize_words(" ".join(event_chain))
            if len(unigrams) > 0:
                frequency_list[gender].append(unigrams)
        except Exception as exc:
            # Best-effort per character: report the failure and keep going.
            print(exc)

    return frequency_list
    
def get_frequency_mapping(all_movie_plots, decade):
    """Run ``get_frequency_for_movie`` over every plot, grouped by year.

    The plots of each year are dispatched to a pool of 5 worker processes and
    collected in submission order.

    Args:
        all_movie_plots: Mapping of year -> iterable of plot texts.
        decade: Currently unused. An earlier (disabled) version restricted
            processing to years in ``[decade, decade + 70)``; the parameter
            is kept so existing callers keep working.

    Returns:
        A dict mapping each year key to the list of per-movie results
        returned by ``get_frequency_for_movie``.

    Raises:
        Exception: Whatever a worker raised is re-raised by ``get()``.
    """
    output = {}
    # Use the pool as a context manager so the worker processes are always
    # released, including when a worker raises. All results are fetched
    # before the block exits, so the terminate() on exit drops no work.
    with mp.Pool(5) as pool:
        for year in notebook.tqdm(all_movie_plots):
            pending = [
                pool.apply_async(get_frequency_for_movie, args=(movie,))
                for movie in all_movie_plots[year]
            ]
            output[year] = [p.get() for p in notebook.tqdm(pending, leave=False)]

    return output


def get_adjective_cloud(plots, decade=None):
    """Thin wrapper that forwards to ``get_frequency_mapping``.

    Args:
        plots: Mapping of year -> iterable of plot texts.
        decade: Forwarded unchanged to ``get_frequency_mapping``.

    Returns:
        Whatever ``get_frequency_mapping(plots, decade)`` returns.
    """
    return get_frequency_mapping(plots, decade)

In [None]:
# Load the plot summaries; the cells below index this mapping by year string
# (e.g. movie_dict['2001']). JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale-dependent default.
with open("UK_plot_summaries_by_year.json", encoding="utf-8") as f:
    movie_dict = json.load(f)

In [None]:
# Quick check on a single plot: the entry at index 1 of the '2001' list.
get_frequency_for_movie(movie_dict['2001'][1])

In [None]:
# Full run over every year in movie_dict (decade is left at its default, None).
result = get_adjective_cloud(movie_dict)

In [None]:
# Persist the per-year results. The encoding is stated explicitly so the file
# does not depend on the platform's default text encoding.
with open("UK_event_chain_by_each_year.json", 'w', encoding="utf-8") as f:
    json.dump(result, f)