# Get adjectives related to male and female

In [None]:
import numpy as np
from itertools import combinations, filterfalse
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
import random
import sys
import os
import pickle

import pandas as pd
from pandas.core import indexing
from gender_predictor.GenderClassifier import classify_gender
from collections import defaultdict
from tqdm.notebook import tqdm
import multiprocessing as mp
import json

In [None]:
# Load the header-less, tab-separated movie metadata table and name its columns.
movie_data = pd.read_csv(
    'movie.metadata.tsv',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=[
        'id',
        'free_id',
        'movie_name',
        'release_date',
        'revenue',
        'runtime',
        'languages',
        'countries',
        'genres',
    ],
)

In [None]:
# Load the header-less, tab-separated character metadata table and keep only
# the three columns the rest of the notebook reads.
character = pd.read_csv(
    'character.metadata.tsv',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=[
        'id',
        'free_id',
        'release_date',
        'char_name',
        'dob',
        'gender',
        'height',
        'ethnicity',
        'name',
        'age',
        'free_char_id1',
        'free_char_id2',
        'free_char_id3',
    ],
)[['id', 'char_name', 'gender']]

In [None]:
def _release_year(release_date):
    """Return the first four characters of a release date, or None if it is missing."""
    # Missing dates come through as NaN, whose string form is 'nan'.
    if str(release_date) == 'nan':
        return None
    return release_date[:4]


# Derive a string-valued year column from the raw release date.
movie_data['release_year'] = movie_data['release_date'].apply(_release_year)

In [None]:
# Index movie ids as country -> release year -> [ids], restricted to the two
# countries under study. Movies without a release year are filed under None.
movie_id_by_year = {'United States of America': {}, 'India': {}}

for _, row in movie_data.iterrows():
    release_year = row['release_year']
    # `countries` holds a JSON object; only its values (country names) matter.
    for country in json.loads(row['countries']).values():
        if country in ('United States of America', 'India'):
            movie_id_by_year[country].setdefault(release_year, []).append(row.id)

In [None]:
# Release years (as strings) of Indian movies that sort before '2000';
# movies with no known year (None) are left out.
count = [
    year
    for year in movie_id_by_year['India']
    if year is not None and year < '2000'
]

In [None]:
# Flatten the per-year id lists of the selected years into one list.
# NOTE: despite its name, this holds the ids of Indian movies released
# before 2000 (the years collected in `count`).
decade_2000 = [
    movie_id
    for year in count
    for movie_id in movie_id_by_year['India'][year]
]

In [None]:
# Display how many movie ids were selected.
len(decade_2000)

In [None]:
# Tokens that mark a mention as female without needing a name lookup.
female_words = [
    'she', 'her', 'woman', 'women', 'ladies', 'girls', 'lady', 'aunt',
    'grandmother', 'female', 'girl', 'damsel', 'maiden', 'daughter',
    'sister', 'mother',
]
# Tokens that mark a mention as male without needing a name lookup.
male_words = [
    'he', 'his', 'man', 'male', 'men', 'boys', 'gentleman', 'uncle',
    'grandfather', 'gentlemen', 'boy', 'bloke', 'brother', 'father',
]
# Part-of-speech tags accepted when collecting verbs.
verbs = ['VB', 'VBP', 'VBZ', 'VBN']
# Part-of-speech tags accepted when collecting adjectives.
adj = ['JJ']

def read_input_file(filename):
    """Read a comma-separated file into a DataFrame.

    Blank lines are skipped and no column is promoted to the index.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(
        filename,
        sep=',',
        skip_blank_lines=True,
        index_col=False,
    )


def get_plots_by_movie_id(data_df):
    """Split a token table into one DataFrame per movie.

    Args:
        data_df: DataFrame with a ``movie_id`` column.

    Returns:
        A list of sub-frames, one per distinct ``movie_id``, ordered by the
        first appearance of each id in ``data_df``.
    """
    by_movie = data_df.groupby(data_df.movie_id)
    return [
        by_movie.get_group(movie_id)
        for movie_id in data_df.movie_id.unique()
    ]


def _resolve_gender(character_name, char_list):
    """Best-effort gender lookup for one lower-cased mention.

    Resolution order: the gendered word lists, then the first cast entry of
    the movie whose character name contains the mention, then the name-based
    classifier.

    Args:
        character_name: Lower-cased token text of the mention.
        char_list: Rows of ``character`` that belong to the current movie.

    Returns:
        The gender label that was found. A value taken from the cast table is
        returned exactly as stored (so it may be missing); ``None`` is
        returned when the classifier raises. Callers must validate the label.
    """
    if character_name in female_words:
        return 'F'
    if character_name in male_words:
        return 'M'

    # Substring match against the cast list: "john" matches "John Smith".
    for _, char in char_list.iterrows():
        if character_name in str(char['char_name']).lower():
            return char['gender']

    try:
        # presumably returns 'M' or 'F' -- verify against gender_predictor
        return classify_gender(character_name)
    except Exception:
        # The classifier is an external component with unknown failure modes;
        # an unclassifiable name is treated as "gender unknown" so that one
        # odd token cannot abort the whole movie.
        return None


def get_frequency_for_movie(movie, attribute_type):
    """Count verbs or adjectives attached to male and female mentions in one movie.

    A mention is a row tagged ``NNP``/``PERSON`` or whose token is one of the
    gendered words. For every mention, three kinds of rows from the same
    sentence are collected when their tag is in the requested tag list: the
    token that governs the mention, tokens governed by the mention, and
    tokens that share the mention's governor.

    Args:
        movie: DataFrame holding the dependency rows of a single movie. The
            columns read are ``movie_id``, ``sentence_id``, ``token_id``,
            ``governor``, ``dependent``, ``dep_pos`` and ``dep_ner``.
        attribute_type: ``"verb"`` selects the verb tags; any other value
            selects the adjective tags.

    Returns:
        ``None`` when the movie id is not in ``decade_2000``; otherwise a dict
        ``{'M': counts, 'F': counts}`` where each ``counts`` maps a token to
        the number of times it was attached to a mention of that gender.

    Side effects:
        Prints the movie id and the resulting counts.
    """
    movie_id = movie.iloc[0]['movie_id']
    if movie_id not in decade_2000:
        return None

    # The original referenced an undefined name `verb` here, which raised
    # NameError for every "verb" run; the tag list is called `verbs`.
    attribute_list = verbs if attribute_type == "verb" else adj

    frequency_list = {'M': defaultdict(int), 'F': defaultdict(int)}
    is_person_name = (movie.dep_pos == 'NNP') & (movie.dep_ner == 'PERSON')
    name_data = movie[
        is_person_name
        | movie.dependent.isin(female_words)
        | movie.dependent.isin(male_words)
    ]
    char_list = character[character.id == movie_id]

    # Cache per mention text so each name is resolved only once per movie.
    gender_list = {}
    for _, name in name_data.iterrows():
        mention = name['dependent']
        if not isinstance(mention, str):
            # Missing token text (e.g. NaN) cannot be matched to anything.
            continue
        character_name = mention.lower()

        if character_name not in gender_list:
            gender_list[character_name] = _resolve_gender(character_name, char_list)
        gender = gender_list[character_name]
        if gender not in frequency_list:
            # Unknown or missing gender label: nothing to count against.
            continue

        try:
            governor = int(name['governor'])
        except (TypeError, ValueError):
            # Missing or non-numeric governor id.
            continue

        # Restrict once to rows of the mention's sentence carrying a wanted tag.
        in_sentence = movie[movie['sentence_id'] == name['sentence_id']]
        candidates = in_sentence[in_sentence['dep_pos'].isin(attribute_list)]

        governor_df = candidates[candidates['token_id'] == governor]
        dependents_df = candidates[candidates['governor'] == name['token_id']]
        siblings_df = candidates[candidates['governor'] == name['governor']]

        attached = pd.concat([governor_df, dependents_df, siblings_df]).drop_duplicates()
        for _, row in attached.iterrows():
            frequency_list[gender][row['dependent']] += 1

    print(movie_id)
    print(frequency_list)
    return frequency_list
    
def get_name_and_adjective_mapping(all_movie_plots, attribute_type, processes=20):
    """Run ``get_frequency_for_movie`` over all movies in a process pool.

    Args:
        all_movie_plots: Iterable of per-movie DataFrames.
        attribute_type: Forwarded to ``get_frequency_for_movie`` for every
            movie (the original call omitted it, so every task failed with a
            missing-argument TypeError).
        processes: Number of worker processes; defaults to the previously
            hard-coded 20.

    Returns:
        A list with one entry per movie, in input order; each entry is
        whatever ``get_frequency_for_movie`` returned for that movie.
    """
    tasks = [(movie, attribute_type) for movie in all_movie_plots]
    # starmap blocks until every task has finished, so the results are
    # complete before leaving the ``with`` block shuts the pool down.
    with mp.Pool(processes) as pool:
        return pool.starmap(get_frequency_for_movie, tasks)


def get_adjective_cloud(filename, attribute_type="verb"):
    """Read a token CSV and compute per-movie attribute counts.

    Args:
        filename: Path of the CSV file to read.
        attribute_type: Passed through to the per-movie counting step;
            defaults to ``"verb"``.

    Returns:
        The list produced by ``get_name_and_adjective_mapping``.
    """
    plots_per_movie = get_plots_by_movie_id(read_input_file(filename))
    return get_name_and_adjective_mapping(plots_per_movie, attribute_type)



In [None]:
# Run the whole pipeline on the Indian plot file, counting adjectives;
# the %time magic reports how long the call takes.
%time result = get_adjective_cloud('india_lemma.csv', "adj")

In [None]:
## Save frequency
import pickle

# `frequency_list` must already exist (it is built by the "Combine frequency"
# cell) before this cell is run.
# NOTE(review): the file name says "verb" while the run above requests "adj";
# confirm the name matches the data being saved.
# The context manager closes the file even if pickling fails part-way.
with open('india_before_2000_verb.pkl', 'wb') as output:
    pickle.dump(frequency_list, output)

In [None]:
# Load Frequency from file
import pickle

# SECURITY: unpickling can execute arbitrary code; only load files that were
# produced by this notebook.
# The context manager closes the handle, which was previously left open.
with open('india_before_2000_verb.pkl', 'rb') as pkl_file:
    frequency_list = pickle.load(pkl_file)


In [None]:
# Merge the per-movie counts into one corpus-wide table per gender.
frequency_list = {'M': defaultdict(int), 'F': defaultdict(int)}

for movie_freq in result:
    # Movies outside the selected period produce None and contribute nothing.
    if movie_freq is None:
        continue
    for gender in ('M', 'F'):
        merged = frequency_list[gender]
        for word, occurrences in movie_freq[gender].items():
            merged[word] += occurrences

# Calculate Odds Ratio

In [None]:
# Odds ratio of each word being attached to a male rather than a female
# mention: (m / non_m) / (f / non_f), written below as (m / f) / (non_m / non_f).
# Values above 1 lean male, values below 1 lean female.
odds_ratio = {}
threshold  = 2   # minimum occurrences required for BOTH genders
topk       = 50  # number of words kept per gender by the ranking step

total_num_f = sum(frequency_list['F'].values())
total_num_m = sum(frequency_list['M'].values())

for key, f_num in frequency_list['F'].items():
    # .get() avoids inserting zero-count entries into the male defaultdict.
    m_num = frequency_list['M'].get(key, 0)
    if f_num < threshold or m_num < threshold:
        # Only words seen at least `threshold` times for both genders are scored.
        continue
    non_f_num = total_num_f - f_num
    non_m_num = total_num_m - m_num
    if non_f_num == 0 or non_m_num == 0:
        # The word accounts for every occurrence of one gender, so the ratio
        # is undefined (it would divide by zero); skip it.
        continue
    odds_ratio[key] = round((m_num / f_num) / (non_m_num / non_f_num), 2)

In [None]:
from operator import itemgetter

# Rank the scored words in both directions and keep the `topk` extremes:
# the largest ratios for top_m, the smallest for top_f.
ranked_high_to_low = sorted(odds_ratio.items(), key=itemgetter(1), reverse=True)
ranked_low_to_high = sorted(odds_ratio.items(), key=itemgetter(1))

top_m = dict(ranked_high_to_low[:topk])
top_f = dict(ranked_low_to_high[:topk])

In [None]:
# Display the words with the highest male-to-female odds ratios.
top_m

In [None]:
# Display the words with the lowest male-to-female odds ratios.
top_f

### Code to calculate Cosine Similarity

In [None]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load pre-trained word vectors stored in binary word2vec format.
# NOTE(review): the variable is called `glove`, but the file name points to
# the GoogleNews word2vec vectors.
glove = KeyedVectors.load_word2vec_format(
    'embeddings/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
import numpy as np

# Anchor words that define the female (words1) and male (words2) directions.
words1 = ["she", "her", "woman", "women"]
words2 = ["he", "him", "his", "man", "men"]


def _mean_similarity(word, anchors):
    """Return the mean cosine similarity between `word` and the anchor words.

    Anchors missing from the embedding vocabulary are ignored. Returns None
    when `word` itself is missing or no anchor is available.
    """
    if word not in glove.key_to_index:
        return None
    scores = [
        cosine_similarity([glove[word]], [glove[anchor]])[0][0]
        for anchor in anchors
        if anchor in glove.key_to_index
    ]
    return np.mean(scores) if scores else None


candidate_words = list(top_m.keys()) + list(top_f.keys())

# words_list, sim_m and sim_f are filled in lockstep so that index i refers to
# the same word in all three. Previously words that were skipped stayed in
# words_list, which shifted every later label in the scatter plot.
words_list = []
sim_m = []
sim_f = []
for word in candidate_words:
    mean_m = _mean_similarity(word, words2)
    # The female score was previously computed against the male anchors
    # (words2) as well, so both lists always held identical numbers.
    mean_f = _mean_similarity(word, words1)
    if mean_m is None or mean_f is None:
        continue
    # Keep the existing "positive similarity only" filter, applied to both
    # scores so the lists stay aligned.
    if not (mean_m > 0 and mean_f > 0):
        continue
    words_list.append(word)
    sim_m.append(mean_m)
    sim_f.append(mean_f)

print(sim_f)

print(sim_m)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
 
# Plot the first ten words: male similarity on x, female similarity on y.
x_coordinates = sim_m[:10]
y_coordinates = sim_f[:10]

plt.figure(figsize=(8, 8), dpi=300)
sns.set(style='whitegrid')
sns.despine()
plt.scatter(x_coordinates, y_coordinates)

# Label every point with the word at the same position in words_list.
for label, x_value, y_value in zip(words_list, x_coordinates, y_coordinates):
    plt.annotate(label, (x_value, y_value))

# Axis labels and title.
plt.xlabel('Male')
plt.ylabel('Female')
plt.title('Cosine Similarities')

# Render the figure.
plt.show()