# Get adjectives (and verbs) associated with male and female characters

In [1]:
import numpy as np
from itertools import combinations, filterfalse
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
import random
import sys
import os
import pickle

import pandas as pd
from pandas.core import indexing
from gender_predictor.GenderClassifier import classify_gender
from collections import defaultdict
from tqdm.notebook import tqdm
import multiprocessing as mp
import json

In [2]:
# Load the movie metadata table (tab-separated, no header row) and name its columns.
movie_data = pd.read_csv(
    'movie.metadata.tsv',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=[
        'id', 'free_id', 'movie_name', 'release_date', 'revenue',
        'runtime', 'languages', 'countries', 'genres',
    ],
)

In [3]:
# Load the character metadata table (tab-separated, no header row) and name its columns.
character = pd.read_csv(
    'character.metadata.tsv',
    sep='\t',
    skip_blank_lines=True,
    header=None,
    names=[
        'id', 'free_id', 'release_date', 'char_name', 'dob', 'gender',
        'height', 'ethnicity', 'name', 'age',
        'free_char_id1', 'free_char_id2', 'free_char_id3',
    ],
)
# Only the movie id, the character name and the gender are used further down.
character = character[['id', 'char_name', 'gender']]

In [4]:
# Release year = first four characters of the release date; None when the date is missing.
movie_data['release_year'] = movie_data['release_date'].apply(
    lambda release: None if str(release) == 'nan' else release[:4]
)

In [5]:
# Index movie ids by country and then by release year, for the two countries of interest.
movie_id_by_year = {'United States of America': {}, 'India': {}}

for _, row in movie_data.iterrows():
    # `countries` is a JSON object; only its values (the country names) matter here.
    for country in json.loads(row['countries']).values():
        if country in ('United States of America', 'India'):
            # Start a new list the first time a (country, year) pair is seen.
            movie_id_by_year[country].setdefault(row['release_year'], []).append(row.id)

In [23]:
# Release years of Indian movies in the 1990s. Years are strings, so the
# comparison is lexicographic; movies without a release year (None) are skipped.
count = [
    year
    for year in movie_id_by_year['India']
    if year is not None and '1990' <= year < '2000'
]

In [24]:
# Flatten the per-year id lists into a single list of ids of Indian movies from the 1990s.
decade_1990 = []
for year in count:
    decade_1990.extend(movie_id_by_year['India'][year])

In [25]:
# Display how many Indian movie ids were collected for the 1990s.
len(decade_1990)

1235

In [26]:
# Gendered common nouns and pronouns that are treated as direct mentions of a
# female / male character (matched against the `dependent` token).
female_words = [
    'she', 'her', 'woman', 'women', 'ladies', 'girls', 'lady',
    'female', 'girl', 'damsel', 'maiden', 'daughter', 'sister', 'mother',
]
male_words = [
    'he', 'his', 'man', 'male', 'men', 'boys', 'gentleman',
    'gentlemen', 'boy', 'bloke', 'brother', 'father',
]


def read_input_file(filename):
    """Load a comma-separated file into a DataFrame.

    Blank lines are skipped and no column of the file is used as the index.
    """
    return pd.read_csv(filename, index_col=False, skip_blank_lines=True, sep=',')


def get_plots_by_movie_id(data_df):
    """Split the token table into one DataFrame per movie.

     Arguments
              data_df : DataFrame with a ``movie_id`` column
     Returns
              List of DataFrames, one per distinct ``movie_id``, ordered by the
              first appearance of each id in ``data_df``. Rows whose
              ``movie_id`` is missing are left out.
    """
    # sort=False keeps the groups in order of first appearance, which is the
    # order unique() produced before. Iterating the groupby directly avoids a
    # get_group() lookup per movie and does not fail on missing ids.
    grouped = data_df.groupby(data_df['movie_id'], sort=False)
    return [plot for _, plot in grouped]


def get_frequency_for_movie(movie):
    """Count the descriptor words linked to male and female mentions in one movie.

     Arguments
              movie : DataFrame of parsed tokens for a single movie. The columns
                      read here are movie_id, sentence_id, token_id, governor,
                      dependent, dep_pos and dep_ner.
     Returns
              None when the movie id is listed in ``decade_1990``; otherwise a
              dict ``{'M': defaultdict(int), 'F': defaultdict(int)}`` mapping
              each descriptor word to the number of times it was linked to a
              mention of that gender.
    """
    import logging  # local import: only used to report rows that had to be skipped

    movie_id = movie.iloc[0]['movie_id']
    if movie_id in decade_1990:
        return None

    frequency_list = {'M': defaultdict(int), 'F': defaultdict(int)}
    # Mentions: proper nouns tagged PERSON, or one of the gendered words.
    name_data = movie[
        ((movie.dep_pos == 'NNP') & (movie.dep_ner == 'PERSON'))
        | (movie.dependent.isin(female_words))
        | (movie.dependent.isin(male_words))
    ]
    char_list = character[character.id == movie_id]
    # Loop invariant: tokens whose tag is one of the adjective / verb tags counted below.
    is_descriptor = movie['dep_pos'].isin(['JJ', 'VB', 'VBP', 'VBZ', 'VBN'])

    gender_list = {}  # per-movie cache: lower-cased mention -> resolved gender
    for _, name in name_data.iterrows():
        try:
            character_name = name['dependent'].lower()
            if character_name in gender_list:
                gender = gender_list[character_name]
            elif character_name in female_words:
                gender = 'F'
            elif character_name in male_words:
                gender = 'M'
            else:
                gender = None
                # First cast entry whose character name contains the mention wins.
                for _, char in char_list.iterrows():
                    if character_name in str(char['char_name']).lower():
                        gender = char['gender']
                        break
                if gender is None:
                    gender = classify_gender(character_name)
            gender_list[character_name] = gender

            if gender not in frequency_list:
                # e.g. the matched cast entry has no gender recorded. Nothing
                # can be counted for this mention, so skip it explicitly
                # instead of relying on a swallowed KeyError.
                # NOTE(review): such mentions never reach classify_gender;
                # confirm that this is intended.
                continue

            governor = int(name['governor'])
            same_sentence = movie['sentence_id'] == name['sentence_id']
            # Descriptors are gathered from three places in the same sentence:
            # the mention's governor, the mention's own dependents, and tokens
            # sharing the mention's governor.
            linked = pd.concat([
                movie[same_sentence & (movie['token_id'] == governor) & is_descriptor],
                movie[same_sentence & (movie['governor'] == name['token_id']) & is_descriptor],
                movie[same_sentence & (movie['governor'] == name['governor']) & is_descriptor],
            ]).drop_duplicates()
            for _, token in linked.iterrows():
                frequency_list[gender][token['dependent']] += 1
        except Exception:
            # Still best-effort (one malformed row must not abort the movie),
            # but the failure is now recorded and visible at DEBUG level.
            logging.getLogger(__name__).debug(
                'skipped a mention in movie %s', movie_id, exc_info=True
            )
    print(movie_id)
    print(frequency_list)
    return frequency_list
    
def get_name_and_adjective_mapping(all_movie_plots, processes=20):
    """Run get_frequency_for_movie over every movie in a pool of worker processes.

     Arguments
              all_movie_plots : iterable of per-movie DataFrames
              processes : number of worker processes (default 20, the value
                          that used to be hard-coded)
     Returns
              List with one entry per movie, in input order, holding whatever
              get_frequency_for_movie returned for that movie.
    """
    # The context manager tears the pool down even when a worker raises. All
    # results are fetched inside the block, before the pool is terminated.
    with mp.Pool(processes) as pool:
        pending = [
            pool.apply_async(get_frequency_for_movie, args=(movie,))
            for movie in all_movie_plots
        ]
        return [job.get() for job in pending]


def get_adjective_cloud(filename):
    """Read the token file, split it per movie and return the per-movie frequency results."""
    return get_name_and_adjective_mapping(
        get_plots_by_movie_id(read_input_file(filename))
    )



In [None]:
# Run the whole pipeline on 'india_lemma.csv'; the %time magic reports how long the statement took.
%time result = get_adjective_cloud('india_lemma.csv')

20553214
{'M': defaultdict(<class 'int'>, {'steal': 2}), 'F': defaultdict(<class 'int'>, {'offer': 1, 'aid': 1, 'make': 1})}
Accuracy: 0.969757
Accuracy: 0.970225
26623942
{'M': defaultdict(<class 'int'>, {}), 'F': defaultdict(<class 'int'>, {'send': 1, 'be': 1, 'old': 1, 'pray': 1, 'appear': 1})}
Accuracy: 0.969653Accuracy: 0.968247

Accuracy: 0.967779Accuracy: 0.968091

Accuracy: 0.969549Accuracy: 0.969132
Accuracy: 0.968664

Accuracy: 0.970694Accuracy: 0.968195

Accuracy: 0.967779
Accuracy: 0.970225
32345990Accuracy: 0.967935
Accuracy: 0.969757Accuracy: 0.969236{'M': defaultdict(<class 'int'>, {'see': 2, 'decide': 1, 'do': 1, 'end': 1, 'approve': 1, 'know': 1}), 'F': defaultdict(<class 'int'>, {'turn': 1, 'rich': 1, 'spoil': 1, 'bratty': 1, 'young': 1, 'conceit': 1, 'attract': 1, 'complicate': 1, 'be': 1, 'see': 1, 'sing': 2, 'true': 1})}Accuracy: 0.968091
Accuracy: 0.968404




Accuracy: 0.970798
Accuracy: 0.966582
23454846
{'M': defaultdict(<class 'int'>, {'be': 10, 'have': 2, 'fo

In [24]:
# Keep a second reference to the pipeline output under the name x1.
x1 = result

In [15]:
## Save frequency
import pickle

# The with-block closes the file even if pickling fails part-way.
with open('india_2000_freq_1.pkl', 'wb') as output:
    pickle.dump(frequency_list, output)

In [18]:
import pickle

# Reload the saved counts; the with-block makes sure the file handle is closed.
# NOTE: pickle.load can execute arbitrary code, so only load files written by this notebook.
with open('india_2000_freq_1.pkl', 'rb') as pkl_file:
    frequency_list = pickle.load(pkl_file)


In [21]:
# Combine frequency from different results
frequency_list = {'M': defaultdict(int), 'F': defaultdict(int)}

for freq in result:
    # Entries that are None carry no counts and are passed over.
    if freq is not None:
        for gender in ('M', 'F'):
            for word, occurrences in freq[gender].items():
                frequency_list[gender][word] += occurrences

# Calculate Odds Ratio

In [22]:
# Odds ratio per word: (male count / female count) divided by
# (remaining male counts / remaining female counts).
odds_ratio = {}
threshold  = 2   # minimum number of occurrences required for each gender
topk       = 50  # number of words shown at each end of the ranking

total_num_f = sum(frequency_list['F'].values())
total_num_m = sum(frequency_list['M'].values())

for key, f_num in frequency_list['F'].items():
    # .get() rather than indexing: indexing the defaultdict would insert a
    # zero entry into the male counts for every female-only word.
    m_num = frequency_list['M'].get(key, 0)
    if f_num < threshold or m_num < threshold:
        # we only consider the words with at least {threshold} occurrences for both genders
        continue
    non_f_num = total_num_f - f_num
    non_m_num = total_num_m - m_num
    if non_f_num == 0 or non_m_num == 0:
        # The word accounts for every count of one gender, so the ratio is undefined.
        continue
    odds_ratio[key] = round((m_num / f_num) / (non_m_num / non_f_num), 2)

In [23]:
from operator import itemgetter

# Show the topk words with the highest odds ratio, then the topk with the lowest.
by_ratio = itemgetter(1)
(
    dict(sorted(odds_ratio.items(), key=by_ratio, reverse=True)[:topk]),
    dict(sorted(odds_ratio.items(), key=by_ratio)[:topk]),
)

({'senior': 5.44,
  'old': 3.05,
  'terrorist': 2.77,
  'ready': 2.77,
  'responsible': 2.56,
  'drunk': 2.55,
  'sure': 2.55,
  'entire': 2.55,
  'good': 2.47,
  'due': 2.39,
  'wrong': 2.34,
  'successful': 2.34,
  'last': 2.3,
  'local': 2.23,
  'unknown': 2.23,
  'major': 2.02,
  'present': 1.7,
  'true': 1.6,
  'small': 1.59,
  'evil': 1.59,
  'happy-go-lucky': 1.59,
  'maternal': 1.59,
  'rival': 1.59,
  'high': 1.59,
  'final': 1.59,
  'suspicious': 1.49,
  'free': 1.49,
  'new': 1.45,
  'real': 1.41,
  'rich': 1.36,
  'muslim': 1.35,
  'much': 1.35,
  'unable': 1.31,
  'second': 1.27,
  'helpless': 1.27,
  'main': 1.27,
  'arrogant': 1.27,
  'anxious': 1.27,
  'hard': 1.27,
  'daily': 1.27,
  'ashamed': 1.27,
  'maya': 1.27,
  'alcoholic': 1.27,
  'past': 1.27,
  'notorious': 1.27,
  'clear': 1.27,
  'afraid': 1.27,
  'dev': 1.27,
  'mock': 1.27,
  'native': 1.27},
 {'beautiful': 0.09,
  'female': 0.11,
  'pregnant': 0.23,
  'biological': 0.27,
  'ill': 0.29,
  'teenage': 0.32,

In [67]:
# Display top_m, the word list later used as X in the WEAT test.
# NOTE(review): top_m is not defined in any cell shown here; presumably it holds the top male-associated words - confirm.
top_m

['fumbles',
 'bowl',
 'common',
 'different',
 'avoid',
 'suffers',
 'acquitted',
 'melodious',
 'breaks',
 'born']

# WEAT Score Calculation

In [None]:
def swAB(W, A, B):
    """Differential association of each row of W with A versus B.

     Arguments
              W, A, B : n x d matrix of word embeddings stored row wise
     Returns
              One value per row of W: its mean cosine similarity to the rows
              of A minus its mean cosine similarity to the rows of B
    """
    # axis=1 averages over the columns, i.e. over the words of A (resp. B).
    mean_sim_to_A = np.mean(cosine_similarity(W, A), axis=1)
    mean_sim_to_B = np.mean(cosine_similarity(W, B), axis=1)
    return mean_sim_to_A - mean_sim_to_B
  
def test_statistic(X, Y, A, B):
    """Test statistic between the association words (X, Y) and the target words (A, B).

     Arguments
              X, Y, A, B : n x d matrix of word embeddings stored row wise
     Returns
              Sum of swAB over the rows of X minus the sum of swAB over the rows of Y
    """
    x_total = sum(swAB(X, A, B))
    y_total = sum(swAB(Y, A, B))
    return x_total - y_total

In [None]:
def weat_effect_size(X, Y, A, B, embd):
    """Computes the effect size for the given list of association and target word pairs
     Arguments
              X, Y : List of association words
              A, B : List of target words
              embd : Dictionary of word-to-embedding for all words
     Returns
              Effect Size: difference between the mean swAB of X and the mean
              swAB of Y, divided by the standard deviation (np.std default,
              i.e. population std) of swAB over the distinct words of X and Y
    """

    def embed(words):
        # Stack the vectors of the words found in embd (looked up lower-cased);
        # words missing from embd are dropped.
        return np.array([embd[w.lower()] for w in words if w.lower() in embd])

    Xmat = embed(X)
    Ymat = embed(Y)
    Amat = embed(A)
    Bmat = embed(B)
    # Every distinct word of X and Y contributes once to the pooled deviation.
    XuYmat = embed(set(X).union(Y))

    d = (np.mean(swAB(Xmat, Amat, Bmat)) - np.mean(swAB(Ymat, Amat, Bmat))) / np.std(swAB(XuYmat, Amat, Bmat))

    return d

In [None]:
def random_permutation(iterable, r=None):
    """Return a tuple of r items drawn without replacement, in random order, from iterable.

    When r is None, every item is used, which gives a full random permutation.
    """
    items = tuple(iterable)
    if r is None:
        r = len(items)
    return tuple(random.sample(items, r))

def weat_p_value(X, Y, A, B, embd, sample = 1000):
    """Computes the one-sided P value for the given list of association and target word pairs
     Arguments
              X, Y : List of association words
              A, B : List of target words
              embd : Dictionary of word-to-embedding for all words
              sample : Number of random permutations used. A falsy value
                       (0 / None) enumerates every combination instead.
     Returns
              Fraction of the permutations whose test statistic is strictly
              greater than the test statistic of the original (X, Y) split
    """

    def embed(words):
        # Stack the vectors of the words found in embd (looked up lower-cased);
        # words missing from embd are dropped.
        return np.array([embd[w.lower()] for w in words if w.lower() in embd])

    size_of_permutation = min(len(X), len(Y))
    X_Y = X + Y

    Amat = embed(A)
    Bmat = embed(B)
    # The observed statistic does not depend on the permutation: compute it once.
    unperturbed = test_statistic(embed(X), embed(Y), Amat, Bmat)

    if not sample:
        permutations = combinations(X_Y, size_of_permutation)
    else:
        permutations = [random_permutation(X_Y, size_of_permutation) for s in range(sample)]

    test_stats_over_permutation = []
    for Xi in permutations:
        # Yi is every pooled word that was not drawn into Xi.
        chosen = set(Xi)
        Yi = [w for w in X_Y if w not in chosen]
        test_stats_over_permutation.append(test_statistic(embed(Xi), embed(Yi), Amat, Bmat))

    is_over = np.array([o > unperturbed for o in test_stats_over_permutation])

    return is_over.sum() / is_over.size

In [None]:
"""List of association and target word pairs for the sample test (top_m, top_f) vs (Pleasant, Unpleasant)"""


# Association word lists; top_m and top_f must already exist in the session.
X = top_m

Y = top_f

# Find better pleasant and unpleasant words

# Pleasant target words.
A = [
    "caress", "freedom", "health", "love", "peace",
    "cheer", "friend", "heaven", "loyal", "pleasure",
    "diamond", "gentle", "honest", "lucky", "rainbow",
    "diploma", "gift", "honor", "miracle", "sunrise",
    "family", "happy", "laughter", "paradise", "vacation",
]

# Unpleasant target words.
B = [
    "abuse", "crash", "filth", "murder", "sickness",
    "accident", "death", "grief", "poison", "stink",
    "assault", "disaster", "hatred", "pollute", "tragedy",
    "divorce", "jail", "poverty", "ugly", "cancer",
    "kill", "rotten", "vomit", "agony", "prison",
]


# Path prefix prepended to the embedding file name (empty: current directory).
resourceFile = ''
glove = KeyedVectors.load_word2vec_format(
    resourceFile + 'gensim_glove.840B.300d.txt.bin',
    binary=True,
)
print('The glove embedding has been loaded!')

"""Compute the effect-size and P value"""
effect_size = weat_effect_size(X, Y, A, B, glove)
print('WEAT d = ', effect_size)
p_value = weat_p_value(X, Y, A, B, glove, 1000)
print('WEAT p = ', p_value)