# Get adjectives related to male and female

In [19]:
import numpy as np
from itertools import combinations, filterfalse
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
import random
import sys
import os
import pickle

import pandas as pd
from pandas.core import indexing
from gender_predictor.GenderClassifier import classify_gender
from collections import defaultdict
from tqdm.notebook import tqdm
import multiprocessing as mp

In [53]:
character = pd.read_csv('character.metadata.tsv', sep='\t', skip_blank_lines=True, header=None, names=['id', 'free_id', 'release_date','char_name', 'dob', 'gender', 'height', 'ethnicity', 'name', 'age', 'free_char_id1', 'free_char_id2', 'free_char_id3'])
character = character[['id', 'char_name', 'gender']]


female_words = ['she', 'her', 'woman', 'women', 'ladies', 'girls', 'lady', 'female', 'girl', 'damsel', 'maiden', 'daughter', 'sister', 'mother']
male_words = ['he', 'his', 'man', 'male', 'men', 'boys', 'gentleman', 'gentlemen', 'boy', 'bloke', 'brother', 'father']

count = 0
def read_input_file(filename):
    data_df = pd.read_csv(filename,sep=',', skip_blank_lines=True, index_col= False)
    return data_df


def get_plots_by_movie_id(data_df):
    movie_ids = data_df.movie_id.unique() 
    grouped = data_df.groupby(data_df.movie_id)

    all_movie_plots = []
    for id in movie_ids:
        sents_df = grouped.get_group(id)
        all_movie_plots.append(sents_df)
    return all_movie_plots[0:1000]


def get_frequency_for_movie(movie):
    frequency_list = {'M':defaultdict(int), 'F':defaultdict(int)}
    name_data  = movie[((movie.dep_pos == 'NNP') & (movie.dep_ner == 'PERSON')) | (movie.dependent.isin(female_words)) | (movie.dependent.isin(male_words))]
    char_list  = character[character.id==movie.iloc[0]['movie_id']]
    for idx,name in name_data.iterrows():            
        try:
            character_name = name['dependent'].lower()
            gender = None
            if character_name in female_words:
                gender = 'F'
            elif character_name in male_words:
                gender = 'M'
            else:
                for ix, char in char_list.iterrows():
                    chk = str(char['char_name'])
                    if character_name in chk.lower():
                        gender = char['gender']
                        break
                    
                if gender is None:
                    gender = classify_gender(character_name)
        
            governor = int(name['governor'])
            governor_df = movie[(movie['sentence_id']==name['sentence_id']) & (movie['token_id'] == governor) & (movie['dep_pos']=='JJ')]
            df2 = movie[(movie['sentence_id']==name['sentence_id']) & (movie['governor'] == name['token_id'])  & (movie['dep_pos']=='JJ')]
            df3 = movie[(movie['sentence_id'] == name['sentence_id']) & (movie['governor']==name['governor']) & (movie['dep_pos']=='JJ')]
            y = pd.concat([governor_df, df2, df3]).drop_duplicates()
            for i, x in y.iterrows():
                frequency_list[gender][x['dependent']] +=1
        except Exception as exc:
            pass
    print(movie.iloc[0]['movie_id'])
    print(frequency_list)
    return frequency_list
    
def get_name_and_adjective_mapping(all_movie_plots):
    frequency_list = {'M':defaultdict(int), 'F':defaultdict(int)}
    pool = mp.Pool(10)
    results = [pool.apply_async(get_frequency_for_movie, args=(movie,)) for movie in all_movie_plots]    
    output = [p.get() for p in results]
    return output


def get_adjective_cloud(filename):
    movie_data_df = read_input_file(filename)
    all_movie_plots = get_plots_by_movie_id(movie_data_df)
    name_adj_cluster_list = get_name_and_adjective_mapping(all_movie_plots)
    return name_adj_cluster_list



In [None]:
%time result = get_adjective_cloud('usa.csv')

Accuracy: 0.971110
Accuracy: 0.970850
Accuracy: 0.969653Accuracy: 0.969601

Accuracy: 0.970538
Accuracy: 0.968091Accuracy: 0.968456

975900Accuracy: 0.970069
{'M': defaultdict(<class 'int'>, {'many': 1, 'fierce': 1}), 'F': defaultdict(<class 'int'>, {})}

Accuracy: 0.970434
Accuracy: 0.968664
4951456
{'M': defaultdict(<class 'int'>, {}), 'F': defaultdict(<class 'int'>, {'American': 1})}
Accuracy: 0.969757
25960460
{'M': defaultdict(<class 'int'>, {}), 'F': defaultdict(<class 'int'>, {'previous': 1})}
Accuracy: 0.968195
Accuracy: 0.967779
Accuracy: 0.969965
Accuracy: 0.970590
Accuracy: 0.967779
Accuracy: 0.968508
156558
{'M': defaultdict(<class 'int'>, {'unemployed': 1, 'subject': 1, 'unsuccessful': 2, 'mature': 1, 'own': 1, 'pregnant': 1, 'third': 1, 'old': 1}), 'F': defaultdict(<class 'int'>, {'other': 1, 'much': 1, 'pregnant': 1, 'second': 1})}
Accuracy: 0.968247
Accuracy: 0.969913
Accuracy: 0.970954
Accuracy: 0.970590
Accuracy: 0.970121
Accuracy: 0.970382
Accuracy: 0.970173
Accuracy

In [39]:
len(result)

100

In [41]:
## Save frequency
import pickle
output = open('usa_freq_1.pkl', 'wb')
pickle.dump(frequency_list, output)
output.close()

In [11]:
pkl_file = open('india_freq_1.pkl', 'rb')

frequency_list = pickle.load(pkl_file)


In [40]:
# Combine frequency from different results
frequency_list = {'M':defaultdict(int),'F':defaultdict(int)}

for freq in result:
    for k, v in freq['M'].items():
        frequency_list['M'][k]+=v
    for k, v in freq['F'].items():
        frequency_list['F'][k]+=v  

# Calculate Odds Ratio

In [16]:
## Odds Ratio

events = set()
m_total = 0
f_total = 0

for key, value in frequency_list['M'].items():
    events.add(key)
    m_total += value
    
for key, value in frequency_list['F'].items():
    events.add(key)
    f_total += value
    
odds_ratio = {}
for event in events:
    try:
        odds_ratio[event] = (frequency_list['M'][event] * (f_total - frequency_list['F'][event]))/(frequency_list['F'][event] * (m_total - frequency_list['M'][event]))
    except:
        odds_ratio[event] = 0
        pass
    
sorted_odd_ratio = {k: v for k, v in sorted(odds_ratio.items(), key=lambda item: item[1])}

top_m = list(sorted_odd_ratio.keys())[:100]
top_f = list(sorted_odd_ratio.keys())[-100:]

In [18]:
top_f

['pitched',
 'fatherlike',
 'Vikrant',
 'devil-worshipping',
 'merry',
 'Mythili',
 'fabulous',
 'scenic',
 'astounded',
 'senile',
 'brain-dead',
 'captive',
 'unorthodox',
 'fishy',
 'immediate',
 'territorial',
 'truthful',
 'generous',
 'dejected',
 'homeless',
 'Ankush',
 'quick',
 'ardent',
 'decent',
 'deceased',
 'unofficial',
 'powerful',
 'false',
 'Nirmal',
 'unsuspecting',
 'worth',
 'loud',
 'slow',
 'pitch-dark',
 'grim',
 'oppressed',
 'graduate',
 'precious',
 'Aliyah',
 'impressive',
 'helter-skelter',
 'wide-eyed',
 'undying',
 'spellbound',
 'non-linear',
 'Hilarious',
 'heart-to-heart',
 'hallucinating',
 'leftist',
 'frantic',
 'Muni',
 'front',
 'depressed',
 'vast',
 'Devanathan',
 'anti-national',
 'Good',
 'dignified',
 'Eashwar',
 'imaginary',
 'voice-over',
 'secluded',
 'dissolute',
 'extra',
 'subsequent',
 'scared',
 'grown-up',
 'unnamed',
 'vicious',
 'startling',
 'Shankaran',
 'left',
 'total',
 'Devdas',
 'good-hearted',
 'unavailable',
 'Harshwardhan

# Weat Score Calculation

In [None]:
def swAB(W, A, B):
    """Calculates differential cosine-similarity between word vectors in W, A and W, B
     Arguments
              W, A, B : n x d matrix of word embeddings stored row wise
    """
    WA = cosine_similarity(W,A)
    WB = cosine_similarity(W,B)

    #Take mean along columns
    WAmean = np.mean(WA, axis = 1)
    WBmean = np.mean(WB, axis = 1)

    return (WAmean - WBmean)
  
def test_statistic(X, Y, A, B):
    """Calculates test-statistic between the pair of association words and target words
     Arguments
              X, Y, A, B : n x d matrix of word embeddings stored row wise
     Returns
              Test Statistic
    """
    return (sum(swAB(X, A, B)) - sum(swAB(Y, A, B)))

In [None]:
def weat_effect_size(X, Y, A, B, embd):
    """Computes the effect size for the given list of association and target word pairs
     Arguments
              X, Y : List of association words
              A, B : List of target words
              embd : Dictonary of word-to-embedding for all words
     Returns
              Effect Size
    """

    Xmat = np.array([embd[w.lower()] for w in X if w.lower() in embd])
    Ymat = np.array([embd[w.lower()] for w in Y if w.lower() in embd])
    Amat = np.array([embd[w.lower()] for w in A if w.lower() in embd])
    Bmat = np.array([embd[w.lower()] for w in B if w.lower() in embd])

    XuY = list(set(X).union(Y))
    XuYmat = []
    for w in XuY:
    if w.lower() in embd:
        XuYmat.append(embd[w.lower()])
    XuYmat = np.array(XuYmat)


    d = (np.mean(swAB(Xmat,Amat,Bmat)) - np.mean(swAB(Ymat,Amat,Bmat)))/np.std(swAB(XuYmat, Amat, Bmat))

    return d

In [None]:
def random_permutation(iterable, r=None):
    """Returns a random permutation for any iterable object"""
    pool = tuple(iterable)
    r = len(pool) if r is None else r
    return tuple(random.sample(pool, r))

def weat_p_value(X, Y, A, B, embd, sample = 1000):
    """Computes the one-sided P value for the given list of association and target word pairs
     Arguments
              X, Y : List of association words
              A, B : List of target words
              embd : Dictonary of word-to-embedding for all words
              sample : Number of random permutations used.
     Returns
    """
    size_of_permutation = min(len(X), len(Y))
    X_Y = X + Y
    test_stats_over_permutation = []

    Xmat = np.array([embd[w.lower()] for w in X if w.lower() in embd])
    Ymat = np.array([embd[w.lower()] for w in Y if w.lower() in embd])
    Amat = np.array([embd[w.lower()] for w in A if w.lower() in embd])
    Bmat = np.array([embd[w.lower()] for w in B if w.lower() in embd])

    if not sample:
        permutations = combinations(X_Y, size_of_permutation)
    else:
        permutations = [random_permutation(X_Y, size_of_permutation) for s in range(sample)]

    for Xi in permutations:
    Yi = filterfalse(lambda w:w in Xi, X_Y)
    Ximat = np.array([embd[w.lower()] for w in Xi if w.lower() in embd])
    Yimat = np.array([embd[w.lower()] for w in Yi if w.lower() in embd])
    test_stats_over_permutation.append(test_statistic(Ximat, Yimat, Amat, Bmat))

    unperturbed = test_statistic(Xmat, Ymat, Amat, Bmat)
  
  is_over = np.array([o > unperturbed for o in test_stats_over_permutation])
  
  return is_over.sum() / is_over.size

In [None]:
"""List of association and target word pairs for the sample test (top_m, top_f) vs (Pleasant, Unpleasant)"""


X = top_m

Y = top_f

# Find better pleasant and unpleasant words

A = ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal", "pleasure", "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle", "sunrise", "family",
    "happy", "laughter", "paradise", "vacation"] 

B = ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison", "stink", "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly", "cancer", "kill", "rotten",
    "vomit", "agony", "prison"] 


resourceFile = ''
glove = KeyedVectors.load_word2vec_format(resourceFile + 'gensim_glove.840B.300d.txt.bin', binary=True)
print('The glove embedding has been loaded!')

"""Compute the effect-size and P value"""
print('WEAT d = ', weat_effect_size(X, Y, A, B, glove))
print('WEAT p = ', weat_p_value(X, Y, A, B, glove, 1000))