In [None]:
import json
# Load the event chains from disk. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("India_event_chain_by_each_year.json", encoding="utf-8") as f:
    result = json.load(f)

In [None]:
# Pool the per-year entries into one list of documents per key ('M' and 'F').
frequency_list = {'M': [], 'F': []}

for year, year_entries in result.items():
    # Only purely numeric year keys below 2000 are kept; edit the bound to
    # select a different period.
    in_period = year and year.isdigit() and int(year) < 2000
    if not in_period:
        continue
    for freq in year_entries:
        # Missing (None) entries are skipped.
        if freq is not None:
            # Each entry's items are joined into a single ". "-separated string.
            frequency_list['M'].append(". ".join(freq['M']))
            frequency_list['F'].append(". ".join(freq['F']))

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
import string
# Characters dropped by basic_sanitize: every ASCII punctuation mark.
exclude = set(string.punctuation)


def basic_sanitize(in_string):
    '''Roughly clean a string: drop punctuation, lowercase, collapse whitespace.

    Punctuation characters are deleted (not replaced by a space), the text is
    lowercased, and every run of whitespace becomes a single space with no
    leading or trailing blanks.
    '''
    kept_chars = filter(lambda ch: ch not in exclude, in_string)
    lowered = ''.join(kept_chars).lower()
    tokens = lowered.split()
    return ' '.join(tokens)

def weighted_log_odds_ratio(l1, l2, ngram = 1, prior=.01, cv = None):
    '''
    Rank vocabulary items by the z-score of their prior-smoothed log-odds ratio
    between two text samples.

    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; the n-gram order to count. The default vectorizer is built with
    ngram_range=(ngram, ngram), so 1 means unigrams only and 2 means bigrams only.
    Ignored if a custom CountVectorizer is passed.
    - prior; either a number describing a uniform prior, or a vector describing a prior
    over vocabulary items (one entry per item). If you're using a predefined vocabulary,
    make sure to specify that when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple, sorted by
    ascending z-score. Positive scores mean the term has higher smoothed odds in l1,
    negative scores mean higher smoothed odds in l2.
    Raises:
    - ValueError if a non-uniform prior is given without a CountVectorizer, or if the
    prior vector does not have exactly one entry per vocabulary item.'''
    uniform_prior = np.isscalar(prior)
    if cv is None and not uniform_prior:
        # Raise instead of calling quit(): exiting the interpreter from a helper
        # would take the whole notebook kernel down with it.
        raise ValueError(
            "If using a non-uniform prior: "
            "Please also pass a count vectorizer with the vocabulary parameter set.")
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        cv = CV(decode_error = 'ignore', ngram_range=(ngram, ngram),
                binary = False,
                max_features = 15000)
    # Keep whatever matrix type the vectorizer returns (sparse for sklearn);
    # only the per-sample column sums are needed, so nothing is densified.
    counts_mat = cv.fit_transform(l1 + l2)
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if uniform_prior:
        priors = np.full(vocab_size, float(prior))
    else:
        priors = np.asarray(prior, dtype=np.float64)
        if priors.shape != (vocab_size,):
            raise ValueError(
                "prior must have one entry per vocabulary item "
                "(expected {}, got shape {})".format(vocab_size, priors.shape))
    # Rows [0, len(l1)) belong to the first sample, the rest to the second.
    split = len(l1)
    counts_1 = np.asarray(counts_mat[:split].sum(axis=0), dtype=np.float64).ravel()
    counts_2 = np.asarray(counts_mat[split:].sum(axis=0), dtype=np.float64).ravel()
    a0 = priors.sum()
    n1 = counts_1.sum()
    n2 = counts_2.sum()
    print("Comparing language...")
    smoothed_1 = counts_1 + priors
    smoothed_2 = counts_2 + priors
    # delta: difference of the smoothed log-odds of each term in the two samples.
    delta = (np.log(smoothed_1 / (n1 + a0 - smoothed_1))
             - np.log(smoothed_2 / (n2 + a0 - smoothed_2)))
    # Approximate variance of delta, used to standardise it into a z-score.
    var = 1. / smoothed_1 + 1. / smoothed_2
    z_scores = delta / np.sqrt(var)
    index_to_term = {v:k for k,v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]

In [None]:
# Get unigrams - anchors

In [None]:
# Score every unigram. 'M' is passed as the first sample, so the largest
# z-scores lean towards 'M' and the smallest towards 'F'.
odds_ratio = weighted_log_odds_ratio(frequency_list['M'], frequency_list['F'], ngram=1)

from operator import itemgetter
# Number of terms kept from each end of the ranking.
topk = 250
top_m = dict(sorted(odds_ratio, key=itemgetter(1), reverse=True)[:topk])
top_f = dict(sorted(odds_ratio, key=itemgetter(1))[:topk])

In [None]:
# Terms that are never kept as anchors, regardless of their score.
words = ['the', 'son', 'she', 'her', 'woman', 'women', 'ladies', 'girls', 'lady', 'aunt', 'grandmother', 'female', 'girl', 'damsel', 'maiden', 'daughter', 'sister', 'mother', 'he', 'his', 'man', 'male', 'men', 'boys', 'gentleman', 'uncle', 'grandfather', 'gentlemen', 'boy', 'bloke', 'brother', 'father', 'their', 'they']


def _prune_terms(candidates, blocked):
    """Return the terms of `candidates`, in order, that are not in `blocked`
    and do not contain the substring 'attime'."""
    # NOTE(review): 'attime' is presumably a marker token produced upstream - confirm.
    blocked = frozenset(blocked)
    return [term for term in candidates
            if term not in blocked and 'attime' not in term]


# Iterating the dicts yields their keys (the terms) in ranking order.
pruned_f = _prune_terms(top_f, words)
pruned_m = _prune_terms(top_m, words)

In [None]:
# Inspect the unigram anchors kept from top_m (the highest z-scores).
pruned_m

In [None]:
# Inspect the unigram anchors kept from top_f (the lowest z-scores).
pruned_f

In [None]:
# Get Bigrams

In [None]:
# Score every bigram (ngram=2 counts bigrams only). 'M' is the first sample,
# so the largest z-scores lean towards 'M' and the smallest towards 'F'.
odds_ratio = weighted_log_odds_ratio(frequency_list['M'], frequency_list['F'], ngram=2)

from operator import itemgetter
# Full rankings, kept as insertion-ordered dicts: top_m is descending by
# z-score, top_f ascending.
top_m = dict(sorted(odds_ratio, key=itemgetter(1), reverse=True))
top_f = dict(sorted(odds_ratio, key=itemgetter(1)))

In [None]:
# Male Bigrams
# Walk the bigrams from the highest z-score down and keep the first 250 whose
# two tokens are distinct and are both unigram anchors.
# The anchor vocabulary is loop-invariant, so build it once as a set.
anchor_terms = set(pruned_m[:1000]) | set(pruned_f[:1000])
x = []
count = 250
for word in top_m:
    # Named `tokens` so the stop-word list `words` is not overwritten.
    tokens = word.split(' ')
    if tokens[0] == tokens[1]:
        continue
    if tokens[0] in anchor_terms and tokens[1] in anchor_terms:
        count -= 1
        x.append(word)
    if count == 0:
        break
json.dumps(x)

In [None]:
# Female Bigrams
# Walk the bigrams from the lowest z-score up and keep the first 250 whose
# two tokens are distinct and are both unigram anchors.
# The anchor vocabulary is loop-invariant, so build it once as a set.
anchor_terms = set(pruned_m[:1000]) | set(pruned_f[:1000])
x = []
count = 250
for word in top_f:
    # Named `tokens` so the stop-word list `words` is not overwritten.
    tokens = word.split(' ')
    if tokens[0] == tokens[1]:
        continue
    if tokens[0] in anchor_terms and tokens[1] in anchor_terms:
        count -= 1
        x.append(word)
    if count == 0:
        break
json.dumps(x)