In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import itertools
import emoji

In [2]:
EMOJI_SET = set(emoji.UNICODE_EMOJI.keys())

In [3]:
%matplotlib notebook

In [4]:
F = pd.read_csv('results/female.csv')
M = pd.read_csv('results/male.csv')
print(len(F)/(len(F) + len(M)))

0.7449633901688092


In [5]:
# Fix gender distributions in data. This is silly; batching might work better.
s = min(len(F), len(M))

F = F[:s]
M = M[:s]

In [57]:
def tokenize(s):
    tokens = []
        
    s = s.strip().lower()
    
    s = s.replace('she/her', '')
    s = s.replace('he/him', '')
    
    for char in ',.:();|':
        s = s.replace(char, '')
        
    for char in '/\n':
        s = s.replace(char, ' ')


    
    ignore_words = {'a', 'and', 'it', }
    
    tokens += list(filter(
        lambda t: t != '' and t not in ignore_words,
        map(str.strip, s.split(' '))
    ))
    
    
    
    return tokens

def populate(token_freq_dict, df):
    for desc in df.description:
        for token in tokenize(desc):
            token_freq_dict[token] = (token_freq_dict.get(token) or 0) + 1
                
                
    return token_freq_dict

In [58]:
token_freqs_F = populate({}, F)
token_freqs_M = populate({}, M)

In [59]:
def delete(d, k):
    try:
        del d[k]
    except KeyError:
        pass

# Only keep words with more then 1000 total occurences to avoid skewing
for token in set([*token_freqs_F.keys(), *token_freqs_M.keys()]):
    total = (token_freqs_F.get(token) or 0) + (token_freqs_M.get(token) or 0)
    if total < 100:
        delete(token_freqs_F, token)
        delete(token_freqs_M, token)
        #tf_F_small[token] = token_freqs_F.get(token) or 0
        #tf_M_small[token] = token_freqs_M.get(token) or 0

In [60]:
print('min F', min(token_freqs_F.values()))
print('len F', len(token_freqs_F))
print('min M', min(token_freqs_M.values()))
print('len M', len(token_freqs_M))


tokens_in_F = sum(token_freqs_F.values())
tokens_in_M = sum(token_freqs_M.values())

print(tokens_in_F)
print(tokens_in_M)

min F 1
len F 1991
min M 3
len M 1992
535021
658181


In [61]:
def P_female_given(tokens):
    P_H = 0.5
    
    for token in tokens:
        if token_freqs_F.get(token) is None or token_freqs_M.get(token) is None:
            continue
        
        # P(H)P(E|H) / P(H)P(E|H) + P(-H)P(E|-H)
        P_E_GIVEN_H      = token_freqs_F[token] / tokens_in_F
        P_E_GIVEN_NULL_H = token_freqs_M[token] / tokens_in_M

        P_NULL_H = 1 - P_H
        P_H = (P_H*P_E_GIVEN_H) / (P_H*P_E_GIVEN_H + P_NULL_H*P_E_GIVEN_NULL_H)
    
    return P_H

In [62]:
R = {
    'correct': 0,
    'correct_w': 0, # weighted correct
    'no_tok': 0,
    'took': 0,
}

def do_it(iterator, gender, R):
    for bio in iterator:
        start = time.monotonic()
        tokens = tokenize(bio)
        P_female = P_female_given(tokens)
        R['took'] += time.monotonic() - start

        R['no_tok'] += P_female == 0.5
        
        R['correct'] += (P_female > 0.5) if gender == 'F' else (P_female <= 0.5)
        R['correct_w'] += P_female if gender == 'F' else (1-P_female)
        
        # R['conf'].append( P_female if gender == 'F' else (1 - P_female) )
    
    return R

do_it(M.description, 'M', R)
do_it(F.description, 'F', R)
R['accuracy'] = R['correct'] / (len(M) + len(F))
R

{'correct': 105880,
 'correct_w': 101571.91223332938,
 'no_tok': 9218,
 'took': 1.815723261614039,
 'accuracy': 0.704626523984454}

In [56]:
# Show most confused

import random


f = random.sample(list(zip(M.username, F.description)), 100)
m = random.sample(list(zip(M.username, M.description)), 100)

shuff = [(*a,'F') for a in f] + [(*a,'M') for a in m]
random.shuffle(shuff)

errors = []

for name, bio, gender in shuff:
    t = tokenize(bio)
    p = P_female_given(t)
    error = (gender == 'F') - p
    errors.append((name, t, gender, p, error))
    

i = 0
for username, t, gender, p, error in errors: # sorted(errors, key=lambda x: x[-1]):
    print(' '.join(t))
    print(f'correct={gender} guess={p} err={error} username={username}')
    print('-'*30)
    i += 1
    if i > 20:
        break

        


draws100% animates0% looπδ ggs
correct=M guess=0.5 err=-0.5 username=itsuizzu
------------------------------
opposite opposite ela dela -
correct=F guess=0.9996000800346002 err=0.00039991996539978647 username=InkedAmby
------------------------------
️ • ☼︎ aˢᶜ
correct=F guess=0.7002646373983767 err=0.29973536260162326 username=TsarAlek
------------------------------
mostly blm music stuff • acab • • lowkey trying to be an oasis fan account also among us professional player
correct=M guess=0.14471099591901637 err=-0.14471099591901637 username=r0llwith1t
------------------------------
stray kids! stz - 96' liner to much love for those babies! fan account of @stray_kids portugues english
correct=F guess=0.9970137157918624 err=0.002986284208137624 username=lecasp_
------------------------------
youtuber does edits occasionally casual creative player dying to have hotwire back main @winterdutchie
correct=F guess=0.007776997201797848 err=0.9922230027982022 username=almondsnjoy
--------------

In [None]:
t = tokenize("Ace💜She/Her/ Animator for Bento Box and a Character Designer that loves the weather and cute monsters ☀️ 📧: achrosny@gmail.com ☀️ Thoughts/Opinions are my own")

In [None]:
t.lower().replace('she/her', '')

In [None]:
t[20]

In [None]:
P_female_given(['monsters'])

In [None]:
P_female_given([''])

In [None]:
t[::-1]

In [24]:
s = 'well-rounded geek freelance manga adapter/rewriter caretaker of 2 siberian cats current fannish love 镇魂 | guardian (better reflected elsewhere )'
t = tokenize(s)
t

['well-rounded',
 'geek',
 'freelance',
 'manga',
 'adapter/rewriter',
 'caretaker',
 'of',
 '2',
 'siberian',
 'cats',
 'current',
 'fannish',
 'love',
 '镇魂',
 '|',
 'guardian',
 '(better',
 'reflected',
 'elsewhere',
 ')']

In [64]:
P_female_given('football')

0.6975678459068225