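# Description

This notebook compares the words that men and women use most often in the essay text of the profiles in `data/profiles.csv`.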

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize 
from collections import Counter
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the raw profile table; the path is relative to the working directory.
profiles = pd.read_csv(filepath_or_buffer='data/profiles.csv')

In [None]:
# Keep only the first 20,000 rows, then list the columns that are available.
profiles = profiles.iloc[:20000]
profiles.columns

In [None]:
# Names of the ten free-text essay columns: 'essay0' through 'essay9'.
essay_cols = ['essay' + str(i) for i in range(10)]

def concat(row, cols):
    """Join the values of ``cols`` in ``row`` into one newline-separated string.

    Missing values (NaN / None) are skipped. Converting them with ``str``
    would insert the literal text "nan" into the result, which later shows up
    as a spurious word.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. a DataFrame row.
    cols : iterable of str
        Names of the columns to join, in order.

    Returns
    -------
    str
        The non-missing values converted with ``str`` and joined with a
        newline; an empty string if every value is missing.
    """
    parts = []
    for c in cols:
        value = row[c]
        if pd.isna(value):
            # Nothing was written for this column; leave it out entirely.
            continue
        parts.append(str(value))
    return '\n'.join(parts)

# Merge the essay columns of every row into a single 'text' column.
profiles['text'] = profiles.apply(lambda row: concat(row, essay_cols), axis=1)

# Keep the listed attribute columns plus the merged text; the individual
# essay columns are not selected.
profiles = profiles.loc[:, [
    'age', 'body_type', 'diet', 'drinks', 'drugs', 'education',
    'ethnicity', 'height', 'income', 'job', 'last_online',
    'location', 'offspring', 'orientation', 'pets', 'religion',
    'sex', 'sign', 'smokes', 'speaks', 'status', 'text',
]]

profiles.head()

In [None]:
# Look at the merged essay text of the row labelled 5.
profiles['text'][5]

In [None]:
# First look: split each profile's text on whitespace only.
tmp = profiles['text'].map(str.split)
tmp.head()

In [None]:
def clean(text):
    """Strip leftover markup from ``text`` and return its lowercase word tokens.

    Parameters
    ----------
    text : str
        Newline-separated essay text, possibly containing HTML fragments.

    Returns
    -------
    list of str
        Every run of word characters (``\\w+``) found in the cleaned,
        lower-cased text.
    """
    bad_words = ["<br />", "href", '<a class="ilink" href="',
                 '</a>', '">', '/interests?i=', 'class="ilink"', 'ilink',
                 '.com', 'http', 'class=']

    # A line that is exactly "nan" is what str() produces for a missing value.
    # Dropping whole lines also covers a placeholder on the first line and
    # leaves real words that merely start with "nan" untouched.
    kept_lines = [line for line in text.split('\n') if line != 'nan']
    t = '\n'.join(kept_lines)

    for b in bad_words:
        # Replace with a space rather than nothing, so the words on either
        # side of a tag are not fused into a single token.
        t = t.replace(b, ' ')

    t = t.lower()
    # Raw string: '\w' is not a valid escape sequence in a normal literal.
    return regexp_tokenize(t, r'\w+')

# Tokenise every profile's text and store the token lists alongside it.
profiles['tokens'] = profiles['text'].map(clean)
profiles['tokens'].head()

In [None]:
# Split the profiles by the value of the 'sex' column.
men = profiles.loc[profiles['sex'].eq('m')]
women = profiles.loc[profiles['sex'].eq('f')]

In [None]:
# Quick check of the token lists in the men's subset.
men['tokens'].head()

In [None]:
# Personal pronouns to keep in the analysis: they are taken out of the
# stopword set below.
keep_words = [
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'they', 'them', 'their', 'theirs', 'themselves',
]

# Start from NLTK's English stopword list and remove the pronouns from it.
# Like discard, difference_update ignores entries that are not in the set
# (remove would raise KeyError instead).
sw = set(stopwords.words('english'))
sw.difference_update(keep_words)

print(sw)

In [None]:
def flatten(series):
    """Return one flat list holding the items of every iterable in ``series``.

    Only one level of nesting is removed, and the original order is kept.
    """
    return [item for group in series for item in group]

# Count every token written by men, leaving out the stopwords in `sw`.
tmp = filter(lambda token: token not in sw, flatten(men['tokens']))
mens_words = Counter(tmp)

mens_words.most_common(10)

In [None]:
# Count every token written by women, leaving out the stopwords in `sw`.
tmp = filter(lambda token: token not in sw, flatten(women['tokens']))
womens_words = Counter(tmp)

womens_words.most_common(10)

In [None]:
# Combine the two Counters into one table indexed by word, with one column
# per group.
tmp = {'women': womens_words,
       'men': mens_words
      }

# A word that appears in only one Counter comes through as NaN in the other
# column, which would make its total NaN as well. Treat "never used" as a
# count of 0 so such words keep their real total.
popular_words = pd.DataFrame(tmp).fillna(0)
popular_words['count'] = popular_words.men + popular_words.women

# Most frequent words first (reassigned instead of sorting in place).
popular_words = popular_words.sort_values(by='count', ascending=False)
popular_words.head()

In [None]:
# Histogram of the total count per word, using 100 bins.
popular_words['count'].hist(bins=100)

In [None]:
# Restrict the table to the 300 words with the highest total count.
popular_words = popular_words.sort_values('count', ascending=False).iloc[:300]

# Print the shape, then redraw the histogram for the reduced table.
print(popular_words.shape)
popular_words['count'].hist(bins=100)

In [None]:
def times_diff(row):
    """Return how many times larger the bigger of ``row.men`` / ``row.women`` is.

    Parameters
    ----------
    row : object with ``men`` and ``women`` attributes
        Typically one DataFrame row holding each group's value for a word.

    Returns
    -------
    float
        ``larger / smaller``. If the smaller value is zero the result is
        ``inf`` when the larger value is positive and ``nan`` when both are
        zero, instead of a division-by-zero error or warning.
    """
    if row.men > row.women:
        larger, smaller = row.men, row.women
    else:
        larger, smaller = row.women, row.men

    if smaller == 0:
        # One group never uses the word: the ratio is unbounded (or undefined
        # when neither group uses it).
        return float('inf') if larger > 0 else float('nan')
    return larger / smaller
    

# Turn each group's raw counts into a percentage of that group's column total.
popular_words['men'] = popular_words['men'].div(popular_words['men'].sum()).mul(100)
popular_words['women'] = popular_words['women'].div(popular_words['women'].sum()).mul(100)

# Ratio between the two percentages for every word, largest ratio first.
popular_words['times_diff'] = popular_words.apply(times_diff, axis=1)
popular_words = popular_words.sort_values(by='times_diff', ascending=False)

popular_words.head(30)