# Description

### Import the libraries we'll use.
`%matplotlib inline` lets us see charts and plots right here in the notebook!

In [None]:
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize 
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

%matplotlib inline

### Read our data.

In [None]:
# Load the profile data into a DataFrame; the path is relative to the
# directory the notebook is run from.
profiles = pd.read_csv('data/profiles.csv')

In [None]:
# Draw a random subset of at most 20,000 profiles.
# - `random_state` pins the draw so a fresh Restart & Run All yields the same rows.
# - `min(...)` avoids the ValueError pandas raises when asked for more rows
#   than the frame contains (sampling is without replacement).
profiles = profiles.sample(n=min(20000, len(profiles)), random_state=42)
# Renumber rows 0..n-1 so later label-based lookups (e.g. row 5) are valid.
profiles = profiles.reset_index(drop=True)
profiles.columns

### A little housekeeping...
Expand for more.

- The OKC data has 10 different columns with profile text, one for each long-answer question in users' profiles.
- We want to look at all of the profile text, so this cell merges it all together in a new column called `text`.

### The code

In [None]:
# Names of the ten long-answer columns: 'essay0' through 'essay9'.
essay_cols = ['essay' + str(i) for i in range(10)]

def concat(row, cols):
    """Join the values of ``cols`` in ``row`` into one newline-separated string.

    Missing values (NaN / None) are skipped. Converting them with ``str()``
    would insert the literal word "nan" into the text, which would then be
    counted as if the user had typed it.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row[c]`` for each name in ``cols``.
    cols : iterable of str
        Keys / column names to merge, in output order.

    Returns
    -------
    str
        The non-missing values, each converted with ``str()``, joined by
        ``'\\n'``. Empty string when every value is missing.
    """
    parts = [str(row[c]) for c in cols if not pd.isna(row[c])]
    return '\n'.join(parts)

# Merge each row's essay answers into a single `text` column.
profiles['text'] = profiles.apply(lambda row: concat(row, essay_cols), axis=1)

# Keep the non-essay fields plus the merged text; the individual essay
# columns are not carried forward.
profiles = profiles.loc[:, [
    'age', 'body_type', 'diet', 'drinks', 'drugs', 'education',
    'ethnicity', 'height', 'income', 'job', 'last_online',
    'location', 'offspring', 'orientation', 'pets', 'religion',
    'sex', 'sign', 'smokes', 'speaks', 'status', 'text',
]]

profiles.head(3)

#### Let's peek at an example of the text so we know what we're working with.

In [None]:
# Show the merged text of the row labelled 5.
profiles.loc[5, 'text']

### Tokenizing text

### We want to split the text into words.
Expand for details

- We can do this by applying the `split()` function to text in every profile. 
- Notice, however, that this is a little messy.
    - `split()` is just cutting up the text based on the spaces, leaving the punctuation and some HTML things mixed in with our words.

### A first try

In [None]:
# Naive tokenization: cut each profile's text on runs of whitespace.
tmp = profiles['text'].map(str.split)
tmp.head()

### Getting words from text
Expand for details

Here we define a function to clean up the text a bit more. It does a few things:
- Removes HTML code from the text using BeautifulSoup. (Remember, we want just the words people actually typed.) 
- Converts all of the text to lowercase, so that `Hello`, `hello`, `HeLlO`, and `HELLO` all look the same to the computer.
- Uses the Natural Language Tool Kit (`nltk`) to tokenize the remaining text. 
    - "Tokenize" is jargon for splitting text into "tokens." Tokens are usually words, but they could be sentences, paragraphs, letters, or whatever we needed. 
    - The nltk tokenizers are much smarter than the simple `string.split()` function we used before. This one (which we imported in the beginning) selects the words, but ignores the whitespace and punctuation.

### A second try

In [None]:
def clean(text):
    """Turn one profile's raw text into a list of lowercase word tokens.

    Steps:

    1. Drop lines that consist solely of ``nan`` (the placeholder produced
       when a missing value is converted with ``str()``). Working on whole
       lines leaves real words that merely start with "nan" intact and also
       catches a placeholder on the very first line.
    2. Strip HTML markup with BeautifulSoup (``lxml`` parser).
    3. Lowercase the text and tokenize it on runs of word characters.
    4. Discard the URL fragments ``http``, ``https`` and ``www`` as whole
       tokens, so words that only contain those letters are not damaged.

    Parameters
    ----------
    text : str
        Raw profile text; may contain HTML and newlines.

    Returns
    -------
    list of str
        Lowercase tokens in their original order.
    """
    url_tokens = {'http', 'https', 'www'}

    kept_lines = [line for line in text.split('\n') if line != 'nan']
    plain = BeautifulSoup('\n'.join(kept_lines), 'lxml').get_text()

    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    tokens = regexp_tokenize(plain.lower(), r'\w+')
    return [tok for tok in tokens if tok not in url_tokens]

# Tokenize every profile and store the word lists next to the text.
profiles['tokens'] = profiles['text'].map(clean)
profiles['tokens'].head()

In [None]:
# Partition the profiles by the value recorded in the `sex` column.
men = profiles.loc[profiles['sex'].eq('m')]
women = profiles.loc[profiles['sex'].eq('f')]

In [None]:
# Quick look at the first few token lists in the men's subset.
men['tokens'].head()

In [None]:
# Pronouns we want to count, so they must not be filtered out as stopwords.
keep_words = [
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'they', 'them', 'their', 'theirs', 'themselves',
]

# Start from nltk's English stopword list and take the pronouns back out.
# Like discard(), difference_update() silently ignores entries that are absent.
sw = set(stopwords.words('english'))
sw.difference_update(keep_words)

print(sw)

In [None]:
def flatten(series):
    """Return a single flat list holding every item of every element of ``series``.

    ``series`` is any iterable whose elements are themselves iterable
    (here: a column of token lists). Order is preserved.
    """
    return [item for nested in series for item in nested]

# Tally every non-stopword token written in the men's profiles.
mens_words = Counter(
    word for word in flatten(men['tokens']) if word not in sw
)
mens_words.most_common(10)

In [None]:
# Tally every non-stopword token written in the women's profiles.
womens_words = Counter(
    word for word in flatten(women['tokens']) if word not in sw
)
womens_words.most_common(10)

In [None]:
# One row per word, one column per group. A word that appears in only one
# of the two counters is NaN in the other column.
popular_words = pd.DataFrame({'women': womens_words, 'men': mens_words})

# Rescale each column from raw counts to a percentage of that group's total.
popular_words = (popular_words / popular_words.sum()) * 100

popular_words = popular_words.sort_values(by='men', ascending=False)
popular_words.head().round(2)

In [None]:
# Per-word maximum of the two group percentages.
# The maximum is taken over the 'men' and 'women' columns only, so re-running
# this cell after later cells have added other numeric columns (such as
# 'times_diff') still produces the right value.
popular_words['max'] = popular_words[['men', 'women']].max(axis=1)
popular_words = popular_words.sort_values(by='max', ascending=False)
popular_words.head(10).round(2)

In [None]:
# Histogram (100 bins) of the 'max' column across all words.
popular_words['max'].hist(bins=100)

In [None]:
# Keep only the first 1000 rows of the current ordering, report the new
# shape, and redraw the histogram for the trimmed frame.
popular_words = popular_words.iloc[:1000]
print(popular_words.shape)
popular_words['max'].hist(bins=100)

In [None]:
def times_diff(row):
    """Signed ratio between the men's and women's frequency of one word.

    Positive results mean men use the word more (``men / women``); negative
    results mean women use it at least as much (``-(women / men)``). A tie
    therefore yields ``-1.0``.

    A missing (NaN) frequency is treated as 0, i.e. the group never used the
    word. Without this, a word used by only one group would get a NaN ratio
    and silently drop out of both rankings.

    Parameters
    ----------
    row : object with ``men`` and ``women`` attributes
        Typically one row of the word-frequency DataFrame.

    Returns
    -------
    float
        * ``men / women`` when men > women (``inf`` if women is 0);
        * ``-(women / men)`` when women >= men > 0;
        * ``-inf`` when only women use the word;
        * ``nan`` when both frequencies are 0 / missing (ratio undefined).
    """
    men = 0.0 if pd.isna(row.men) else row.men
    women = 0.0 if pd.isna(row.women) else row.women

    if men > women:
        # Guard the division: a zero denominator means "only men use it".
        return men / women if women else float('inf')
    if men:
        return -1 * (women / men)
    # men == 0 here: either only women use the word, or nobody does.
    return float('-inf') if women else float('nan')
    
# Attach the signed usage ratio to every word, then order by overall popularity.
popular_words = (
    popular_words
    .assign(times_diff=popular_words.apply(times_diff, axis=1))
    .sort_values(by='max', ascending=False)
)

print('Most popular words:')
popular_words.head(10).round(3)

In [None]:
# Largest positive ratios first: the words with the strongest skew towards men.
popular_words = popular_words.sort_values('times_diff', ascending=False)

print('Words men use more than women:')
popular_words.head(15).round(3)

In [None]:
# Most negative ratios first: the words with the strongest skew towards women.
# (Ascending order is the sort_values default.)
popular_words = popular_words.sort_values('times_diff')

print('Words women use more than men:')
popular_words.head(15).round(3)