# Building a Co-occurrence Model Using LIWC

## original file

In [45]:
''' Make LIWC feature extractor into class
'''
import nltk
import re
import pickle
liwcPath = '../data/LIWC2007_English_plus_txt.dic'

def makeLIWCDictionary(liwcPath, picklePath):
    '''
        Build the LIWC lookup structures from a dictionary file and pickle them.

        The file is read as: one line that is skipped, then one
        "<category id><TAB><category name>" line per category until a line
        containing '%', then one "<word><TAB><id><TAB><id>..." line per
        dictionary entry until end of file.  Every '*' in a word is turned
        into the regex '.*' and the word is anchored with '^' and '$'.

        Parameters:
            liwcPath   -- path of the LIWC dictionary file to parse
            picklePath -- path the resulting structure is pickled to

        Returns a dict with three keys:
            'categories' -- {category id: {'name': name, 'words': [...]}}
            'lookup'     -- list of (compiled regex, [category indices])
            'cat_to_num' -- list of category ids; the position of an id in
                            this list is the index stored in 'lookup'
    '''
    import sys

    # Universal-newline mode must be requested explicitly on Python 2, but the
    # 'U' flag is rejected by Python 3.11+, where it is the default behaviour.
    readMode = 'rU' if sys.version_info[0] < 3 else 'r'

    catNames = {}
    lookup = []
    with open(liwcPath, readMode) as LIWC_file:
        LIWC_file.readline()  # skips first '%' line
        line = LIWC_file.readline()
        # readline() returns '' at end of file; testing `line` as well keeps a
        # file without the closing '%' from looping forever.
        while line and '%' not in line:
            keyval = line.split('\t')
            key = keyval[0]
            value = keyval[1].strip()
            catNames[key] = {'name': value,
                             'words': []}
            line = LIWC_file.readline()
        # Materialise the keys so that positions are stable and indexable.
        mapCategoriesToNumbers = list(catNames.keys())
        line = LIWC_file.readline()  # skips second '%' line

        while line:  # iterate through the dictionary entries
            data = line.strip().split('\t')
            reString = '^' + data[0].replace('*', '.*') + '$'
            indeces = [mapCategoriesToNumbers.index(d) for d in data[1:]]
            lookup.append((re.compile(reString), indeces))
            for cat in data[1:]:
                # NOTE(review): this adds the raw word and its regex source as
                # two consecutive flat items (not as one tuple); kept as is
                # because existing consumers see a flat list.
                catNames[cat]['words'].extend((data[0], reString))
            line = LIWC_file.readline()

    toPickle = {'categories': catNames,
                'lookup': lookup,
                'cat_to_num': mapCategoriesToNumbers}
    # Pickle data is a byte stream, so the target is opened in binary mode and
    # closed deterministically.
    with open(picklePath, 'wb') as pickleFile:
        pickle.dump(toPickle, pickleFile)
    return toPickle

class liwcExtractor(object):
    '''
        Count LIWC dictionary matches in tokenized documents.

        Parameters:
            tokenizer     -- callable taking a text string and returning a
                             list of tokens; defaults to nltk_tokenize
            ignore        -- stored on the instance; not used by any method
                             of this class
            dictionary    -- structure shaped like the return value of
                             makeLIWCDictionary (keys 'lookup', 'categories'
                             and 'cat_to_num'); used when liwcPath is None
            newCategories -- stored on the instance; not used by any method
                             of this class
            keepNonDict   -- when True (the default), tokens matching no
                             dictionary pattern are collected in nonDictTokens
            liwcPath      -- path of a LIWC dictionary file; when given, the
                             dictionary is built from it (and pickled to
                             './liwcDictionary.pickle'), overriding
                             `dictionary`

        Raises ValueError when neither liwcPath nor dictionary is supplied.
    '''

    # Summary counters stored after the per-category counts, in this order.
    # 'punc' and 'emoticon' are reserved slots that are not filled in yet.
    extraFeatures = ['wc', 'sixltr', 'dic', 'punc', 'emoticon']

    def __init__(self,
                 tokenizer=None,
                 ignore=None,
                 dictionary=None,
                 newCategories=None,
                 keepNonDict=True,
                 liwcPath=None):
        self.liwcPath = liwcPath
        # A caller-supplied tokenizer has to be stored too; otherwise every
        # later call to self.tokenizer fails with AttributeError.
        self.tokenizer = self.nltk_tokenize if tokenizer is None else tokenizer
        if liwcPath is not None:
            dictionary = makeLIWCDictionary(liwcPath, './liwcDictionary.pickle')
        elif dictionary is None:
            # There is nothing to build the lookup from.
            raise ValueError('liwcExtractor needs either liwcPath or dictionary')
        # The derived attributes are set for both sources, so an extractor
        # built from a ready-made dictionary is usable as well.
        self.dictionary = dictionary
        self.lookup = dictionary['lookup']
        self.categories = dictionary['categories']
        self.mapCategoriesToNumbers = dictionary['cat_to_num']
        self.ignore = ignore
        self.newCategories = newCategories
        self.nonDictTokens = []
        self.keepNonDict = keepNonDict

    def getCategoryIndeces(self):
        '''
            Return the feature names in the order used by extractFromDoc:
            category names in 'cat_to_num' order, then the summary counters.
        '''
        # Walk cat_to_num explicitly: the indices stored in self.lookup are
        # positions in that list, not positions in dict iteration order.
        names = [self.categories[key]['name'] for key in self.mapCategoriesToNumbers]
        return names + list(self.extraFeatures)

    def extract(self, corpus):
        '''
            Return one feature vector (see extractFromDoc) per document of
            the iterable `corpus`.
        '''
        return [self.extractFromDoc(doc) for doc in corpus]

    def extractFromDoc(self, document):
        '''
            Return the list of counts for one document.

            With N = number of categories, the layout is:
                0 .. N-1 -- matches per category
                N        -- wc, total token count
                N + 1    -- sixltr, tokens longer than six characters
                N + 2    -- dic, tokens matched by at least one pattern
                N + 3    -- punc (not computed yet, always 0)
                N + 4    -- emoticon (not computed yet, always 0)
            Tokens matching no pattern are appended to self.nonDictTokens
            when keepNonDict is set.
        '''
        tokens = self.tokenizer(document)
        # Size the vector from the dictionary instead of a fixed 70 slots so
        # that it always lines up with getCategoryIndeces().
        nCategories = len(self.mapCategoriesToNumbers)
        wcIndex = nCategories
        sixltrIndex = nCategories + 1
        dicIndex = nCategories + 2
        features = [0] * (nCategories + len(self.extraFeatures))
        features[wcIndex] = len(tokens)

        for t in tokens:  # iterating through tokens of a message
            if len(t) > 6:  # more than six characters
                features[sixltrIndex] += 1
            inDict = False
            for pattern, categories in self.lookup:
                # Patterns are anchored with '^' and '$', so one match() call
                # answers the same question as findall().
                if pattern.match(t) is not None:
                    inDict = True
                    for c in categories:
                        features[int(c)] += 1
            if inDict:
                features[dicIndex] += 1
            elif self.keepNonDict:
                self.nonDictTokens.append(t)
        return features

    def patternsMatchedFromDoc(self, document):
        '''
            Return, for each entry of self.lookup (in order), the number of
            tokens of `document` that its pattern matches.
        '''
        tokens = self.tokenizer(document)
        patterns = [entry[0] for entry in self.lookup]
        features = [0] * len(patterns)
        for t in tokens:
            for i, pattern in enumerate(patterns):
                if pattern.match(t) is not None:
                    features[i] += 1
        return features

    def nltk_tokenize(self, message):
        '''
            takes in a text string and returns a list of tokenized words using nltk methods
        '''
        tokens = []
        # sentence tokenize, then word tokenize each sentence
        for sent in nltk.sent_tokenize(message):
            tokens += nltk.word_tokenize(sent)
        return tokens

    # TODO: android_data_tokenize(message) -- not implemented yet.
    #   takes in a text string and returns a list of tokenized words using nltk methods
    #   checks for emoticons which should be tokenized together instead of as individual
    #   punctuation tokens
    #
    #   android data also needed to redact names, so [name] needs to be treated separately
    #   as well
    #
    #   first step: remove [name] from initial string


## Enhancing LIWC Extractor for Co-occurrence finding of new potential category candidates
1. Build a matrix of all words in the corpus (columns) by all words in the LIWC dictionary (rows)
2. For all words in the LIWC dictionary that a document contains, uptick the words found in the document
3. setup FLANN with this cooccurrence matrix
4. For each document in the corpus, for each word in the document that is not in LIWC, calculate its co-occurrence distance to the words of each LIWC category; if the distance to most words in a category is small, consider the word a potential candidate for being added to that category.

In [2]:
import psycopg2
import numpy as np

In [3]:
# Open the connection to the local 'lsm' PostgreSQL database.
try:
    conn = psycopg2.connect("dbname='lsm' host='localhost'")
except psycopg2.Error:
    # Catch database errors only (a bare except would also hide typos and
    # KeyboardInterrupt), report, and re-raise: `conn` is undefined at this
    # point, so carrying on would just fail later with a NameError.
    print("I am unable to connect to the database")
    raise

In [4]:
curr = conn.cursor()

In [5]:
q = 'SELECT message FROM message_data'
curr.execute(q)
messages = curr.fetchall()

In [6]:
len(messages)

12906

In [7]:
messages[0]

('hey are you still in gsh?',)

In [8]:
import nltk

#get the set of unique words in corpus

In [9]:
# Pool the tokens of every message: each message text is lower-cased and the
# literal '[name]' marker is removed before word tokenization.
all_tokens = []
for row in messages:
    cleaned_text = row[0].lower().replace('[name]', '')
    all_tokens.extend(nltk.word_tokenize(cleaned_text))

In [10]:
len(all_tokens)

147372

In [11]:
corpus_fDist = nltk.FreqDist(all_tokens)

In [12]:
tokens, frequencies = zip(*corpus_fDist.most_common(30))

In [13]:
# plot most frequent
import plotly.tools as tls
import plotly.plotly as py
from plotly.graph_objs import *

# Bar chart of the 30 most frequent tokens: x holds the tokens, y holds each
# token's count divided by the total number of tokens in the corpus.
data = Data([
    Bar(
        x=tokens,
        y=np.array(frequencies) / float(len(all_tokens))
    )
])
# py.plot is called with auto_open=False and its result is handed to
# tls.embed to show the figure inside the notebook.
tls.embed(py.plot(data, filename='token_frequency', auto_open=False))

## Setting up two matrices
1. For comparing just LIWC words
2. For comparing any word from corpus

In [18]:
unique_words = len(corpus_fDist)
unique_words

9933

In [15]:
all_to_all = np.zeros((unique_words, unique_words))

In [46]:
le = liwcExtractor(liwcPath=liwcPath)

In [15]:
liwcDict = le.dictionary

In [16]:
liwcDict.keys()

['lookup', 'categories']

In [19]:
liwc_to_all = np.zeros((len(liwcDict['lookup']), unique_words))

In [17]:
messages[0]

('hey are you still in gsh?',)

In [18]:
all_words_lookup = {word : index for index, word in enumerate(corpus_fDist.keys())}

In [19]:
# NOTE: each entry of liwcDict['lookup'] is a (compiled regex, category
# indices) pair, so the keys of this dict are compiled pattern objects (not
# word strings) and the values are their positions in the lookup list.
liwc_lookup = {word[0] : index for index, word in enumerate(liwcDict['lookup'])}

In [23]:
def update_all_to_all(message):
    """Add the token co-occurrences of one message to the global all_to_all.

    The message is word-tokenized and every token is mapped to its vocabulary
    index through all_words_lookup.  For every ordered pair of token
    occurrences (a token paired with itself included) the matrix cell
    [row token][column token] is incremented by one.
    """
    word_ids = [all_words_lookup[tok] for tok in nltk.word_tokenize(message)]
    for row in word_ids:
        for col in word_ids:
            all_to_all[row][col] += 1

In [24]:
def update_liwc_to_all(message):
    """Add one message's LIWC-pattern/word co-occurrences to liwc_to_all.

    For every LIWC pattern (one row per entry of le.lookup) the number of
    tokens it matched in the message is added to the column of each token
    occurrence of the message.
    """
    pattern_counts = le.patternsMatchedFromDoc(message)
    word_ids = [all_words_lookup[tok] for tok in nltk.word_tokenize(message)]
    for pattern_row, hits in enumerate(pattern_counts):
        row = liwc_to_all[pattern_row]
        for col in word_ids:
            row[col] += hits

In [25]:
# Feed every message into both co-occurrence matrices, using the same
# cleaning as for the vocabulary (lower-case, '[name]' marker removed).
for row in messages:
    cleaned_text = row[0].lower().replace('[name]', '')
    update_all_to_all(cleaned_text)
    update_liwc_to_all(cleaned_text)

In [26]:
# pickle results
import pickle
# Pickle data is a byte stream: write it through a binary-mode handle, and
# use `with` so the file is flushed and closed before it is read back.
with open('all_to_all.pickle', 'wb') as all_to_all_file:
    pickle.dump(all_to_all, all_to_all_file)
#pickle.dump(liwc_to_all, open('liwc_to_all.pickle', 'w'))

# Load pickle of all to all

In [20]:
import pickle
# Read the pickle through a binary-mode handle that is closed afterwards.
# Only load pickle files you created yourself: unpickling runs arbitrary code.
with open('all_to_all.pickle', 'rb') as all_to_all_file:
    all_to_all = pickle.load(all_to_all_file)

# Scale Data to Zero Mean, Unit Variance

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
scaler = StandardScaler()

In [23]:
all_to_all_scaled = scaler.fit_transform(all_to_all)

# Finding new words for each category
For each word in the lookup table:
    compute its distance to each word in the liwc_to_all matrix
    for each category,
        what is the distance to each word in that category?

In [24]:
from sklearn.metrics.pairwise import pairwise_distances    

In [25]:
distance_matrix = pairwise_distances(all_to_all)

## Compute the mean co-occurrence vector of the words in each category

In [47]:
cToN = le.mapCategoriesToNumbers

In [49]:
cToN[:10]

['150', '133', '132', '131', '130', '137', '136', '135', '134', '252']

In [87]:
# For every LIWC category name, collect the vocabulary indices of the corpus
# words matched by one of the category's patterns.  A word is appended once
# per matching pattern, so it can appear more than once in a list.
categories_to_wordind = dict((info['name'], []) for info in le.categories.values())
for regex, r_indeces in le.lookup:
    # The category names of a pattern do not depend on the word being tested.
    matched_names = [le.categories[cToN[ri]]['name'] for ri in r_indeces]
    for word, index in all_words_lookup.items():
        if not regex.findall(word):
            continue
        for name in matched_names:
            categories_to_wordind[name].append(index)

In [89]:
# Mean scaled co-occurrence vector of the vocabulary words of each category.
category_mean_vectors = {}
for c, word_indeces in categories_to_wordind.items():
    if not word_indeces:
        # No corpus word belongs to this category.  Test for emptiness
        # directly: summing all values and comparing the float to 0 is
        # expensive and would also zero a non-empty category whose values
        # happen to cancel out.  A zero vector is used as a placeholder
        # because the mean of an empty selection is NaN.
        category_mean_vectors[c] = np.zeros(len(all_to_all[0]))
    else:
        category_mean_vectors[c] = np.mean(all_to_all_scaled[word_indeces], axis=0)

In [90]:
# NOTE(review): the names are swapped relative to the contents.  enumerate()
# yields (position, key), so in the first comprehension `c` is the integer
# position and `i` is the category name: category_to_index maps
# position -> category name, and index_to_category maps category name ->
# position.  The cells that iterate index_to_category.items() as
# (name, index) pairs depend on exactly this, so the two definitions and
# those loops have to be changed together.
category_to_index = {c : i for c, i in enumerate(category_mean_vectors.keys())}
index_to_category = {i : c for c, i in category_to_index.items()}

In [91]:
category_mean_matrix = np.array([vector for vector in category_mean_vectors.values()])

In [96]:
category_mean_matrix[0]

array([-0.00190075, -0.06322889, -0.03011465, ...,  0.13541554,
       -0.03786646, -0.02655595])

In [110]:
distance_matrix = pairwise_distances(category_mean_matrix, Y=all_to_all_scaled, metric='cosine')

In [112]:
index_to_words = {i : w for w, i in all_words_lookup.items()}

In [132]:
category_candidates = {}

In [134]:
# Rank the whole vocabulary for every category: words are ordered by
# increasing distance between their vector and the category mean vector.
for c, c_index in index_to_category.items():
    ranked = sorted(enumerate(distance_matrix[c_index]), key=lambda pair: pair[1])
    category_candidates[c] = [index_to_words[word_id] for word_id, dist in ranked]

In [149]:
category_candidates['sad'][:20]

['miss',
 'sad',
 'missed',
 '5.15',
 "16'3",
 '4.95m',
 'had16',
 'lameeee',
 'sobbing',
 'vaulted',
 'height',
 'broke',
 'missing',
 'lost',
 'hurt',
 'sooooo',
 'failing',
 '9:15',
 'teammates',
 'xtr']

In [143]:
keysToNames = {key : val['name'] for key,val in le.categories.items()}

In [144]:
namesToKeys = {val : key for key, val in keysToNames.items()}

In [155]:
# Pickle data is a byte stream: use a binary-mode handle that is closed
# (and therefore flushed) before the file is read back.
with open('cosine_sim_category_candidate.pickle', 'wb') as candidates_file:
    pickle.dump(category_candidates, candidates_file)

In [156]:
# Read the pickle through a binary-mode handle that is closed afterwards.
with open('cosine_sim_category_candidate.pickle', 'rb') as candidates_file:
    cc = pickle.load(candidates_file)

In [159]:
cc['ppron']

['i',
 'you',
 'to',
 '.',
 'and',
 'a',
 'the',
 'my',
 'me',
 'it',
 'but',
 ',',
 'so',
 '?',
 'that',
 'have',
 "n't",
 'for',
 'do',
 '!',
 "'m",
 'we',
 'in',
 'just',
 'of',
 'if',
 'was',
 'can',
 'is',
 'be',
 'on',
 'your',
 'with',
 'not',
 'are',
 'at',
 'like',
 "'s",
 'get',
 'im',
 'up',
 ':',
 'know',
 ')',
 "'ll",
 'when',
 'he',
 'think',
 'haha',
 'now',
 '...',
 'this',
 'about',
 'did',
 'out',
 'want',
 'going',
 'lol',
 'or',
 'go',
 'they',
 'what',
 'all',
 '(',
 'one',
 'then',
 "'re",
 'no',
 'will',
 'see',
 'got',
 'well',
 'how',
 'time',
 'too',
 'she',
 'there',
 'na',
 'good',
 'need',
 'her',
 'its',
 'them',
 'u',
 'had',
 'really',
 'from',
 'sorry',
 'still',
 'because',
 'back',
 'should',
 'were',
 'hey',
 'would',
 'yeah',
 'some',
 'am',
 'work',
 'could',
 'oh',
 'right',
 'an',
 'as',
 'after',
 'gon',
 'let',
 'actually',
 'tonight',
 'him',
 'by',
 'been',
 'text',
 'love',
 'doing',
 'over',
 'before',
 'also',
 'come',
 'much',
 'okay',
 '

In [160]:
len(cc.keys())

66

In [161]:
len(le.lookup)

4512

In [164]:
# Same ranking as before, but each candidate keeps its distance:
# category name -> [(word, distance), ...] ordered by increasing distance.
for c, c_index in index_to_category.items():
    ranked = sorted(enumerate(distance_matrix[c_index]), key=lambda pair: pair[1])
    category_candidates[c] = [(index_to_words[word_id], dist) for word_id, dist in ranked]

In [165]:
category_candidates['sad'][:5]

[('miss', 0.62685165970686296),
 ('sad', 0.73971301268423928),
 ('missed', 0.75487823688334332),
 ('5.15', 0.77258157950474637),
 ("16'3", 0.77258157950474637)]

In [167]:
category_candidates['sad'][-5:]

[('?', 1.2390867432696371),
 ('i', 1.2398045045550514),
 ('the', 1.2413339681241478),
 ('.', 1.245509421755052),
 ('to', 1.2536014689647432)]

In [170]:
all_words_lookup.items()[:5]

[('raining', 0), ('yellow', 1), ('hhha', 2), ('four', 3), ('hanging', 4)]

In [184]:
# Step 1: per category, turn each distance into a score (largest distance of
# the category minus the word's distance) and, per word, add up its distances
# over all categories.
# NOTE(review): category_frequency accumulates the raw distances, not the
# scores computed on the next line -- confirm that this is the intended
# denominator for the weighting below.
category_frequency = dict.fromkeys(all_words_lookup, 0)
cc_normalized = {}
for c, vec in category_candidates.items():
    max_dist = max(dist for word, dist in vec)
    scored = []
    for word, dist in vec:
        category_frequency[word] += dist
        scored.append((word, max_dist - dist))
    cc_normalized[c] = scored

#cc_icf = {c : [(v[1] / float(category_frequency[v[0]])) for v in vec] for c, vec in cc_normalized.items()}
# Step 2: divide every score by the word's accumulated total and order each
# category's words by the resulting value, highest first.
cc_icf = {}
for c, vec in cc_normalized.items():
    weighted = [(word, score / float(category_frequency[word])) for word, score in vec]
    cc_icf[c] = sorted(weighted, key=lambda pair: pair[1], reverse=True)

In [185]:
cc_icf['sad'][:25]

[('miss', 0.0097136387180436577),
 ('sad', 0.0078361456873306898),
 ('missed', 0.0076954124130836996),
 ('5.15', 0.0071143181979391335),
 ("16'3", 0.0071143181979391335),
 ('4.95m', 0.0071143181979391335),
 ('had16', 0.0071143181979391335),
 ('lameeee', 0.0069721087441451321),
 ('vaulted', 0.0068389583924758609),
 ('sobbing', 0.0068353679133915275),
 ('broke', 0.0067480572914038247),
 ('height', 0.0067353060893045611),
 ('missing', 0.0066383554791741026),
 ('hurt', 0.006591805293096358),
 ('lost', 0.0065817352087225705),
 ('sooooo', 0.0064936568187729908),
 ('failing', 0.0064548546567908026),
 ('teammates', 0.0063264059623899475),
 ('failed', 0.0061993333069009122),
 ('xtr', 0.0061878070019371094),
 ('lameo', 0.0061825449671749537),
 ('hurts', 0.00617812602239783),
 ('alone', 0.0061696979251596772),
 ('fail', 0.0060899062918634075),
 ('phone..soo', 0.0060890893063182596)]

In [186]:
cc_icf['body'][:25]

[('brutal', 0.0098298917655459794),
 ('shoulers', 0.0098298917655459794),
 ('hamstring', 0.0098298917655459794),
 ('secretly', 0.0098298917655459794),
 ('thighs', 0.0098298917655459794),
 ('bis', 0.0098298917655459794),
 ('tris', 0.0098298917655459794),
 ('shit', 0.0096119199442476332),
 ('abs', 0.0093604799553028608),
 ('head', 0.0091133979135031642),
 ('ankle', 0.0090955673176145498),
 ('ass', 0.009079217702347684),
 ('9:15', 0.0089457194094229291),
 ('hand', 0.0089296328222851357),
 ('lifeeee', 0.008924940601930418),
 ('ke', 0.008924940601930418),
 ('supported', 0.008924940601930418),
 ('bone', 0.0088313397094219124),
 ('asleep', 0.0088182605420020904),
 ('supposedly', 0.0087146567527200819),
 ('skin', 0.0087011809294572102),
 ('gustopher', 0.0085240043491972336),
 ('yaaaaaaaaaay', 0.0084837668420021956),
 ('bekah', 0.0084837668420021956),
 ('face', 0.0084830593114838971)]

In [None]:
namesToKeys

In [187]:
cc_icf.keys()

['body',
 'they',
 'pronoun',
 'funct',
 'feel',
 'money',
 'insight',
 'humans',
 'tentat',
 'sad',
 'past',
 'negate',
 'see',
 'affect',
 'anger',
 'home',
 'conj',
 'sexual',
 'negemo',
 'ppron',
 'inhib',
 'family',
 'adverb',
 'space',
 'cornell',
 'ipron',
 'anx',
 'preps',
 'percept',
 'swear',
 'quant',
 'location',
 'certain',
 'relativ',
 'health',
 'you',
 'incl',
 'friend',
 'relig',
 'cogmech',
 'we',
 'time',
 'assent',
 'discrep',
 'leisure',
 'number',
 'verb',
 'hear',
 'bio',
 'article',
 'excl',
 'present',
 'death',
 'i',
 'cause',
 'work',
 'ingest',
 'motion',
 'filler',
 'nonfl',
 'future',
 'achieve',
 'posemo',
 'auxverb',
 'shehe',
 'social']

## Money

In [188]:
cc_icf['money']

[('money', 0.00773873347046641),
 ('check', 0.0076277929692872915),
 ('1300', 0.0074224355076645153),
 ('buy', 0.0073780512844989384),
 ('free', 0.0072909074554109443),
 ('op', 0.0071958620667783258),
 ('fee', 0.0071942726738699102),
 ('brokers', 0.0071942726738699102),
 ('pay', 0.0071377294420639894),
 ('dues', 0.0071360418410502148),
 ('sigh', 0.0070970148894543292),
 ('bought', 0.0070854886110416236),
 ('coloring', 0.0068913277455929399),
 ('dollar', 0.0068493247626164461),
 ('financial', 0.0067911884997076868),
 ('accounting', 0.0067911884997076868),
 ('store', 0.0067874707768989682),
 ('bombed', 0.0067834243177240291),
 ('worth', 0.0067517059666141969),
 ('negotiate', 0.0066305447895964398),
 ('stressed', 0.0066157378097102234),
 ('successful', 0.0065693353719510151),
 ('bucks', 0.0065472370012089045),
 ('paid', 0.0065287516858771972),
 ('bank', 0.0065023307320771738),
 ('expenses', 0.0064815159965135533),
 ('value', 0.0064628664494931999),
 ('pottery', 0.0064628664494931999),
 ('

In [191]:
le.categories[namesToKeys['money']]['words']

['account*',
 '^account.*$',
 'atm',
 '^atm$',
 'atms',
 '^atms$',
 'auction*',
 '^auction.*$',
 'audit',
 '^audit$',
 'audited',
 '^audited$',
 'auditing',
 '^auditing$',
 'auditor',
 '^auditor$',
 'auditors',
 '^auditors$',
 'audits',
 '^audits$',
 'bank*',
 '^bank.*$',
 'bargain*',
 '^bargain.*$',
 'beggar*',
 '^beggar.*$',
 'begging',
 '^begging$',
 'bet',
 '^bet$',
 'bets',
 '^bets$',
 'betting',
 '^betting$',
 'bill',
 '^bill$',
 'billed',
 '^billed$',
 'billing*',
 '^billing.*$',
 'bills',
 '^bills$',
 'bonus*',
 '^bonus.*$',
 'borrow*',
 '^borrow.*$',
 'bought',
 '^bought$',
 'broker*',
 '^broker.*$',
 'buck',
 '^buck$',
 'bucks',
 '^bucks$',
 'budget*',
 '^budget.*$',
 'business*',
 '^business.*$',
 'buy*',
 '^buy.*$',
 'cash*',
 '^cash.*$',
 'casino*',
 '^casino.*$',
 'cent',
 '^cent$',
 'cents',
 '^cents$',
 'charit*',
 '^charit.*$',
 'cheap*',
 '^cheap.*$',
 'check',
 '^check$',
 'checking',
 '^checking$',
 'checks',
 '^checks$',
 'chequ*',
 '^chequ.*$',
 'coin',
 '^coin$',

#Body

In [192]:
cc_icf['body']

[('brutal', 0.0098298917655459794),
 ('shoulers', 0.0098298917655459794),
 ('hamstring', 0.0098298917655459794),
 ('secretly', 0.0098298917655459794),
 ('thighs', 0.0098298917655459794),
 ('bis', 0.0098298917655459794),
 ('tris', 0.0098298917655459794),
 ('shit', 0.0096119199442476332),
 ('abs', 0.0093604799553028608),
 ('head', 0.0091133979135031642),
 ('ankle', 0.0090955673176145498),
 ('ass', 0.009079217702347684),
 ('9:15', 0.0089457194094229291),
 ('hand', 0.0089296328222851357),
 ('lifeeee', 0.008924940601930418),
 ('ke', 0.008924940601930418),
 ('supported', 0.008924940601930418),
 ('bone', 0.0088313397094219124),
 ('asleep', 0.0088182605420020904),
 ('supposedly', 0.0087146567527200819),
 ('skin', 0.0087011809294572102),
 ('gustopher', 0.0085240043491972336),
 ('yaaaaaaaaaay', 0.0084837668420021956),
 ('bekah', 0.0084837668420021956),
 ('face', 0.0084830593114838971),
 ('scholl', 0.0084667824529652599),
 ('stickies', 0.0084667824529652599),
 ('sore', 0.0084505806851354615),
 ('

In [193]:
le.categories[namesToKeys['body']]['words']

['abdomen*',
 '^abdomen.*$',
 'abs',
 '^abs$',
 'anal',
 '^anal$',
 'ankle*',
 '^ankle.*$',
 'anus*',
 '^anus.*$',
 'appendix',
 '^appendix$',
 'arch',
 '^arch$',
 'arm',
 '^arm$',
 'armpit*',
 '^armpit.*$',
 'arms*',
 '^arms.*$',
 'arous*',
 '^arous.*$',
 'arse',
 '^arse$',
 'arses',
 '^arses$',
 'arter*',
 '^arter.*$',
 'asleep',
 '^asleep$',
 'ass',
 '^ass$',
 'asses',
 '^asses$',
 'bald',
 '^bald$',
 'bellies',
 '^bellies$',
 'belly',
 '^belly$',
 'bicep*',
 '^bicep.*$',
 'bladder*',
 '^bladder.*$',
 'blood',
 '^blood$',
 'bloody',
 '^bloody$',
 'bodi*',
 '^bodi.*$',
 'body*',
 '^body.*$',
 'bone',
 '^bone$',
 'bones',
 '^bones$',
 'bony',
 '^bony$',
 'boob*',
 '^boob.*$',
 'bowel*',
 '^bowel.*$',
 'brain*',
 '^brain.*$',
 'breast*',
 '^breast.*$',
 'breath*',
 '^breath.*$',
 'butt',
 '^butt$',
 "butt's",
 "^butt's$",
 'butts',
 '^butts$',
 'cheek*',
 '^cheek.*$',
 'chest*',
 '^chest.*$',
 'clothes',
 '^clothes$',
 'cock',
 '^cock$',
 'cocks*',
 '^cocks.*$',
 'colon',
 '^colon$',
 

#Anger

In [194]:
cc_icf['anger']

[('shit', 0.0099926939652960744),
 ('fuck', 0.0095039543898683965),
 ('9:15', 0.0092527943065463976),
 ('fuckin', 0.0091859863715130427),
 ('promotions', 0.0091351772423263634),
 ('sucks', 0.0090661681910400391),
 ('fucking', 0.0090132894579978504),
 ('raped', 0.0089610494715875576),
 ('hate', 0.0088889758690060463),
 ('channel', 0.0088751300489942914),
 ('gustopher', 0.0087723802513769624),
 ('yaaaaaaaaaay', 0.0087597607445972052),
 ('bekah', 0.0087597607445972052),
 ('resenting', 0.0087153774658017477),
 ('message..', 0.008708768918299593),
 ('moing', 0.0086979736975382471),
 ('notifies', 0.008640403483632899),
 ('damnnn', 0.0086401965991414094),
 ('breakdown', 0.008594725337060613),
 ('evidence', 0.008594725337060613),
 ("'jokingly", 0.008594725337060613),
 ('accusatory', 0.008594725337060613),
 ("'well", 0.008594725337060613),
 ('sarcastically', 0.0085816050500149496),
 ('bitchy', 0.0085816050500149496),
 ('bug', 0.0085625035211266687),
 ('doodle', 0.0085625035211266687),
 ('ahhsjs

In [195]:
le.categories[namesToKeys['anger']]['words']

['abuse*',
 '^abuse.*$',
 'abusi*',
 '^abusi.*$',
 'aggravat*',
 '^aggravat.*$',
 'aggress*',
 '^aggress.*$',
 'agitat*',
 '^agitat.*$',
 'anger*',
 '^anger.*$',
 'angr*',
 '^angr.*$',
 'annoy*',
 '^annoy.*$',
 'antagoni*',
 '^antagoni.*$',
 'argh*',
 '^argh.*$',
 'argu*',
 '^argu.*$',
 'arrogan*',
 '^arrogan.*$',
 'assault*',
 '^assault.*$',
 'asshole*',
 '^asshole.*$',
 'attack*',
 '^attack.*$',
 'bastard*',
 '^bastard.*$',
 'battl*',
 '^battl.*$',
 'beaten',
 '^beaten$',
 'bitch*',
 '^bitch.*$',
 'bitter*',
 '^bitter.*$',
 'blam*',
 '^blam.*$',
 'bother*',
 '^bother.*$',
 'brutal*',
 '^brutal.*$',
 'cheat*',
 '^cheat.*$',
 'confront*',
 '^confront.*$',
 'contempt*',
 '^contempt.*$',
 'contradic*',
 '^contradic.*$',
 'crap',
 '^crap$',
 'crappy',
 '^crappy$',
 'critical',
 '^critical$',
 'critici*',
 '^critici.*$',
 'crude*',
 '^crude.*$',
 'cruel*',
 '^cruel.*$',
 'cunt*',
 '^cunt.*$',
 'cut',
 '^cut$',
 'cynic',
 '^cynic$',
 'damn*',
 '^damn.*$',
 'danger*',
 '^danger.*$',
 'defenc

# Leisure

In [196]:
cc_icf['leisure']

[('episodez', 0.0096781177720724806),
 ('isaac', 0.0096781177720724806),
 ('curre.tly', 0.0096781177720724806),
 ('videos', 0.0096781177720724806),
 ("'binding", 0.0096781177720724806),
 ('watcbk.g', 0.0096781177720724806),
 ('watxhed', 0.0096781177720724806),
 ('orga.ized', 0.0096781177720724806),
 ('simpsons', 0.0095884669799890798),
 ('baha', 0.009516665348953509),
 ('pc', 0.0093103535980308579),
 ('played', 0.0090233978223818977),
 ('games', 0.0090223666425162628),
 ('football', 0.0089515888150627134),
 ('game', 0.0088006290783450326),
 ('downloaded', 0.0084641826255654352),
 ('video', 0.008310322558899479),
 ('9:15', 0.0080522266002257831),
 ('fro', 0.0080285574005011431),
 ('b.', 0.0080285574005011431),
 ('bcuz', 0.0080285574005011431),
 ('.500', 0.0080285574005011431),
 ('nething', 0.0080285574005011431),
 ('party', 0.0079145151023525765),
 ('probaby', 0.0077926680210398738),
 ('playing', 0.0077748115061330976),
 ('iggles', 0.0077179121583806692),
 ('9-1.', 0.007712937239411884)

In [197]:
le.categories[namesToKeys['leisure']]['words']

['actor*',
 '^actor.*$',
 'actress*',
 '^actress.*$',
 'aerobic*',
 '^aerobic.*$',
 'amus*',
 '^amus.*$',
 'apartment*',
 '^apartment.*$',
 'art',
 '^art$',
 'artist*',
 '^artist.*$',
 'arts',
 '^arts$',
 'athletic*',
 '^athletic.*$',
 'ball',
 '^ball$',
 'ballet*',
 '^ballet.*$',
 'band',
 '^band$',
 'bands',
 '^bands$',
 'bar',
 '^bar$',
 'bars',
 '^bars$',
 'baseball*',
 '^baseball.*$',
 'basketball*',
 '^basketball.*$',
 'bath*',
 '^bath.*$',
 'beach*',
 '^beach.*$',
 'beer*',
 '^beer.*$',
 'bicyc*',
 '^bicyc.*$',
 'bike*',
 '^bike.*$',
 'birdie*',
 '^birdie.*$',
 'blackjack',
 '^blackjack$',
 'blockbuster*',
 '^blockbuster.*$',
 'blog*',
 '^blog.*$',
 'book*',
 '^book.*$',
 'camping',
 '^camping$',
 'cards',
 '^cards$',
 'casino*',
 '^casino.*$',
 'casual',
 '^casual$',
 'cd*',
 '^cd.*$',
 'celebrat*',
 '^celebrat.*$',
 'celebrit*',
 '^celebrit.*$',
 'channel*',
 '^channel.*$',
 'chat*',
 '^chat.*$',
 'checkers',
 '^checkers$',
 'chess',
 '^chess$',
 'chillin*',
 '^chillin.*$',
 '

#Achieve

In [198]:
cc_icf['achieve']

[('work', 0.0082140573568844997),
 ('better', 0.0075685915363349759),
 ('win', 0.0075045989437557449),
 ('first', 0.007111873902821083),
 ('would.not', 0.0070129030955199305),
 ('try', 0.0068203904094398787),
 ('working', 0.0067977700057570388),
 ('best', 0.006730268569382439),
 ('plan', 0.0066025576447518849),
 ('chilled', 0.0065515331175453268),
 ('perfect', 0.006445885625460235),
 ('sickness', 0.0064388877325198728),
 ("'mohawk", 0.0064388877325198728),
 ('upswing', 0.0064388877325198728),
 ('pointed', 0.0064388877325198728),
 ('wise', 0.0064388877325198728),
 ('winners', 0.0064297190772062114),
 ('established', 0.006429266582601431),
 ('aspect', 0.006429266582601431),
 ('severing', 0.006429266582601431),
 ('reputation', 0.006429266582601431),
 ('out..we', 0.0064097813414281469),
 ('winning', 0.0063472523172134034),
 ('wolfram', 0.0063208563346644937),
 ('writeup', 0.0063208563346644937),
 ('9:15', 0.0063129967452419285),
 ('nomination', 0.0063117771283158156),
 ('quitting', 0.00630

In [199]:
le.categories[namesToKeys['achieve']]['words']

['abilit*',
 '^abilit.*$',
 'able*',
 '^able.*$',
 'accomplish*',
 '^accomplish.*$',
 'ace',
 '^ace$',
 'achiev*',
 '^achiev.*$',
 'acquir*',
 '^acquir.*$',
 'acquisition*',
 '^acquisition.*$',
 'adequa*',
 '^adequa.*$',
 'advanc*',
 '^advanc.*$',
 'advantag*',
 '^advantag.*$',
 'ahead',
 '^ahead$',
 'ambiti*',
 '^ambiti.*$',
 'approv*',
 '^approv.*$',
 'attain*',
 '^attain.*$',
 'attempt*',
 '^attempt.*$',
 'authorit*',
 '^authorit.*$',
 'award*',
 '^award.*$',
 'beat',
 '^beat$',
 'beaten',
 '^beaten$',
 'best',
 '^best$',
 'better',
 '^better$',
 'bonus*',
 '^bonus.*$',
 'burnout*',
 '^burnout.*$',
 'capab*',
 '^capab.*$',
 'celebrat*',
 '^celebrat.*$',
 'challeng*',
 '^challeng.*$',
 'champ*',
 '^champ.*$',
 'climb*',
 '^climb.*$',
 'closure',
 '^closure$',
 'compet*',
 '^compet.*$',
 'conclud*',
 '^conclud.*$',
 'conclus*',
 '^conclus.*$',
 'confidence',
 '^confidence$',
 'confident',
 '^confident$',
 'confidently',
 '^confidently$',
 'conquer*',
 '^conquer.*$',
 'conscientious*',

#Ingest

In [200]:
cc_icf['ingest']

[('dinner', 0.0087060195780057168),
 ('eat', 0.0083571150904732504),
 ('eating', 0.0078103618430842431),
 ('food', 0.0077861756716665247),
 ('ate', 0.0077761468056158764),
 ('sprinkle', 0.0071111919342471807),
 ('cools', 0.0071111919342471807),
 ('littler', 0.0071111919342471807),
 ('wine', 0.0071107352543024593),
 ('drinking', 0.0069926981915644591),
 ('cookies', 0.0069586302665791064),
 ('tastes', 0.0069309147825315246),
 ('fry', 0.0068723572488502407),
 ('lunch', 0.0068661622464715232),
 ('sugar', 0.0067986884216860452),
 ('kfc', 0.0067839419951300581),
 ('breaded', 0.0067839419951300581),
 ('seasoning', 0.0067839419951300581),
 ('fried', 0.0067839419951300581),
 ('frying', 0.0067839419951300581),
 ('taste', 0.0067839419951300581),
 ('pizza', 0.0067717184049153611),
 ('stomach', 0.0067366045493829317),
 ('drunk', 0.0067330064256810197),
 ('henny', 0.0067163708730088728),
 ('9-1.', 0.0066664714755275281),
 ('hungry', 0.0066286690479662874),
 ('cooks', 0.0066122520997899861),
 ('lunha

In [201]:
le.categories[namesToKeys['ingest']]['words']

['alcohol*',
 '^alcohol.*$',
 'anorexi*',
 '^anorexi.*$',
 'appeti*',
 '^appeti.*$',
 'ate',
 '^ate$',
 'bake*',
 '^bake.*$',
 'baking',
 '^baking$',
 'bar',
 '^bar$',
 'bars',
 '^bars$',
 'beer*',
 '^beer.*$',
 'binge*',
 '^binge.*$',
 'binging',
 '^binging$',
 'boil*',
 '^boil.*$',
 'booz*',
 '^booz.*$',
 'bread',
 '^bread$',
 'breakfast*',
 '^breakfast.*$',
 'brunch*',
 '^brunch.*$',
 'bulimi*',
 '^bulimi.*$',
 'caf*',
 '^caf.*$',
 'candie*',
 '^candie.*$',
 'candy',
 '^candy$',
 'chew*',
 '^chew.*$',
 'chow*',
 '^chow.*$',
 'cigar*',
 '^cigar.*$',
 'cocktail*',
 '^cocktail.*$',
 'coffee*',
 '^coffee.*$',
 'coke*',
 '^coke.*$',
 'cook*',
 '^cook.*$',
 'dessert*',
 '^dessert.*$',
 'diet*',
 '^diet.*$',
 'digest*',
 '^digest.*$',
 'dine',
 '^dine$',
 'dined',
 '^dined$',
 'diner',
 '^diner$',
 'diners',
 '^diners$',
 'dines',
 '^dines$',
 'dining',
 '^dining$',
 'dinner*',
 '^dinner.*$',
 'dish',
 '^dish$',
 'dishes',
 '^dishes$',
 'drank',
 '^drank$',
 'drink*',
 '^drink.*$',
 'drunk

In [202]:
# Pickle data is a byte stream: use a binary-mode handle that is closed
# (and therefore flushed) once the dump is done.
with open('cc_icf.pickle', 'wb') as cc_icf_file:
    pickle.dump(cc_icf, cc_icf_file)

In [203]:
len(all_words_lookup)

9933

In [204]:
len(all_tokens)

147372

In [205]:
len(unique_words)

NameError: name 'unique_words' is not defined