# Import and Instantiate LIWC

In [79]:
# Load the project-local LIWC extractor module and build one extractor
# instance from the dictionary path the module itself exposes (liwcEx.liwcPath).
import liwcExtractor as liwcEx
le = liwcEx.liwcExtractor(liwcPath=liwcEx.liwcPath)
# Keep a handle on the extractor's `dictionary` attribute for inspection.
liwcDict = le.dictionary

In [80]:
import numpy as np
from nltk import FreqDist
from nltk import word_tokenize as wt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances    

In [83]:
class liwcPlus(object):
    """Potential class for the LIWC++.

    Ranks every word of a corpus against every LIWC category by comparing
    the word's (standard-scaled) co-occurrence row with the mean
    co-occurrence row of the corpus words that LIWC already places in
    that category.

    Attributes set by ``__init__``:
        corpus: the documents passed in (iterated twice, so it must be
            re-iterable).
        tokens: every normalized token of the corpus, in order.
        freqdist: ``FreqDist`` over ``tokens``.
        corpusLookup / corpusLookupRev: word -> row index and back.
        corpus2corpus: square co-occurrence count matrix over the vocabulary.
        scaler / scaled_c2c: fitted ``StandardScaler`` and its output.
        le / cToN: the LIWC extractor and its ``mapCategoriesToNumbers``.
        categories_to_wordind: category name -> list of vocabulary indices
            matched by that category's LIWC patterns.
        category_mean_vectors: category name -> mean scaled row.
        category_to_index / index_to_category: category name <-> row of
            ``category_mean_matrix`` / ``distance_matrix``.
        distance_matrix: cosine distance, categories x vocabulary.
        categories: category name -> ``[(word, distance), ...]`` sorted by
            ascending distance.
        cc_icf: ``None`` until ``normalize_candidates()`` has run.
    """

    _MODES = ('only_expanded', 'with_expanded', 'liwc_only')

    def __init__(self, liwcExtractor, corpus):
        """Build all co-occurrence statistics and per-category rankings.

        Args:
            liwcExtractor: object exposing ``mapCategoriesToNumbers``,
                ``categories`` and ``lookup`` (pairs of a pattern object
                with ``findall`` and a list of category references).
            corpus: re-iterable collection of document strings.
        """
        print("Tokenizing corpus and getting frequencies...")
        self.tokens = []
        self.corpus = corpus
        self.categories = {}
        self.cc_icf = None

        self.getFrequencies()
        vocab_size = len(self.freqdist)
        self.corpus2corpus = np.zeros((vocab_size, vocab_size))
        self.corpusLookup = {word: index for index, word in enumerate(self.freqdist.keys())}
        self.corpusLookupRev = {index: word for word, index in self.corpusLookup.items()}

        print("getting co-occurrence from documents")
        for document in corpus:
            self.update_corpus2corpus(document)
        self.scaler = StandardScaler()
        self.scaled_c2c = self.scaler.fit_transform(self.corpus2corpus)

        print("Setting up LIWC")
        self.le = liwcExtractor
        self.cToN = self.le.mapCategoriesToNumbers

        print("Computing co-occurence matrices with LIWC")
        self.categories_to_wordind = {c['name']: [] for c in self.le.categories.values()}
        for regex, r_indeces in self.le.lookup:
            for word, index in self.corpusLookup.items():
                if regex.findall(word):
                    for ri in r_indeces:
                        name = self.le.categories[self.cToN[ri]]['name']
                        self.categories_to_wordind[name].append(index)

        self.category_mean_vectors = {}
        for name, word_indices in self.categories_to_wordind.items():
            if word_indices:
                rows = self.scaled_c2c[word_indices]
                self.category_mean_vectors[name] = np.mean(rows, axis=0)
            else:
                # No corpus word belongs to this category: use a zero vector.
                # (Testing emptiness directly, rather than "sum == 0", keeps
                # non-empty categories whose rows happen to cancel out.)
                self.category_mean_vectors[name] = np.zeros(vocab_size)

        print("Sorting words by co-occurrence similarity")
        # Fix one explicit ordering so matrix rows and index maps agree.
        category_names = list(self.category_mean_vectors)
        self.category_to_index = {name: i for i, name in enumerate(category_names)}
        self.index_to_category = {i: name for name, i in self.category_to_index.items()}

        self.category_mean_matrix = np.array(
            [self.category_mean_vectors[name] for name in category_names])
        self.distance_matrix = pairwise_distances(
            self.category_mean_matrix, Y=self.scaled_c2c, metric='cosine')

        for name, row in self.category_to_index.items():
            ranked = sorted(enumerate(self.distance_matrix[row]), key=lambda pair: pair[1])
            self.categories[name] = [(self.corpusLookupRev[i], dist) for i, dist in ranked]

    def normalize_candidates(self):
        """Compute ``self.cc_icf``: per-category scores, best first.

        For each category a word's distance ``d`` is turned into
        ``max_distance_in_category - d`` and divided by the sum of the
        word's raw distances over all categories.  Words whose distances
        sum to zero get a score of ``0.0`` instead of dividing by zero.
        """
        distance_totals = {word: 0 for word in self.corpusLookup}
        similarities = {}

        for name, ranked in self.categories.items():
            max_dist = max(dist for _, dist in ranked) if ranked else 0.0
            similarities[name] = [(word, max_dist - dist) for word, dist in ranked]
            for word, dist in ranked:
                distance_totals[word] += dist

        self.cc_icf = {}
        for name, sims in similarities.items():
            scored = [
                (word, sim / float(distance_totals[word]) if distance_totals[word] else 0.0)
                for word, sim in sims
            ]
            self.cc_icf[name] = sorted(scored, key=lambda pair: pair[1], reverse=True)

    def normalize(self, token):
        """Return the lower-cased token, or ``False`` if any character is non-alphabetic."""
        t = token.lower()
        # add other normalization steps here
        if all(ch.isalpha() for ch in t):
            return t
        return False

    def update_corpus2corpus(self, document):
        """Add one document's token co-occurrence counts to ``corpus2corpus``.

        Documents with four or fewer usable tokens are ignored.
        """
        normalized = (self.normalize(t) for t in wt(document))
        doc_tokens = [t for t in normalized if t]
        if len(doc_tokens) > 4:
            self._add_cooccurrences([self.corpusLookup[t] for t in doc_tokens])

    def _add_cooccurrences(self, token_indices):
        """Add count_i * count_j to cell (i, j) for the given token index list."""
        if not token_indices:
            return
        # Equivalent to incrementing the cell once per ordered token pair,
        # but done as a single outer product over the distinct indices.
        idx, counts = np.unique(token_indices, return_counts=True)
        self.corpus2corpus[np.ix_(idx, idx)] += np.outer(counts, counts)

    def getFrequencies(self):
        """Tokenize the corpus, collect normalized tokens and build ``freqdist``."""
        for document in self.corpus:
            for t in wt(document):
                nt = self.normalize(t)
                if nt:  # falsy when it is a token we don't want
                    self.tokens.append(nt)
        self.freqdist = FreqDist(self.tokens)

    def getIndex(self, ind):
        """Map a word to its vocabulary index, or an index back to its word.

        Accepts byte/unicode strings and Python or numpy integers.  Any
        other type prints a message and returns ``None``.
        """
        if isinstance(ind, (str, type(u''))):
            return self.corpusLookup[ind]
        if isinstance(ind, (int, np.integer)):
            return self.corpusLookupRev[ind]
        print("index must be string or integer")
        return None

    def getCategory(self, category_title, normalized, mode='only_expanded'):
        """Return the ranked ``(word, score)`` list for one category.

        Args:
            category_title: LIWC category name.
            normalized: if true, use the scores from
                ``normalize_candidates()`` (computed on demand when they
                are missing); otherwise use raw cosine distances.
            mode: ``'only_expanded'`` (words LIWC does not put in the
                category), ``'liwc_only'`` (words it does), or
                ``'with_expanded'`` (everything).

        Raises:
            ValueError: if ``mode`` is not one of the three values above.
        """
        if mode not in self._MODES:
            raise ValueError("mode must be one of %s, got %r" % (self._MODES, mode))

        if normalized:
            if getattr(self, 'cc_icf', None) is None:
                self.normalize_candidates()
            ranked = self.cc_icf[category_title]
        else:
            ranked = self.categories[category_title]

        if mode == 'with_expanded':
            return ranked

        # A set makes each membership test O(1) instead of scanning the list.
        liwc_indices = set(self.categories_to_wordind[category_title])
        want_liwc = mode == 'liwc_only'
        return [entry for entry in ranked
                if (self.corpusLookup[entry[0]] in liwc_indices) == want_liwc]

In [437]:
test = ["This might be a script.1 12 34", "This might be another script"]

In [438]:
lp = liwcPlus(le, test)

Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity


In [439]:
lp.normalize_candidates()

In [440]:
lp.categories['sad']

[('a', 1.0),
 ('be', 1.0),
 ('script', 1.0),
 ('this', 1.0),
 ('another', 1.0),
 ('might', 1.0)]

# Live Run

## load files

In [322]:
pwd

u'/Users/scottcambo/Documents/grad_school/text_analytics/final_project/final_project_from_spring2015/liwcExtractor'

In [323]:
ls ../data/dialogue_files/

animation2.txt  drama2.txt      horror2.txt     sci-fi2.txt
biography2.txt  family2.txt     musical2.txt    sport2.txt
comedy2.txt     film-noir2.txt  mystery2.txt    thriller2.txt
crime2.txt      history2.txt    romance2.txt    war2.txt


In [76]:
import os

In [77]:
# Read every dialogue file into memory, keyed by genre name.  The genre is
# the part of the file name before the first '2' (e.g. "comedy2.txt").
genres = {}
path = '../data/dialogue_files/'
filenames = os.listdir(path)
for fn in filenames:
    genre_name = fn.split('2')[0]
    with open(os.path.join(path, fn)) as fileIn:
        genres[genre_name] = [line.strip() for line in fileIn]

In [78]:
# Report how many dialogue lines were loaded for each genre.
for name, lines in genres.items():
    print("%s : %s" % (name, len(lines)))

mystery : 31453
romance : 19673
sport : 12844
sci-fi : 3476
family : 2122
horror : 22777
film-noir : 2790
crime : 13825
drama : 28860
animation : 6780
thriller : 17870
comedy : 23752
war : 10986
musical : 3056
biography : 27511
history : 11997


## Compute and Write Everything Out

In [84]:
comedy_lp = liwcPlus(le, genres['comedy'])

Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity


In [91]:
comedy_lp.normalize_candidates()

In [93]:
comedy_lp.getCategory('posemo', True)

[('sonavabitch', 0.0057428655923764081),
 ('dell', 0.0057428655923764081),
 ('uncertain', 0.0057428655923764081),
 ('habrayshit', 0.0057428655923764081),
 ('resilient', 0.0057428655923764081),
 ('jasmine', 0.0057428655923764081),
 ('moonrise', 0.0057428655923764081),
 ('demerit', 0.0057428655923764081),
 ('aaaaaaah', 0.0057428655923764081),
 ('concur', 0.0057428655923764081),
 ('wopat', 0.0057428655923764081),
 ('tangy', 0.0057428655923764081),
 ('pretzel', 0.0057428655923764081),
 ('claws', 0.0057428655923764081),
 ('smeller', 0.0057428655923764081),
 ('davy', 0.0057428655923764081),
 ('obeying', 0.0057428655923764081),
 ('campout', 0.0057428655923764081),
 ('eyebrows', 0.0057428655923764081),
 ('javelina', 0.0057428655923764081),
 ('growling', 0.0057428655923764081),
 ('syphilis', 0.0057428655923764081),
 ('umm', 0.0057428655923764081),
 ('urgh', 0.0057428655923764081),
 ('verheiden', 0.0057428655923764081),
 ('brundesbragonester', 0.0057428655923764081),
 ('ehhh', 0.0057428655923764

In [94]:
comedy_lp.getCategory('negemo', True)

[('sonavabitch', 0.017568260821183224),
 ('dell', 0.017568260821183224),
 ('habrayshit', 0.017568260821183224),
 ('resilient', 0.017568260821183224),
 ('jasmine', 0.017568260821183224),
 ('moonrise', 0.017568260821183224),
 ('demerit', 0.017568260821183224),
 ('aaaaaaah', 0.017568260821183224),
 ('concur', 0.017568260821183224),
 ('wopat', 0.017568260821183224),
 ('tangy', 0.017568260821183224),
 ('pretzel', 0.017568260821183224),
 ('claws', 0.017568260821183224),
 ('smeller', 0.017568260821183224),
 ('davy', 0.017568260821183224),
 ('obeying', 0.017568260821183224),
 ('campout', 0.017568260821183224),
 ('eyebrows', 0.017568260821183224),
 ('javelina', 0.017568260821183224),
 ('growling', 0.017568260821183224),
 ('syphilis', 0.017568260821183224),
 ('umm', 0.017568260821183224),
 ('urgh', 0.017568260821183224),
 ('verheiden', 0.017568260821183224),
 ('brundesbragonester', 0.017568260821183224),
 ('ehhh', 0.017568260821183224),
 ('sniffer', 0.017568260821183224),
 ('goffs', 0.0175682608

In [465]:
# For each genre: build the model, then dump every LIWC category to
# "<genre>.data" as a '%name' header followed by two blank-line-terminated
# sections of "word,score" lines (expanded words first, LIWC words second).
for genre, dialogue in genres.items():
    print("Cranking on %s" % genre)
    print("--------------")
    lp = liwcPlus(le, dialogue)
    lp.normalize_candidates()
    with open(genre + ".data", 'w') as fileOut:
        for liwc_cat in lp.categories.keys():
            sections = (
                lp.getCategory(liwc_cat, True, mode="only_expanded"),
                lp.getCategory(liwc_cat, True, mode='liwc_only'),
            )
            fileOut.write('%' + liwc_cat + '\n')
            for section in sections:
                fileOut.writelines(entry[0] + ',' + str(entry[1]) + '\n' for entry in section)
                fileOut.write('\n')

Cranking on mystery
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on romance
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on sport
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on sci-fi
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on family
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Com

In [465]:
# Re-run of the export cell: build the model per genre and write each LIWC
# category to "<genre>.data" ('%name' header, expanded words, blank line,
# LIWC words, blank line).
for genre, dialogue in genres.items():
    print("Cranking on %s" % genre)
    print("--------------")
    lp = liwcPlus(le, dialogue)
    lp.normalize_candidates()
    with open(genre + ".data", 'w') as fileOut:
        for liwc_cat in lp.categories.keys():
            sections = (
                lp.getCategory(liwc_cat, True, mode="only_expanded"),
                lp.getCategory(liwc_cat, True, mode='liwc_only'),
            )
            fileOut.write('%' + liwc_cat + '\n')
            for section in sections:
                fileOut.writelines(entry[0] + ',' + str(entry[1]) + '\n' for entry in section)
                fileOut.write('\n')

Cranking on mystery
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on romance
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on sport
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on sci-fi
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Computing co-occurence matrices with LIWC
Sorting words by co-occurrence similarity
Cranking on family
--------------
Tokenizing corpus and getting frequencies...
getting co-occurrence from documents
Setting up LIWC
Com

In [459]:
ls ./*.data

./animation.data  ./drama.data      ./horror.data     ./sci-fi.data
./biography.data  ./family.data     ./musical.data    ./sport.data
./comedy.data     ./film-noir.data  ./mystery.data    ./thriller.data
./crime.data      ./history.data    ./romance.data    ./war.data


In [483]:
cat biography.data | head -n 50

%body
orwell,0.00755736890081
hemorrhoids,0.00755736890081
bedraggled,0.00755736890081
wrinkled,0.00755736890081
bellhop,0.00755736890081
clumps,0.00755736890081
abound,0.00755736890081
repulsed,0.00755736890081
shriveled,0.00755736890081
leaking,0.00755736890081
patches,0.00755736890081
obscuring,0.00755736890081
prune,0.00745099659211
simile,0.00738587489125
flabby,0.00727218704198
searches,0.00708355854445
asshole,0.00696881536801
spots,0.00686079894771
practically,0.0068239527058
canes,0.00680314293248
gimmie,0.00680314293248
porche,0.00680314293248
headquarters,0.00680314293248
beaut,0.00680314293248
ghowaro,0.00680314293248
diction,0.00680314293248
rb,0.00680314293248
stacy,0.00680314293248
chlorine,0.00680314293248
jumpin,0.00680314293248
hah,0.00680314293248
umm,0.00680314293248
urgh,0.00680314293248
repellent,0.00680314293248
sor,0.00680314293248
frazzled,0.00680314293248
zonnie,0.00680314293248
regazzi,0.00680314293248
goffs,0.0068031429

In [2]:
import os

In [3]:
# Collect the exported genre files.  Match on the suffix so names that merely
# contain ".data" somewhere (e.g. "x.data.bak") are not picked up.
files = [f for f in os.listdir('./') if f.endswith('.data')]

In [4]:
files

['animation.data',
 'biography.data',
 'comedy.data',
 'crime.data',
 'drama.data',
 'family.data',
 'film-noir.data',
 'history.data',
 'horror.data',
 'musical.data',
 'mystery.data',
 'romance.data',
 'sci-fi.data',
 'sport.data',
 'thriller.data',
 'war.data']

In [6]:
# Load the raw lines of every exported file, keyed by the file name up to
# its first '.'.
genre_data = {}
for f in files:
    genre_name = f.split(".")[0]
    with open('./' + f, 'r') as fileIn:
        genre_data[genre_name] = list(fileIn)

In [7]:
genre_data.keys()

['mystery',
 'romance',
 'sport',
 'sci-fi',
 'family',
 'horror',
 'film-noir',
 'crime',
 'drama',
 'animation',
 'thriller',
 'comedy',
 'war',
 'musical',
 'biography',
 'history']

In [9]:
# Report how many lines were read back from each genre's .data file.
for name, lines in genre_data.items():
    print("%s : %s" % (name, len(lines)))

mystery : 1066494
romance : 770484
sport : 633996
sci-fi : 275748
family : 213972
horror : 849552
film-noir : 257334
crime : 703560
drama : 1056132
animation : 0
thriller : 740652
comedy : 913110
war : 542718
musical : 302478
biography : 1099362
history : 725934


In [66]:
# Parse the exported lines back into
#   genre_liwc[genre][category] = {'expanded': [(word, score), ...],
#                                  'liwc':     [(word, score), ...]}
# A '%' line names the category; the first blank line after it ends the
# expanded section and the second one ends the LIWC section.
genre_liwc = {}
for genre_name, lines in genre_data.items():
    print("computing genre : %s" % genre_name)
    parsed = {}
    current_cat = None
    expanded_words = []
    liwc_words = []
    in_liwc_section = False
    for line in lines:
        if line[0] == '%':  # category header
            current_cat = line[1:].strip()
            continue
        if len(line) > 3:  # "word,score" data line
            fields = line.split(",")
            entry = (fields[0], float(fields[1].strip()))
            if in_liwc_section:
                liwc_words.append(entry)
            else:
                expanded_words.append(entry)
            continue
        if line != '\n':
            continue
        # Blank separator: either switch to the LIWC section or close the category.
        if in_liwc_section:
            parsed[current_cat] = {'expanded': expanded_words,
                                   'liwc': liwc_words}
            liwc_words = []
            expanded_words = []
        in_liwc_section = not in_liwc_section
    genre_liwc[genre_name.strip()] = parsed

computing genre : mystery
computing genre : romance
computing genre : sport
computing genre : sci-fi
computing genre : family
computing genre : horror
computing genre : film-noir
computing genre : crime
computing genre : drama
computing genre : animation
computing genre : thriller
computing genre : comedy
computing genre : war
computing genre : musical
computing genre : biography
computing genre : history


In [55]:
genre_liwc.keys()

['mystery',
 'drama',
 'comedy',
 'sci-fi',
 'family',
 'horror',
 'film-noir',
 'crime',
 'romance',
 'musical',
 'animation',
 'biography',
 'sport',
 'war',
 'thriller',
 'history']

In [56]:
genre_liwc['drama'].keys()

['relig',
 'cogmech',
 'family',
 'funct',
 'feel',
 'money',
 'insight',
 'humans',
 'we',
 'sad',
 'past',
 'negate',
 'see',
 'article',
 'anger',
 'home',
 'conj',
 'sexual',
 'negemo',
 'ppron',
 'inhib',
 'pronoun',
 'adverb',
 'space',
 'cornell',
 'ipron',
 'anx',
 'preps',
 'percept',
 'future',
 'quant',
 'location',
 'posemo',
 'certain',
 'relativ',
 'health',
 'you',
 'incl',
 'friend',
 'body',
 'bio',
 'tentat',
 'assent',
 'discrep',
 'leisure',
 'number',
 'verb',
 'hear',
 'they',
 'affect',
 'excl',
 'present',
 'death',
 'i',
 'cause',
 'work',
 'ingest',
 'motion',
 'filler',
 'nonfl',
 'swear',
 'achieve',
 'time',
 'auxverb',
 'shehe',
 'social']

In [72]:
genre_liwc['comedy']['posemo']['expanded'][:10]

[('sonavabitch', 0.00574286559238),
 ('dell', 0.00574286559238),
 ('uncertain', 0.00574286559238),
 ('habrayshit', 0.00574286559238),
 ('resilient', 0.00574286559238),
 ('jasmine', 0.00574286559238),
 ('moonrise', 0.00574286559238),
 ('demerit', 0.00574286559238),
 ('aaaaaaah', 0.00574286559238),
 ('concur', 0.00574286559238)]

In [89]:
genre_liwc['comedy']['negemo']['expanded'][:10]

[('sonavabitch', 0.0175682608212),
 ('dell', 0.0175682608212),
 ('habrayshit', 0.0175682608212),
 ('resilient', 0.0175682608212),
 ('jasmine', 0.0175682608212),
 ('moonrise', 0.0175682608212),
 ('demerit', 0.0175682608212),
 ('aaaaaaah', 0.0175682608212),
 ('concur', 0.0175682608212),
 ('wopat', 0.0175682608212)]

In [67]:
genre_liwc['romance']['negemo']['expanded'][:10]

[('dildoes', 0.0187855871953),
 ('aus', 0.0187855871953),
 ('immature', 0.0187855871953),
 ('aaaaaaah', 0.0187855871953),
 ('oooo', 0.0187855871953),
 ('woody', 0.0187855871953),
 ('revised', 0.0187855871953),
 ('natured', 0.0187855871953),
 ('hola', 0.0187855871953),
 ('carlissima', 0.0187855871953)]

In [68]:
genre_liwc['romance']['sad']['expanded'][:10]

[('immature', 0.0126786505727),
 ('aaaaaaah', 0.0126786505727),
 ('oooo', 0.0126786505727),
 ('woody', 0.0126786505727),
 ('revised', 0.0126786505727),
 ('natured', 0.0126786505727),
 ('hola', 0.0126786505727),
 ('carlissima', 0.0126786505727),
 ('moonrise', 0.0126786505727),
 ('decels', 0.0126786505727)]

In [70]:
genre_liwc['romance']['anger']['expanded'][:10]

[('immature', 0.0141883244555),
 ('aaaaaaah', 0.0141883244555),
 ('oooo', 0.0141883244555),
 ('woody', 0.0141883244555),
 ('revised', 0.0141883244555),
 ('natured', 0.0141883244555),
 ('hola', 0.0141883244555),
 ('carlissima', 0.0141883244555),
 ('moonrise', 0.0141883244555),
 ('decels', 0.0141883244555)]

In [65]:
genre_liwc['romance']['posemo']['liwc'][:10]

[('flirt', 0.00805270026957),
 ('warmly', 0.00805270026957),
 ('gracious', 0.00805270026957),
 ('excitable', 0.00805270026957),
 ('cutey', 0.00805270026957),
 ('splenda', 0.00805270026957),
 ('cheerily', 0.00805270026957),
 ('giggles', 0.00805270026957),
 ('favourites', 0.00805270026957),
 ('courageous', 0.00805270026957)]

In [45]:
genre_liwc['romance']['posemo']['expanded'][:10]

[('immature', 0.00892741674547),
 ('aaaaaaah', 0.00892741674547),
 ('oooo', 0.00892741674547),
 ('woody', 0.00892741674547),
 ('revised', 0.00892741674547),
 ('natured', 0.00892741674547),
 ('hola', 0.00892741674547),
 ('carlissima', 0.00892741674547),
 ('moonrise', 0.00892741674547),
 ('decels', 0.00892741674547)]

In [39]:
genre_liwc['romance']['negemo']['expanded'][:10]

[('immature', 0.00892741674547),
 ('aaaaaaah', 0.00892741674547),
 ('oooo', 0.00892741674547),
 ('woody', 0.00892741674547),
 ('revised', 0.00892741674547),
 ('natured', 0.00892741674547),
 ('hola', 0.00892741674547),
 ('carlissima', 0.00892741674547),
 ('moonrise', 0.00892741674547),
 ('decels', 0.00892741674547)]

In [40]:
genre_liwc['romance']['posemo']['expanded'][:10]

[('immature', 0.00892741674547),
 ('aaaaaaah', 0.00892741674547),
 ('oooo', 0.00892741674547),
 ('woody', 0.00892741674547),
 ('revised', 0.00892741674547),
 ('natured', 0.00892741674547),
 ('hola', 0.00892741674547),
 ('carlissima', 0.00892741674547),
 ('moonrise', 0.00892741674547),
 ('decels', 0.00892741674547)]

In [41]:
genre_liwc['romance']['anger']['expanded'][:10]

[('immature', 0.00892741674547),
 ('aaaaaaah', 0.00892741674547),
 ('oooo', 0.00892741674547),
 ('woody', 0.00892741674547),
 ('revised', 0.00892741674547),
 ('natured', 0.00892741674547),
 ('hola', 0.00892741674547),
 ('carlissima', 0.00892741674547),
 ('moonrise', 0.00892741674547),
 ('decels', 0.00892741674547)]

In [42]:
genre_liwc['drama']['anger']['expanded'][:10]

[('simpson', 0.00759735295187),
 ('constanza', 0.00759735295187),
 ('gynecologist', 0.00759735295187),
 ('mozarella', 0.00759735295187),
 ('porche', 0.00759735295187),
 ('disinterested', 0.00759735295187),
 ('catchy', 0.00759735295187),
 ('morally', 0.00759735295187),
 ('attracts', 0.00759735295187),
 ('murali', 0.00759735295187)]

In [97]:
genre_liwc['drama']['ingest']['expanded'][:40]

[('mozarella', 0.00842399354375),
 ('porche', 0.00842399354375),
 ('disinterested', 0.00842399354375),
 ('catchy', 0.00842399354375),
 ('morally', 0.00842399354375),
 ('attracts', 0.00842399354375),
 ('murali', 0.00842399354375),
 ('pokemon', 0.00842399354375),
 ('ahmed', 0.00842399354375),
 ('twinkies', 0.00842399354375),
 ('charnofksy', 0.00842399354375),
 ('spook', 0.00842399354375),
 ('plath', 0.00842399354375),
 ('abbreviation', 0.00842399354375),
 ('questo', 0.00842399354375),
 ('sizisra', 0.00842399354375),
 ('troublesome', 0.00842399354375),
 ('spit', 0.00842399354375),
 ('spic', 0.00842399354375),
 ('onxy', 0.00842399354375),
 ('meade', 0.00842399354375),
 ('perking', 0.00842399354375),
 ('syphilis', 0.00842399354375),
 ('umm', 0.00842399354375),
 ('sor', 0.00842399354375),
 ('tallises', 0.00842399354375),
 ('entendre', 0.00842399354375),
 ('burch', 0.00842399354375),
 ('olandese', 0.00842399354375),
 ('stabilizers', 0.00842399354375),
 ('grandad', 0.00842399354375),
 ('compli

In [98]:
genre_liwc['horror']['ingest']['expanded'][:40]

[('ortezia', 0.01230440882),
 ('aja', 0.01230440882),
 ('herr', 0.01230440882),
 ('reddy', 0.01230440882),
 ('sweeeeeeneeeeey', 0.01230440882),
 ('dragons', 0.01230440882),
 ('luring', 0.01230440882),
 ('ho', 0.01230440882),
 ('stevens', 0.01230440882),
 ('educating', 0.01230440882),
 ('cyberdyne', 0.01230440882),
 ('handcuffs', 0.01230440882),
 ('underpants', 0.01230440882),
 ('groupie', 0.01230440882),
 ('pretzel', 0.01230440882),
 ('trickster', 0.01230440882),
 ('cult', 0.01230440882),
 ('aaaaaaaaaaaaaaaaaa', 0.01230440882),
 ('doofus', 0.01230440882),
 ('aah', 0.01230440882),
 ('tops', 0.01230440882),
 ('registering', 0.01230440882),
 ('haw', 0.01230440882),
 ('marshall', 0.01230440882),
 ('starrrrrlaaaaaa', 0.01230440882),
 ('verheiden', 0.01230440882),
 ('rechecking', 0.01230440882),
 ('hightailin', 0.01230440882),
 ('hobbies', 0.01230440882),
 ('prevent', 0.01230440882),
 ('wahhhhhh', 0.01230440882),
 ('abortion', 0.01230440882),
 ('mentality', 0.01230440882),
 ('evidently', 0.0

* change distance function
* clean up update_corpus2corpus() method
* with scaling, the choice between cosine and Euclidean distance might not change the results