In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
# Sample text reused by the later demo cells (`lex(data)`).
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
# NLTK's English stopword list, held as a set; `lex` also reads this name.
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
# The membership test is case-sensitive and punctuation is not filtered here:
# the printed result still contains the capitalised 'All' and the '.' tokens.
wordsFiltered = [ w for w in words if w not in stopWords]
print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


In [2]:
def lex(statement):
    """Tokenise ``statement`` and keep only alphabetic, non-stopword tokens.

    Args:
        statement: Text to split with ``word_tokenize``.

    Returns:
        A list of tokens, in their original order, that are not members of the
        module-level ``stopWords`` set and for which ``str.isalpha()`` is true
        (so punctuation and numeric tokens are dropped).

    Note:
        The stopword test is case-sensitive, so a capitalised token such as
        'All' is kept even though its lower-case form may be a stopword.
        NOTE(review): confirm whether that is intended before lower-casing.
    """
    # A second, unreachable `return` (without the isalpha filter) used to follow
    # the real one; it has been removed so the function has a single contract.
    return [
        token
        for token in word_tokenize(statement)
        if token not in stopWords and token.isalpha()
    ]

In [3]:
# Display the tokens `lex` keeps from the sample text.
lex(data)

['All',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 'All',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy']

In [4]:
def parse(lexed_statement, ns=(1, 2)):
    """Return the set of contiguous n-grams found in a token sequence.

    Args:
        lexed_statement: A sliceable sequence of tokens (e.g. the list
            returned by ``lex``).
        ns: Iterable of n-gram sizes to extract; each must be a positive
            integer. Defaults to unigrams and bigrams.

    Returns:
        A ``set`` of tuples. Every tuple has exactly ``n`` tokens for one of
        the requested sizes; a size larger than the sequence contributes
        nothing.

    Raises:
        ValueError: If any requested size is smaller than 1.
    """
    ngrams = set()
    token_count = len(lexed_statement)
    for n in ns:
        if n < 1:
            raise ValueError("n-gram size must be at least 1, got {!r}".format(n))
        # Stop at the last position where a full window of n tokens still
        # fits. Iterating over every index used to emit truncated tuples at
        # the tail (e.g. a 1-tuple when only bigrams were requested).
        for start in range(token_count - n + 1):
            ngrams.add(tuple(lexed_statement[start:start + n]))
    return ngrams

In [5]:
# Lex the sample text, then display the n-gram set `parse` builds from it
# (default sizes: unigrams and bigrams).
lexed  = lex(data)
parse(lexed)

{('All',),
 ('All', 'work'),
 ('boy',),
 ('boy', 'All'),
 ('dull',),
 ('dull', 'boy'),
 ('jack',),
 ('jack', 'dull'),
 ('makes',),
 ('makes', 'jack'),
 ('play',),
 ('play', 'makes'),
 ('work',),
 ('work', 'play')}

In [8]:
def lexed_and_parsed(statement):
    """Run ``lex`` on ``statement`` and return what ``parse`` builds from the tokens."""
    tokens = lex(statement)
    return parse(tokens)

In [17]:
# Numeric score for each truthfulness label string. `Statement.__init__` indexes
# this dict with its `value` argument, so a label not listed here raises KeyError.
# NOTE(review): the scores are unevenly spaced ('false' is 0.1) — confirm intended.
truth_text_mapping = {
    'pants-fire':0,
    'false':0.1,
    'barely-true':0.25,
    'half-true':0.5,
    'mostly-true':0.75,
    'true':1
}
class Statement:
    """One labelled statement: its text, its speaker and a numeric score."""

    def __init__(self, body, speaker, value):
        """Store the text and speaker, and translate the label into a number.

        ``value`` is a label string looked up in ``truth_text_mapping``; a
        label missing from that mapping raises ``KeyError``.
        """
        self.body, self.speaker = body, speaker
        self.value = truth_text_mapping[value]

    @staticmethod
    def from_row(row):
        """Build a Statement from a row: label at index 1, text at 2, speaker at 4.

        Raises ``IndexError`` when the row is too short to have those columns.
        """
        label, text, who = row[1], row[2], row[4]
        return Statement(body=text, speaker=who, value=label)

    def __repr__(self):
        """Render every instance attribute as ``name=repr(value)`` inside ``Statement(...)``."""
        fields = (
            "{}={}".format(name, repr(attr)) for name, attr in vars(self).items()
        )
        return "Statement({})".format(", ".join(fields))

    def __str__(self):
        """Use the same text as ``repr``."""
        return repr(self)

In [18]:
import csv
# Read the LIAR training split (tab-separated) into a list of Statement objects.
statements = []
# newline="" hands line-ending handling to the csv module, as its documentation
# requires for file objects passed to csv.reader; without it, newlines embedded
# in quoted fields may be misread.
# NOTE(review): the text encoding is still the platform default — confirm the
# file's encoding and pass it explicitly if results must be portable.
with open("../datasets/LIAR/train.tsv", newline="") as data_file:
    reader = csv.reader(data_file, delimiter='\t', quotechar='"')
    for row in reader:
        try:
            statements.append(Statement.from_row(row))
        except IndexError:
            # Rows too short for Statement.from_row (it reads up to index 4)
            # are printed with their length and skipped rather than aborting
            # the load.
            print(row,len(row))

# Preview the first few parsed statements.
statements[:5]

[Statement(value=0.1, body='Says the Annies List political group supports third-trimester abortions on demand.', speaker='dwayne-bohac'),
 Statement(value=0.5, body='When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.', speaker='scott-surovell'),
 Statement(value=0.75, body='Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."', speaker='barack-obama'),
 Statement(value=0.1, body='Health care reform legislation is likely to mandate free sex change surgeries.', speaker='blog-posting'),
 Statement(value=0.5, body='The economic turnaround started at the end of my term.', speaker='charlie-crist')]

In [21]:
# Union of the n-gram sets produced for every statement body; the cell
# displays how many distinct n-grams were collected.
vocabulary = set()
for s in statements:
    vocabulary |= lexed_and_parsed(s.body)
len(vocabulary)

85037

In [29]:
# For each n-gram, tally per speaker how many of that speaker's statements
# contain it (lexed_and_parsed yields each n-gram once per statement).
word_value_counters = {}
for s in statements:
    for word_tuple in lexed_and_parsed(s.body):
        counter = word_value_counters.setdefault(word_tuple, {})
        counter[s.speaker] = counter.get(s.speaker, 0) + 1


[(('dictator', 'Josef'), {'paul-broun': 1}), (('special', 'protection'), {'steve-king': 1}), (('loophole', 'Obama'), {'blog-posting': 1}), (('marijuana', 'alonewill'), {'united-care': 1}), (('working', 'Americans'), {'barack-obama': 1, 'americans-united-change': 1, 'chelsea-clinton': 1}), (('Florida', 'North'), {'chris-christie': 1}), (('make', 'could'), {'keith-olbermann': 1}), (('percent', 'officers'), {'tom-barrett': 1}), (('drilled', 'skulls'), {'people-ethical-treatment-animals': 1}), (('County', 'waterfront'), {'kris-jordan': 1}), (('history',), {'greg-abbott': 1, 'jim-renacci': 1, 'cecile-richards': 1, 'david-simas': 1, 'sherrod-brown': 2, 'johnny-isakson': 1, 'freedom-project': 1, 'paul-ryan': 1, 'sarah-palin': 2, 'one-wisconsin-now': 1, 'tammy-baldwin': 1, 'tim-kaine': 1, 'jeff-fitzgerald': 1, 'seminole-tribe-florida': 1, 'robert-menendez': 1, 'robert-koons': 1, 'howard-dean': 1, 'todd-staples': 1, 'occupy-democrats': 1, 'kathleen-falk': 1, 'joaquin-castro': 1, 'peter-kinder':

In [33]:
# Show the 20 n-grams whose largest single-speaker count is highest
# (ties keep their original dictionary order).
sorted(
    word_value_counters.items(),
    key=lambda item: max(item[1].values()),
    reverse=True,
)[:20]

[(('Says',),
  {'Arizona-Citizens-Defense-League': 2,
   'Ballesteros': 1,
   'Yes-on-79': 1,
   'accountability-government': 1,
   'accountability-project': 1,
   'adam-hasner': 1,
   'adam-putnam': 2,
   'advancing-wisconsin': 1,
   'afscme': 2,
   'afscme-people': 1,
   'al-gore': 1,
   'alan-grayson': 7,
   'alan-simpson': 1,
   'alberta-darling': 1,
   'alex-sink': 1,
   'alison-lundergan-grimes': 4,
   'alissa-keny-guyer': 1,
   'allan-levene': 1,
   'allen-boyd': 1,
   'allen-west': 1,
   'allison-tant': 1,
   'alternativepac': 1,
   'amalgamated-transit-union': 1,
   'amanda-fritz': 3,
   'america': 1,
   'america-rising-now': 1,
   'american-bridge-21st-century': 3,
   'american-crossroads': 3,
   'american-future-fund': 2,
   'american-petroleum-institute': 1,
   'american-principles-action': 1,
   'american-unity-pac': 1,
   'americans-job-security': 1,
   'americans-prosperity': 4,
   'americans-prosperity-florida': 1,
   'americans-responsible-solutions': 1,
   'americans-