Repository for this code: https://github.com/TheCDC/CSC413_Midterm_Project

Test out tokenization method(s) and verify by eye.

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = [ w for w in words if w not in stopWords]
print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


Define a function to strip unwanted input i.e. stopwords, etc.

In [2]:
def lex(statement):
    words = []
    for w in word_tokenize(statement.lower()):
        if w not in stopWords:
            words.append(w)
    return words
    return [ w for w in word_tokenize(statement) if w not in stopWords]

In [3]:
lex(data)

['work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 '.',
 'work',
 'play',
 'makes',
 'jack',
 'dull',
 'boy',
 '.']

Define a function for feature extraction.

In [4]:
def parse(lexed_statement,ns=(1,2),skipgrams=True):
    out = []
    for n in ns:
        for i in range(len(lexed_statement)):
            out.append(tuple(lexed_statement[i:i+n]))
    for index, item in enumerate(lexed_statement):
        try:
            out.append((item,lexed_statement[index+2]))
        except IndexError:
            pass
    return set(tuple(out))

Verify by eye.

In [5]:
lexed  = lex(data)
parse(lexed)

{('.',),
 ('.', 'play'),
 ('.', 'work'),
 ('boy',),
 ('boy', '.'),
 ('boy', 'work'),
 ('dull',),
 ('dull', '.'),
 ('dull', 'boy'),
 ('jack',),
 ('jack', 'boy'),
 ('jack', 'dull'),
 ('makes',),
 ('makes', 'dull'),
 ('makes', 'jack'),
 ('play',),
 ('play', 'jack'),
 ('play', 'makes'),
 ('work',),
 ('work', 'makes'),
 ('work', 'play')}

Define helper function for convenience.

In [6]:
def lexed_and_parsed(statement):
    return parse(lex(statement))

Define a struct for a statement and also a  mapping from the ambiguous truth values to numbers.

In [7]:
truth_text_mapping = {
    'pants-fire':0,
    'false':0.1,
    'barely-true':0.25,
    'half-true':0.5,
    'mostly-true':0.75,
    'true':1
}
class Statement:
    def __init__(self, body, speaker, value):
        self.body = body
        self.speaker = speaker
        self.value = truth_text_mapping[value]
    
    
    @staticmethod
    def from_row(row):
        return Statement(value=row[1], body=row[2], speaker=row[4])
    
    
    def __repr__(self):
        arg_str =  str(', '.join(['='.join([i[0],repr(i[1])]) for i in vars(self).items()]))
        return "Statement({})".format(arg_str)
    
    
    def __str__(self):
        return repr(self)

Define a convenient method for loading the datasets.

In [8]:
import csv

def load_liar_data(path):
    statements = []
    with open(path) as data_file:
        reader = csv.reader(data_file, delimiter='\t', quotechar='"')
        for row in reader:
            try:
                statements.append(Statement.from_row(row))
            except IndexError:
                print(row,len(row))
    return statements
statements = load_liar_data("../datasets/LIAR/train.tsv") 
# print out some statements to verify by eye.
statements[:5]

[Statement(value=0.1, speaker='dwayne-bohac', body='Says the Annies List political group supports third-trimester abortions on demand.'),
 Statement(value=0.5, speaker='scott-surovell', body='When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.'),
 Statement(value=0.75, speaker='barack-obama', body='Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."'),
 Statement(value=0.1, speaker='blog-posting', body='Health care reform legislation is likely to mandate free sex change surgeries.'),
 Statement(value=0.5, speaker='charlie-crist', body='The economic turnaround started at the end of my term.')]

Inspect the size of the vocabulary.

In [9]:
vocabulary = set()
for s in statements:
    vocabulary.update(lexed_and_parsed(s.body))
len(vocabulary)

171406

Count how many times a given word tuple is associated with each truth value.

In [10]:
from collections import defaultdict
word_value_counters = dict()
for s in statements:
    for word_tuple in lexed_and_parsed(s.body):
        if word_tuple not in word_value_counters:
            word_value_counters[word_tuple] =  defaultdict(lambda :0 )
        counter = word_value_counters[word_tuple]
        counter.update({s.value:counter[s.value]+1}) 
word_value_counters = {k:v for k,v in word_value_counters.items() if sum(v.values()) > 1 }

Inspect that output.

In [11]:
list(sorted(word_value_counters.items(), reverse=True, key=lambda i: (max(i[1].values()))))[:10]

[(('.',),
  defaultdict(<function __main__.<lambda>>,
              {0: 804,
               0.1: 1907,
               0.25: 1609,
               0.5: 2040,
               0.75: 1916,
               1: 1630})),
 ((',',),
  defaultdict(<function __main__.<lambda>>,
              {0: 266, 0.1: 566, 0.25: 601, 0.5: 803, 0.75: 728, 1: 572})),
 (('says',),
  defaultdict(<function __main__.<lambda>>,
              {0: 229, 0.1: 468, 0.25: 446, 0.5: 487, 0.75: 401, 1: 317})),
 (('$',),
  defaultdict(<function __main__.<lambda>>,
              {0: 61, 0.1: 182, 0.25: 192, 0.5: 258, 0.75: 229, 1: 149})),
 (('percent',),
  defaultdict(<function __main__.<lambda>>,
              {0: 40, 0.1: 134, 0.25: 133, 0.5: 228, 0.75: 257, 1: 192})),
 (('(',),
  defaultdict(<function __main__.<lambda>>,
              {0: 62, 0.1: 166, 0.25: 135, 0.5: 187, 0.75: 177, 1: 140})),
 ((')',),
  defaultdict(<function __main__.<lambda>>,
              {0: 62, 0.1: 166, 0.25: 135, 0.5: 187, 0.75: 178, 1: 140})),
 (('o

Convert counts to frequencies.

In [12]:
for word,counter in word_value_counters.items():
    s = sum(counter.values())
    for k,v in counter.items():
        counter[k] = v/s
list(sorted(word_value_counters.items(), reverse=True, key=lambda i: (max(i[1].values()))))[:20]

[(('determined',), defaultdict(<function __main__.<lambda>>, {0: 1.0})),
 (('secretary', 'paying'),
  defaultdict(<function __main__.<lambda>>, {1: 1.0})),
 (('oil', 'hook'), defaultdict(<function __main__.<lambda>>, {0.5: 1.0})),
 (('scott', 'governor'),
  defaultdict(<function __main__.<lambda>>, {0.1: 1.0})),
 (('businesses', ','), defaultdict(<function __main__.<lambda>>, {0.1: 1.0})),
 (('sen.', 'bernie'), defaultdict(<function __main__.<lambda>>, {0.5: 1.0})),
 (('voted', 'house'), defaultdict(<function __main__.<lambda>>, {0.75: 1.0})),
 (('harmed', '.'), defaultdict(<function __main__.<lambda>>, {0.5: 1.0})),
 (('reproductive', 'health'),
  defaultdict(<function __main__.<lambda>>, {1: 1.0})),
 (('2008', 'democratic'),
  defaultdict(<function __main__.<lambda>>, {0.5: 1.0})),
 (('states', 'business'),
  defaultdict(<function __main__.<lambda>>, {0.5: 1.0})),
 (('care', 'illegal'), defaultdict(<function __main__.<lambda>>, {0.1: 1.0})),
 (('people', 'opposed'),
  defaultdict(<fu

Define a class to encapsulate the previously implemented functionality.

In [13]:
class NaiveBayesClassifier:
    def __init__(self, statements):
        self.classes = set()
        # construct a set of all word tuples in the dataset
        vocabulary = set()
        for s in statements:
            lexed  = lex(s.body)
            parsed = parse(lexed, ns=[1,2,3])
            vocabulary.update(parsed)
        self.vocabulary = vocabulary
        # count number of occurences of each word tuple with each truth value
        word_value_counters = dict()
        for s in statements:
            for word_tuple in lexed_and_parsed(s.body):
                if word_tuple not in word_value_counters:
                    word_value_counters[word_tuple] =  defaultdict(lambda :0 )
                counter = word_value_counters[word_tuple]
                counter.update({s.value:counter[s.value]+1}) 
        # filter out words that have only appear once
        word_value_counters = {k:v for k,v in word_value_counters.items() if sum(v.values()) < 2 }
        # convert number of occurrences to frequency
        for word,counter in word_value_counters.items():
            s = sum(counter.values())
            for k,v in counter.items():
                counter[k] = v/s
        self.word_value_counters = word_value_counters
        print("Classifier vocab size: {}".format(len(self.vocabulary)))
        
    def classify_statement(self, statement):
#         print(self.word_value_counters)
        value_probabilities = {k:[0] for k in truth_text_mapping.values()}
        words = lexed_and_parsed(statement.body)
#         build the list of coefficients for each value
        for w in words:
            if w in self.word_value_counters:
                assert isinstance(self.word_value_counters[w],defaultdict)
                for k,v in self.word_value_counters[w].items():
#                     assert isinstance(k,float)
#                     assert isinstance(v,float)
                    value_probabilities[k].append(v)
        # compute products of coefficients
        products = defaultdict(lambda: 0)
#         print(value_probabilities)
        for k,v in value_probabilities.items():
            prod = 1
            for x in v:
                prod *= (x+1)
            products[k] = prod
        # sort output classes based on how well they match the input
        sorted_classes = sorted(products.items(), key=lambda t: t[1], reverse=True)
        # return only the best match
        return sorted_classes[0]

In [14]:
cls = NaiveBayesClassifier(statements)

Classifier vocab size: 275987


In [15]:
s = statements[5]
print(s,cls.classify_statement(s))

Statement(value=1, speaker='robin-vos', body='The Chicago Bears have had more starting quarterbacks in the last 10 years than the total number of tenured (UW) faculty fired during the last two decades.') (1, 1048576.0)


In [16]:
def score(cls, statements):
    """A function to verify that the classifier does not error on a set of input"""
    c = 0
    for s in statements:
        c += cls.classify_statement(s)[0] == s.value
    return c

In [17]:
n = score(cls, statements)
print("Correct={}, set size={}, fraction={}".format(n,len(statements),n/len(statements)))

Correct=10162, set size=10241, fraction=0.9922859095791426


In [18]:
testing_dataset = load_liar_data("../datasets/LIAR/test.tsv")
n = score(cls, testing_dataset)
ls = len(testing_dataset)
print("Correct={}, set size={}, fraction={}".format(n,ls,n/ls))

Correct=270, set size=1267, fraction=0.21310181531176006


In [19]:
testing_dataset = load_liar_data("../datasets/LIAR/valid.tsv")
n = score(cls, testing_dataset)
ls = len(testing_dataset)
print("Correct={}, set size={}, fraction={}".format(n,ls,n/ls))

Correct=284, set size=1284, fraction=0.22118380062305296
