Repository for this code: https://github.com/TheCDC/CSC413_Midterm_Project

Focus on cleaning up and making the code better.

In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
stopWords = set(stopwords.words('english'))


def lex(statement):
    words = []
    for w in word_tokenize(statement.lower()):
        if w not in stopWords:
            words.append(w)
    return words
    return [ w for w in word_tokenize(statement) if w not in stopWords]


def parse(lexed_statement,ns=(1,2),skipgrams=True):
    out = []
    for n in ns:
        for i in range(len(lexed_statement)):
            out.append(tuple(lexed_statement[i:i+n]))
    for index, item in enumerate(lexed_statement):
        try:
            out.append((item,lexed_statement[index+2]))
        except IndexError:
            pass
    return set(tuple(out))

In [17]:
truth_text_mapping = {
    'pants-fire':0,
    'false':0.1,
    'barely-true':0.25,
    'half-true':0.5,
    'mostly-true':0.75,
    'true':1
}
class Statement:
    def __init__(self, body, speaker, value,context):
        self.body = body
        self.speaker = speaker
        self.value = truth_text_mapping[value]
        self.context = context
    
    
    @staticmethod
    def from_row(row):
        return Statement(value=row[1],
                         body=row[2],
                         speaker=row[4],
                         context=row[13])
    
    
    def __repr__(self):
        arg_str =  str(', '.join(['='.join([i[0],repr(i[1])]) for i in vars(self).items()]))
        return "Statement({})".format(arg_str)
    
    
    def __str__(self):
        return repr(self)

In [18]:
import csv

def load_liar_data(path):
    statements = []
    with open(path) as data_file:
        reader = csv.reader(data_file, delimiter='\t', quotechar='"')
        for row in reader:
            try:
                statements.append(Statement.from_row(row))
            except IndexError:
                print(row,len(row))
    return statements
statements = load_liar_data("../datasets/LIAR/train.tsv") 
# print out some statements to verify by eye.
statements[:5]

[Statement(body='Says the Annies List political group supports third-trimester abortions on demand.', value=0.1, speaker='dwayne-bohac', context='a mailer'),
 Statement(body='When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.', value=0.5, speaker='scott-surovell', context='a floor speech.'),
 Statement(body='Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."', value=0.75, speaker='barack-obama', context='Denver'),
 Statement(body='Health care reform legislation is likely to mandate free sex change surgeries.', value=0.1, speaker='blog-posting', context='a news release'),
 Statement(body='The economic turnaround started at the end of my term.', value=0.5, speaker='charlie-crist', context='an interview on CNN')]

In [28]:
from collections import defaultdict
class NaiveBayesClassifier:
    def __init__(self, statements,lexer=lex,parser=parse):
        self.lexer = lexer
        self.parser = parser
        self.classes = set()
        
        self.vocabulary = set()

        # count number of occurences of each word tuple with each truth value
        word_value_counters = dict()
        for s in statements:
            # construct a set of all word tuples in the dataset
            lexed  = lexer(s)
            parsed = parser(lexed)
            self.vocabulary.update(parsed)
            for word_tuple in parsed:
                if word_tuple not in word_value_counters:
                    word_value_counters[word_tuple] =  defaultdict(lambda :0 )
                counter = word_value_counters[word_tuple]
                counter.update({s.value:counter[s.value]+1}) 
        # filter out words that have only too few times
        word_value_counters = {k:v for k,v in word_value_counters.items() if sum(v.values()) < 10 }
        # convert number of occurrences to frequency
        for word,counter in word_value_counters.items():
            s = sum(counter.values())
            for k,v in counter.items():
                counter[k] = v/s
        self.word_value_counters = word_value_counters
        print("Classifier vocab size: {}".format(len(self.vocabulary)))
        
    def classify_statement(self, statement):
#         print(self.word_value_counters)
        value_probabilities = {k:[0] for k in truth_text_mapping.values()}
        words = self.parser(self.lexer(statement))
#         build the list of coefficients for each value
        for w in words:
            if w in self.word_value_counters:
                for k,v in self.word_value_counters[w].items():
                    value_probabilities[k].append(v)
        # compute products of coefficients
        products = defaultdict(lambda: 0)
#         print(value_probabilities)
        for k,v in value_probabilities.items():
            prod = 1
            for x in v:
                assert x <= 1
                prod *= (x+1)
            products[k] = prod
        # sort output classes based on how well they match the input
        sorted_classes = sorted(products.items(), key=lambda t: t[1], reverse=True)
        # return only the best match
        return sorted_classes[0]
    def score(self, statements):
        return sum(abs(self.classify_statement(s)[0] - s.value) <= 0.1 for s in statements)

In [29]:
cls = NaiveBayesClassifier(statements,
                           lexer = lambda s: lex(' '.join([s.speaker, s.context, s.body])),
                           parser = lambda l: parse(l, ns=[1], skipgrams=True))

Classifier vocab size: 130104


In [30]:
def score(cls, statements):
    """A function to verify accuracy"""
    c = 0
    for s in statements:
        if abs(cls.classify_statement(s)[0] - s.value) <= 0.1:
            c += 1
        else:
            pass
#             print(cls.classify_statement(s), s.value)
    return c

In [31]:
n = cls.score(statements)
ls = len(statements)
print("Correct={}, set size={}, fraction={}".format(n,ls,n/ls))

Correct=10236, set size=10241, fraction=0.9995117664290597


In [33]:
some_statements = load_liar_data("../datasets/LIAR/test.tsv")
n = cls.score(some_statements)
ls = len(some_statements)
print("Correct={}, set size={}, fraction={}".format(n,ls,n/ls))

Correct=306, set size=1267, fraction=0.24151539068666142


# Conclusion

~25% accuracy is unsatisfactory, let's see if we can do better.