In [1]:
import os
import string
import zipfile

import gensim.downloader as api
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
import spacy
from tqdm import tqdm
from transformers import BertTokenizer
import wget

  return torch._C._cuda_getDeviceCount() > 0
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Download and unpack the LIAR dataset. Every step is skipped when its output
# already exists, so re-running this cell does not download or extract again.
liar_url = "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"

# exist_ok=True makes directory creation a no-op when ./data/ is already there
os.makedirs("./data/", exist_ok=True)

if not os.path.exists("./data/liar_dataset.zip"):
    wget.download(liar_url, "./data/liar_dataset.zip")

if not os.path.exists("./data/train.tsv"):
    # Extract with the standard library instead of the `!unzip` shell escape,
    # so the cell does not depend on an external `unzip` binary and is valid
    # Python outside of IPython as well.
    with zipfile.ZipFile("./data/liar_dataset.zip") as archive:
        archive.extractall("./data/")

In [3]:
# Data loading: read the train and validation splits of the dataset.
# Only columns 1 and 2 of each tab-separated file are kept; because the column
# names are supplied explicitly, the first line of each file is read as data.
# NOTE(review): statements containing unbalanced double quotes may be merged
# by the default CSV quote handling - confirm the row counts against the
# dataset's documentation.
read_opts = dict(sep="\t", usecols=[1, 2], names=["label", "statement"])

train_df = pd.read_csv("./data/train.tsv", **read_opts)
valid_df = pd.read_csv("./data/valid.tsv", **read_opts)

# some basic preprocessing using standard python and SpaCy
def simple_preprocess_txt(texts: list, nlp: "spacy.language.Language"):
    """
    Normalise raw texts into strings of space-separated tokens.

    Each text goes through the following transformations:
        - lower case and strip surrounding whitespace
        - remove punctuation (every character in ``string.punctuation``)
        - tokenize with ``nlp``
        - remove stopwords (tokens whose ``is_stop`` flag is set)

    Args:
        texts: iterable of strings (a list or a pandas Series both work).
            Entries that are not strings (e.g. NaN/None for missing values)
            are treated as empty text, so the output always has exactly one
            entry per input and stays aligned with the input rows.
        nlp: callable that takes a string and returns an iterable of tokens
            exposing ``.text`` and ``.is_stop`` (e.g. a SpaCy pipeline).

    Output: list(str), one preprocessed string per input text, in input order.
    """
    # The translation table never changes, so build it once, outside the loop.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    out = []
    for txt in texts:
        if not isinstance(txt, str):
            # missing value: keep the row, but with no tokens
            txt = ""
        txt = txt.lower().strip().translate(strip_punctuation)
        doc = nlp(txt)  # use SpaCy for tokenization
        # for now only uses plain tokens, can later be changed to add pos or lemmas
        kept = [tok.text for tok in doc if not tok.is_stop]
        out.append(" ".join(kept))
    return out


# Load the SpaCy pipeline that is used for tokenization.
# NOTE(review): "tokenizer" is presumably not one of the pipeline's component
# names, so this likely disables every component and leaves only
# tokenization - confirm.
nlp = spacy.load("en_core_web_sm", enable=["tokenizer"])

# Add a "tokens" column holding the preprocessed statement of every row.
for frame in (train_df, valid_df):
    frame["tokens"] = simple_preprocess_txt(frame["statement"], nlp)

# Convert the six textual ratings to a binary numerical target:
# 0 for the three "true"-leaning ratings, 1 for the three "false"-leaning ones.
label_dict = {
    **dict.fromkeys(("true", "mostly-true", "half-true"), 0),
    **dict.fromkeys(("barely-true", "false", "pants-fire"), 1),
}

# A label missing from label_dict raises KeyError rather than being skipped.
for frame in (train_df, valid_df):
    frame["target"] = [label_dict[lab] for lab in frame["label"]]

train_df.head()

Unnamed: 0,label,statement,tokens,target
0,false,Says the Annies List political group supports ...,says annies list political group supports thir...,1
1,half-true,When did the decline of coal start? It started...,decline coal start started natural gas took st...,0
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",hillary clinton agrees john mccain voting geor...,0
3,false,Health care reform legislation is likely to ma...,health care reform legislation likely mandate ...,1
4,half-true,The economic turnaround started at the end of ...,economic turnaround started end term,0


In [4]:
# Sparse baseline: TF-IDF vectors fed into a multinomial naive Bayes classifier.
# The vectorizer is fitted on the training tokens only and then reused, as is,
# to transform the validation tokens.
vectorizer = TfidfVectorizer()
train_vecs = vectorizer.fit_transform(train_df["tokens"])
valid_vecs = vectorizer.transform(valid_df["tokens"])

# Fit the model on the training vectors (fit() returns the fitted estimator).
multi_nb_clf = MultinomialNB(alpha=0.01).fit(train_vecs, train_df["target"])

# Evaluate: macro-averaged F1 of the predictions on the validation split.
pred = multi_nb_clf.predict(valid_vecs)
metrics.f1_score(y_true=valid_df["target"], y_pred=pred, average="macro")

0.5713189352781258

In [5]:
# alternative method using dense vectors:
# fetch pre-trained GloVe word embeddings through the gensim downloader
# (the model name suggests 100-dimensional vectors - TODO confirm)
glove_vecs = api.load("glove-wiki-gigaword-100") # load global embeddings from glove; takes ~20s

def _zero_embedding(model):
    """Return an all-zero vector with the same size as the model's word embeddings."""
    size = getattr(model, "vector_size", None)
    if size is None:
        # The model does not expose its dimensionality: fall back to probing
        # an arbitrary word, which only works if "the" is in the vocabulary.
        return np.zeros_like(model["the"])
    # Match the dtype of the stored vectors when the model exposes them.
    dtype = getattr(getattr(model, "vectors", None), "dtype", np.float32)
    return np.zeros(size, dtype=dtype)


def embed_texts(texts: list, model):
    """
    Returns sentence embeddings by looking up word embeddings for single words and averaging over them.

    Args:
        texts: iterable of strings; each one is split on whitespace into tokens.
        model: mapping from token to embedding vector that raises KeyError for
            unknown tokens (e.g. gensim KeyedVectors). If it has a
            ``vector_size`` attribute, that is used to size the fallback
            vector, so the vocabulary does not need to contain any
            particular word.

    Output: list(array), one vector per input text, in input order. Texts
    without any known token (including empty texts) get an all-zero vector.
    """
    embedded_texts = []
    zero_embedding = None  # built lazily, only if some text has no known token

    for text in texts:
        word_embeddings = []
        for token in text.split():
            try:
                # retrieve the embedding for each token
                word_embeddings.append(model[token])
            except KeyError:
                # OOV words are deliberately skipped
                continue

        if word_embeddings:
            # aggregate word embeddings with a simple average
            embedded_texts.append(np.mean(word_embeddings, axis=0))
        else:
            if zero_embedding is None:
                zero_embedding = _zero_embedding(model)
            # copy so that every text owns an independent array
            embedded_texts.append(zero_embedding.copy())
    return embedded_texts

# Embed the preprocessed tokens of both splits with the GloVe vectors.
# astype(str) ensures that every entry handed to embed_texts is a string.
train_vecs, valid_vecs = (
    embed_texts(frame["tokens"].astype(str), glove_vecs)
    for frame in (train_df, valid_df)
)

In [6]:
# fit classifier on the current train_vecs; takes ~35s
# random_state is fixed so that repeated runs of this cell give the same model
# and therefore the same score
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(train_vecs, train_df["target"])

# evaluate: macro-averaged F1 of the predictions on the validation split
pred = gb_clf.predict(valid_vecs)
metrics.f1_score(valid_df["target"], pred, average="macro")

0.5774449239795774

In [7]:
# load the pre-trained BERT tokenizer
# (this only sets up the tokenizer; no embedding model is loaded here)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


def encoder(sentences: list, max_length: int = 712):
    """
    Convert sentences into a fixed-width array of token ids.

    Uses the module-level ``tokenizer``. Sentences shorter than ``max_length``
    tokens are padded up to it and longer ones are truncated, so every row
    has the same width and the batch can always be returned as one array.

    Args:
        sentences: iterable of strings (a list or a pandas Series both work).
        max_length: number of token ids per sentence in the output.

    Returns:
        The "input_ids" entry of the tokenizer output, as a NumPy array.
    """
    # NOTE(review): add_special_tokens=False leaves out the [CLS]/[SEP]
    # markers, and the default max_length presumably exceeds the number of
    # positions the BERT model itself accepts - confirm both before feeding
    # these ids to a model.
    encoded_dict = tokenizer.batch_encode_plus(
        list(sentences),  # materialise so any iterable is passed as a plain list
        add_special_tokens=False,
        return_tensors="np",
        max_length=max_length,
        padding="max_length",
        # without truncation, one over-long sentence makes the rows ragged and
        # the conversion to a NumPy array fails
        truncation=True,
    )
    return encoded_dict["input_ids"]


# Encode the raw (not preprocessed) statements of both splits into token ids.
train_vecs = encoder(train_df["statement"])
valid_vecs = encoder(valid_df["statement"])

tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 78.7kB/s]
