# AIR Project
## Group 17

We are using a dataset which includes non-fake as well as fake news (labeled dataset).
https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification


## Imports and specific settings

In [214]:
import ast
import os.path
import sys
from collections import OrderedDict

import WordCloud as WordCloud
import matplotlib
import torch
from torch import nn
from torch.utils.data import DataLoader
from gensim.models import Word2Vec
import multiprocessing
import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torchmetrics.classification import BinaryF1Score, BinaryRecall, BinaryPrecision, BinaryAccuracy
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from rank_bm25 import BM25Okapi
import math
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'WordCloud'

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
print("running models with {}".format(device))
# device = 'cpu'
if not os.path.exists("data"):
    os.mkdir("data")

## Defining Result Dicts

In [None]:
results = {}

def createDictElement(r):
    total_loss, f1, acc, prec, rec, time = r
    tmp = {}
    tmp["total_loss"] = total_loss
    tmp["f1_score"] = f1
    tmp["accuracy"] = acc
    tmp["precision"] = prec
    tmp["recall"] = rec
    tmp["time"] = time
    return tmp

## Loading of dataset and nan-removal
Uncomment function-call to redo preprocessing
Result is saved in data/data.csv

In [None]:
def pre_process():
    data = pd.read_csv('WELFake_Dataset.csv', index_col=0)
    # display(data[:300])
    for i,x in data.iterrows():
        if len(str(x["text"])) <= 10:
            data.loc[i, "text"] = np.nan
        if len(str(x["title"])) <= 10:
            data.loc[i, "title"] = np.nan

    data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)
    data.to_csv("data/data.csv")
    display(data[:300])

In [None]:
if not os.path.exists("data/data.csv"):
    pre_process()

## Tokenization
Uncomment function-call to redo tokenization for data/data.csv
Result is saved in data/data_token.csv

In [None]:
def tokenize():
    stop = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    punc = [u'\u201c',u'\u201d',u'\u2018',u'\u2019',u'\u2024',u'\u2025',u'\u2026',u'\u2027']
    # print(punc)
    data = pd.read_csv('data/data.csv', index_col=0)
    titles = list()
    texts = list()
    for i, row in data.iterrows():
        title = str(row["title"])
        text = str(row["text"])
        t1 = ""
        for c in title:
            if not (c in string.punctuation or c in punc):
                t1 += c
            else:
                t1 += " "
        t2 = ""
        for c in text:
            if not (c in string.punctuation or c in punc):
                t2 += c
            else:
                t2 += " "
        title_tokens = nltk.tokenize.word_tokenize(t1)
        text_tokens = nltk.tokenize.word_tokenize(t2)
        # title_filtered = [w.lower() for w in title_tokens if not w.lower() in string.punctuation]
        # title_filtered = [w.lower() for w in title_filtered if not w.lower() in punc]
        title_filtered = [w.lower() for w in title_tokens if not w.lower() in stop]
        title_stemmed = [stemmer.stem(w) for w in title_filtered]
        # text_filtered = [w.lower() for w in text_tokens if not w.lower() in string.punctuation]
        # text_filtered = [w.lower() for w in text_filtered if not w.lower() in punc]
        text_filtered = [w.lower() for w in text_tokens if not w.lower() in stop]
        text_stemmed = [stemmer.stem(w) for w in text_filtered]
        # print(title_stemmed)
        # print(text_stemmed)
        titles.append(title_stemmed)
        texts.append(text_stemmed)
        if i % 5000 == 0:
            print(i)
    d = {"title":titles, "text":texts, "label":data["label"]}
    data_cleaned = pd.DataFrame(data=d)
    # data_cleaned["title"] = titles
    # data_cleaned["text"] = texts
    data_cleaned.to_csv("data/data_token.csv")


In [None]:
if not os.path.exists("data/data_token.csv"):
    tokenize()

## Bag of Words


In [None]:
def make_bow(data_path):
    data = pd.read_csv(data_path, index_col=0)
    bow = []
    bow_title = []
    bow_text = []
    bow_both = []
    for i, row in data.iterrows():
        words = row["title"].split(",")
        title = []
        for word in words:
            title.append(str(word).replace("'", "").replace(" ", "").replace("[", "").replace("]", ""))
        words = row["text"].split(",")
        text = []
        for word in words:
            text.append(str(word).replace("'", "").replace(" ", "").replace("[", "").replace("]", ""))
        dic_title = {}
        dic_text = {}
        dic_both = {}
        for word in title:
            if word in dic_title:
                dic_title[word] = dic_title[word] + 1
            else:
                dic_title[word] = 1
            if word in dic_both:
                dic_both[word] = dic_both[word] + 1
            else:
                dic_both[word] = 1
        for word in text:
            if word in dic_text:
                dic_text[word] = dic_text[word] + 1
            else:
                dic_text[word] = 1
            if word in dic_both:
                dic_both[word] = dic_both[word] + 1
            else:
                dic_both[word] = 1
        bow_text.append(dic_text)
        bow_title.append(dic_title)
        bow_both.append(dic_both)
    bow.append(bow_title)
    bow.append(bow_text)
    bow.append(bow_both)
    return bow

In [215]:
# bow = [bow_title[],bow_text[],bow_both[]]
# bow = make_bow('data_tokenized/data_token.csv') # was uncommented

## TFIDF with Cosine

In [216]:
def tf(bow_):
    tf_ = []
    for dic in bow_:
        max_ = 0
        for i in dic:
            if dic[i] > max_:
                max_ = dic[i]
        tf_dic = {}
        for word in dic:
            tf_dic[word] = dic[word]/max_
        tf_.append(tf_dic)
    return tf_

def idf(bow_):
    df_ = {}
    for dic in bow_:
        for word in dic:
            if word in df_:
                df_[word] += 1
            else:
                df_[word] = 1
    idf_ = {}
    for word in df_:
        idf_[word] = math.log10(len(bow)/df_[word])
    return idf_

def tf_idf(bow_):
    tf_ = tf(bow_)
    idf_ = idf(bow_)
    tfidf = []
    for dic in tf_:
        tfidf_dic = {}
        for word in dic:
            tfidf_dic[word] = dic[word] * idf_[word]
        tfidf.append(tfidf_dic)
    return tfidf

def cosineSim(dic_a, dic_b):
    for word in dic_a:
        if word not in dic_b:
            dic_b[word] = 0
    for word in dic_b:
        if word not in dic_a:
            dic_a[word] = 0
    dot, sum_a, sum_b = 0,0,0
    for word in dic_a:
        a = dic_a[word]
        b = dic_b[word]
        dot += (a*b)
        sum_a += math.pow(a,2)
        sum_b += math.pow(b,2)
    sqrt_sum_a = math.sqrt(sum_a)
    sqrt_sum_b = math.sqrt(sum_b)
    return dot / (sqrt_sum_a * sqrt_sum_b)

def tfidf_cosine_ranking(word_, bow_):
    tfidf_all = tf_idf(bow_)
    list_query = [{word_: 1}]
    tfidf_query = tf_idf(list_query)[0]
    article_index = []
    cosSim = []
    cos_index = 0
    for a in tfidf_all:
        article_index.append(cos_index)
        cosSim.append(cosineSim(a,tfidf_query))
        cos_index += 1
    return pd.DataFrame({'article': article_index ,'value': cosSim }).sort_values(by=['value'], ascending=False)



In [217]:
#cos_rank = tfidf_cosine_ranking('obama',bow[2])
#print(cos_rank.head(5))

## bm25

In [218]:
def bm25_ranking(query_,index_):
    data = pd.read_csv('data_tokenized/data_token.csv', index_col=0)
    corpus = []
    title = []
    text = []
    both = []
    for i, row in data.iterrows():
            words = row["title"].split(",")
            for word in words:
                title.append(str(word).replace("'", "").replace(" ", "").replace("[", "").replace("]", ""))
            words = row["text"].split(",")
            for word in words:
                text.append(str(word).replace("'", "").replace(" ", "").replace("[", "").replace("]", ""))
    if index_ == 0:
        corpus = title
    elif index_ == 1:
        corpus = text
    else:
        for i in title:
            both.append(title + text)
        corpus = both

    print("Starting bm25")
    bm25 = BM25Okapi(corpus)
    bm25_scores = bm25.get_scores(query_.split(" "))

    article_index = []
    bm25_index = 0
    for a in bm25_scores:
        article_index.append(bm25_index)
        bm25_index += 1
    return pd.DataFrame({'article': article_index ,'value': bm25_scores }).sort_values(by=['value'], ascending=False)


In [219]:
# bm25_rank = bm25_ranking('sunday',bow[2]) # was uncommented
# print(bm25_rank.head(5)) # was uncommented

## Creating and Training Word2Vec Model

In [220]:
load_model_from_disc = True
w2v_model = None
data = pd.read_csv('data/data_token.csv', index_col=0)
# for i, row in data.iterrows():
#     print(type(row["title"]))
#     print(row)
#     data.loc[i, "title"] = ast.literal_eval(row["title"])
#     data.loc[i, "text"] = ast.literal_eval(row["text"])
if load_model_from_disc:
    try:
        w2v_model = Word2Vec.load("word2vec.model")
    except:
        pass

if w2v_model is None or not load_model_from_disc:
    if load_model_from_disc:
        print("Could not load model from disc. Training model...")
    else:
        print("Loading from disc deactivated. Training model...")

    class MySentences(object):
        def __init__(self, data):
            self.data = data

        def __iter__(self):
            for doc in pd.concat([data["text"], data["title"]]): #change to "title" or combine both
                doc = ast.literal_eval(doc)
                yield doc

    sentences = MySentences(data)

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores-1)

    w2v_model.build_vocab(sentences, progress_per=10000)
    t = time.time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=3, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))
    w2v_model.save("word2vec.model")
else:
    print("Model loaded from disc.")

Model loaded from disc.


In [221]:
# calculate similarity
w2v_model.wv.similarity("amazon", 'nazi')

-0.11904364

In [222]:
# calculate similarity
w2v_model.wv.similarity("obama", 'trump')

0.60530704

In [223]:
# find out which element doesn't match
w2v_model.wv.doesnt_match(['amazon', 'obama', 'trump'])

'amazon'

In [224]:
# Which word is to obama as georg is to bush?
w2v_model.wv.most_similar(positive=["obama", "georg"], negative=["bush"], topn=3)

[('barack', 0.607993483543396),
 ('presid', 0.4728606641292572),
 ('behest', 0.4636874198913574)]

In [225]:
# e.g. words most similar to obama
w2v_model.wv.most_similar(positive=["obama"])

[('barack', 0.832769513130188),
 ('administr', 0.6584988832473755),
 ('presid', 0.6397863626480103),
 ('predecessor', 0.625878632068634),
 ('trump', 0.6053071022033691),
 ('bush', 0.557529628276825),
 ('outgo', 0.5535241961479187),
 ('undo', 0.5471165180206299),
 ('holdov', 0.5300476551055908),
 ('clinton', 0.5224902629852295)]

In [226]:
# e.g. words most similar to obama
w2v_model.wv.most_similar(positive=["presid"])

[('barack', 0.7051833868026733),
 ('45th', 0.6659713387489319),
 ('successor', 0.6399291157722473),
 ('obama', 0.6397863626480103),
 ('administr', 0.6288126111030579),
 ('trump', 0.6117547750473022),
 ('donald', 0.6109979748725891),
 ('predecessor', 0.6096833348274231),
 ('pres', 0.594482958316803),
 ('presidenti', 0.5868942737579346)]

## Creating Doc2Vec
word2vec for each word with average over document

In [227]:
# creates w2v representation for all documents and titles
def doc2vec():
    titles = list()
    texts = list()
    start = time.time()
    for i, row in data.iterrows():
        vec_title = np.zeros(shape=w2v_model.vector_size)
        vec_text = np.zeros(shape=w2v_model.vector_size)
        tit = ast.literal_eval(row["title"])
        tex = ast.literal_eval(row["text"])
        tit_cnt = 0
        tex_cnt = 0
        for word in tit:
            try:
                vec_title += w2v_model.wv[word]
            except KeyError:
                # print("Didn't find word {}".format(word))
                tit_cnt += 1
                pass
        for word in tex:
            try:
                vec_text += w2v_model.wv[word]
            except KeyError:
                # print("Didn't find word {}".format(word))
                tex_cnt += 1
                pass
        if len(tit) > tit_cnt:
            vec_title /= (len(tit) - tit_cnt)
        if len(tex) > tex_cnt:
            vec_text /= (len(tex) - tex_cnt)
        titles.append(vec_title.tolist())
        texts.append(vec_text.tolist())
        if i % 5000 == 0:
            print("[{}/{}] - {:.1f}s".format(i, len(data.index), time.time() - start))
    end = time.time()
    print("creating doc2vec took {:.1f}s".format(end - start))
    d = {"title":titles, "text":texts, "label":data["label"]}
    data_w2v = pd.DataFrame(data=d)
    data_w2v.to_pickle("data/data_w2v.pkl")
    display(data_w2v[:100])

In [228]:
if not os.path.exists("data/data_w2v.pkl"):
    doc2vec()

## Train-Test-split and Dataloader Creation

In [229]:
def collate_batch(batch):
    labels = list()
    texts = list()
    for (_text, _label) in batch:
        labels.append(_label)
        texts.append(_text)

    return torch.tensor(texts), torch.tensor(labels)

In [230]:
embed_dim = 100
batch_size = 100
params = {'batch_size': batch_size,
          'shuffle': True,
          'num_workers': 0,
          'collate_fn': collate_batch,
          'drop_last': True}


data_d2v = pd.read_pickle("data/data_w2v.pkl")
titles = list()
texts = list()
# print("interpreting data")
# for i, row in data_d2v.iterrows():
#     titles.append(ast.literal_eval(row["title"]))
#     texts.append(ast.literal_eval(row["text"]))
# print("done interpreting data")
# data_d2v["title"] = titles
# data_d2v["text"] = texts

data_d2v_title = data_d2v[["title", "label"]].copy()
data_d2v_text = data_d2v[["text", "label"]].copy()
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(data_d2v_title["title"], data_d2v_title["label"], test_size=0.15, random_state=42, shuffle=True)
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(data_d2v_text["text"], data_d2v_text["label"], test_size=0.15, random_state=42, shuffle=True)

X_train_title.reset_index(drop=True, inplace=True)
X_test_title.reset_index(drop=True, inplace=True)
y_train_title.reset_index(drop=True, inplace=True)
y_test_title.reset_index(drop=True, inplace=True)
X_train_text.reset_index(drop=True, inplace=True)
X_test_text.reset_index(drop=True, inplace=True)
y_train_text.reset_index(drop=True, inplace=True)
y_test_text.reset_index(drop=True, inplace=True)


class data_set(Dataset):
    def __init__(self, X, y):
        super(Dataset, self).__init__()
        assert len(X.index) == len(y.index)
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X.index)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

train_dataset_title = data_set(X_train_title, y_train_title)
test_dataset_title = data_set(X_test_title, y_test_title)
train_dataset_text = data_set(X_train_text, y_train_text)
test_dataset_text = data_set(X_test_text, y_test_text)

train_dataloader_title = DataLoader(train_dataset_title, **params)
test_dataloader_title = DataLoader(test_dataset_title, **params)
train_dataloader_text = DataLoader(train_dataset_text, **params)
test_dataloader_text = DataLoader(test_dataset_text, **params)

for batch, (X, y) in enumerate(train_dataloader_title):
    # print(X)
    # print(X.shape)
    # print(y.shape)
    break

## Helper Functions

In [231]:
def add_metrics_to_log(log, metrics, y_true, y_pred, prefix=''):
    for metric in metrics:
        q = metric(y_true, y_pred)
        log[prefix + metric.__name__] = q
    return

def log_to_message(log, precision=4):
    fmt = "{0}: {1:." + str(precision) + "f}"
    return "    ".join(fmt.format(k, v) for k, v in log.items())

class ProgressBar(object):
    """Cheers @ajratner"""

    def __init__(self, n, length=40):
        # Protect against division by zero
        self.n      = max(1, n)
        self.nf     = float(n)
        self.length = length
        # Precalculate the i values that should trigger a write operation
        self.ticks = set([round(i/100.0 * n) for i in range(101)])
        self.ticks.add(n-1)
        self.bar(0)

    def bar(self, i, message=""):
        """Assumes i ranges through [0, n-1]"""
        if i in self.ticks:
            b = int(np.ceil(((i+1) / self.nf) * self.length))
            sys.stdout.write("\r[{0}{1}] {2}%\t{3}".format(
                "="*b, " "*(self.length-b), int(100*((i+1) / self.nf)), message
            ))
            sys.stdout.flush()

    def close(self, message=""):
        # Move the bar to 100% before closing
        self.bar(self.n-1)
        sys.stdout.write("\n{0}\n\n".format(message))
        sys.stdout.flush()

## Training Loop

In [232]:
epochs = 10

def train(dataloader, model, loss_fn, optimizer):
    start_time = time.time()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss = 0
    f1 = 0
    precision = 0
    recall = 0
    accuracy = 0
    f1_score_ = BinaryF1Score().to(device)
    precision_ = BinaryPrecision().to(device)
    recall_ = BinaryRecall().to(device)
    accuracy_ = BinaryAccuracy().to(device)
    pb = ProgressBar(size/batch_size)
    log = OrderedDict()
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction error
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(pred.squeeze(), y.float())
        total_loss += loss.item()
        f1 += f1_score_(pred.squeeze(), y)
        precision += precision_(pred.squeeze(), y)
        recall += recall_(pred.squeeze(), y)
        accuracy += accuracy_(pred.squeeze(), y)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        log['loss'] = float(loss) / (batch + 1)
        log['f1'] = float(f1) / (batch + 1)
        log['accuracy'] = float(accuracy) / (batch + 1) #correct / ((batch + 1) * batch_size)
        log['precision'] = float(precision) / (batch + 1)
        log['recall'] = float(recall) / (batch + 1)
        log['time'] = time.time() - start_time
        pb.bar(batch, log_to_message(log))
    pb.close(log_to_message(log))
    return total_loss, (f1/num_batches).item(), (accuracy/num_batches).item(), (precision/num_batches).item(), (recall/num_batches).item(), time.time()-start_time

def test(dataloader, model, loss_fn):
    start_time = time.time()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0
    f1 = 0
    precision = 0
    recall = 0
    accuracy = 0
    f1_score_ = BinaryF1Score().to(device)
    precision_ = BinaryPrecision().to(device)
    recall_ = BinaryRecall().to(device)
    accuracy_ = BinaryAccuracy().to(device)
    pb = ProgressBar(size/batch_size)
    log = OrderedDict()
    model.eval()
    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred.squeeze(), y.float())
            f1 += f1_score_(pred.squeeze(), y)
            precision += precision_(pred.squeeze(), y)
            recall += recall_(pred.squeeze(), y)
            accuracy += accuracy_(pred.squeeze(), y)
            # Backpropagation
            log['loss'] = float(test_loss) / (batch + 1)
            log['f1'] = float(f1) / (batch + 1)
            log['accuracy'] = float(accuracy) / (batch + 1) #correct / ((batch + 1) * batch_size)
            log['precision'] = float(precision) / (batch + 1)
            log['recall'] = float(recall) / (batch + 1)
            log['time'] = time.time() - start_time
            pb.bar(batch, log_to_message(log))
    pb.close(log_to_message(log))
    return test_loss.item(), (f1/num_batches).item(), (accuracy/num_batches).item(), (precision/num_batches).item(), (recall/num_batches).item(), time.time()-start_time

def train_and_evaluate(train_dataloader, test_dataloader, model, loss_fn, optimizer, training=True):
    if training:
        for i in range(epochs):
            print("Epoch {}: ------------------------------------------------------------------------------------------------------------------------".format(i+1))
            train(train_dataloader, model, loss_fn, optimizer)
    print("Evaluation ------------------------------------------------------------------------------------------------------------------------")
    return test(test_dataloader, model, loss_fn)

## Simple Model for testing of training loop

In [233]:
class SimpleLinearModel(nn.Module):
    def __init__(self):
        super(SimpleLinearModel, self).__init__()
        self.fc1 = nn.Linear(embed_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, vec):
        x = self.relu(self.fc1(vec))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return torch.sigmoid(self.fc4(x))

model = SimpleLinearModel().to(device)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())


# r = train_and_evaluate(train_dataloader_title, test_dataloader_title, model, loss_fn, optimizer)
# results["simple_linear_model"] = {}
# results["simple_linear_model"]["title"] = createDictElement(r)
#
# model = SimpleLinearModel().to(device)
# loss_fn = nn.BCELoss()
# optimizer = torch.optim.Adam(model.parameters())
#
# r = train_and_evaluate(train_dataloader_text, test_dataloader_text, model, loss_fn, optimizer)
# results["simple_linear_model"]["text"] = createDictElement(r)

## Simple Convolution Model

In [234]:
class SimpleConvolutionModel(nn.Module):
    def __init__(self):
        super(SimpleConvolutionModel, self).__init__()
        self.fc1 = nn.Linear(embed_dim, 100)
        self.conv = nn.Conv2d(in_channels=100, out_channels=100, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=1, stride=1)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.conv(x.unsqueeze(1))
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.squeeze()
        x = torch.sigmoid(self.fc2(x))
        return x


model = SimpleConvolutionModel().to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
results["simple_convolution_model"] = {}

# r = train_and_evaluate(train_dataloader_title, test_dataloader_title, model, loss_fn, optimizer)
# results["simple_convolution_model"]["title"] = createDictElement(r)

model = SimpleConvolutionModel().to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# r = train_and_evaluate(train_dataloader_text, test_dataloader_text, model, loss_fn, optimizer)
# results["simple_convolution_model"]["text"] = createDictElement(r)

In [235]:
class FA_KES(nn.Module):
    def __init__(self):
        super(FA_KES, self).__init__()
        self.fc1 = nn.Linear(embed_dim, 100)
        # self.embed = nn.Embedding(
        #     num_embeddings=len(w2v_model.wv),
        #     embedding_dim=100)
        self.conv = nn.Conv1d(in_channels=100, out_channels=100, kernel_size=6, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1)
        # self.fc2 = nn.Linear(95, 1)
        self.lstm = nn.LSTM(96, 32, 3, bidirectional=True)
        self.dropout = nn.Dropout(0.1)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(self.conv(x))
        x = self.maxpool(x)
        x = x.squeeze()
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = torch.sigmoid(self.fc3(x))

        return x


model = FA_KES().to(device)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())
results["lstm_model"] = {}

r = train_and_evaluate(train_dataloader_title, test_dataloader_title, model, loss_fn, optimizer)
results["lstm_model"]["title"] = createDictElement(r)

model = FA_KES().to(device)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

r = train_and_evaluate(train_dataloader_text, test_dataloader_text, model, loss_fn, optimizer)
results["lstm_model"]["text"] = createDictElement(r)



Epoch 1: ------------------------------------------------------------------------------------------------------------------------

KeyboardInterrupt: 

## Support Vector Machine

Using a support vector machine (SVM) to label between fake and real news.
First we are using the text as input afterwards just the titles.

In [None]:
def svm(X_train_data, y_train_data, X_test_data, y_test_data):
    X_train = X_train_data.tolist()
    X_test = X_test_data.tolist()
    start_time = time.time()
    clf = SVC(kernel='rbf')
    print("fitting...")
    clf.fit(X_train,y_train_data)
    print("predicting...")
    y_pred = clf.predict(X_test)
    return 0, f1_score(y_test_data,y_pred), accuracy_score(y_test_data, y_pred), precision_score(y_test_data, y_pred), recall_score(y_test_data, y_pred), time.time()-start_time


In [None]:
r = svm(X_train_text, y_train_text, X_test_text, y_test_text)
loss, f1_score_text, accuracy_text, precision_text, recall_text, time_text = r
print("Accuracy for text: {:.2f}%\nF1 score for text: {:.2f}%\nPrecision score for text: {:.2f}%\nRecall score for text: {:.2f}%".format(accuracy_text*100, f1_score_text*100, precision_text*100, recall_text*100))
results["support_vector_machine"] = {}
results["support_vector_machine"]["text"] = createDictElement(r)

In [None]:
r = svm(X_train_title, y_train_title, X_test_title, y_test_title)
loss, f1_score_title, accuracy_title, precision_title, recall_title, time_title = r
print("Accuracy for title: {:.2f}%\nF1 score for title: {:.2f}%\nPrecision score for title: {:.2f}%\nRecall score for title: {:.2f}%".format(accuracy_title*100, f1_score_title*100, precision_title*100, recall_title*100))
results["support_vector_machine"]["title"] = createDictElement(r)

## Random Forest Algorithm

Using the random forest algorithm (RF) to label between fake and real news.
First we are using the text as input afterwards just the titles.

In [None]:
def random_forest(X_train, X_test, y_train, y_test, name, crit='gini'):
    start_time = time.time()
    rf = RandomForestClassifier(n_estimators = 1000, random_state = 42, verbose=0, n_jobs=-1, criterion=crit)
    rf.fit(X_train.tolist(), y_train.tolist())
    y_pred = rf.predict(X_test.tolist())
    acc = accuracy_score(y_test.tolist(), y_pred)
    f1 = f1_score(y_test.tolist(), y_pred)
    prec = precision_score(y_test.tolist(), y_pred)
    rec = recall_score(y_test.tolist(), y_pred)
    print("accuracy for random forest with {} criterion {}: {:.2f}%".format(crit, name, acc*100))
    print("f1 score for random forest with {} criterion{}: {:.2f}%".format(crit, name, f1*100))
    print("precision score random forest with {} criterion for {}: {:.2f}%".format(crit, name, prec*100))
    print("recall score for random forest with {} criterion {}: {:.2f}%".format(crit, name, rec*100))
    return 0, f1, acc, prec, rec, time.time()-start_time

results["random_forest"] = {}
r = random_forest(X_train_text, X_test_text, y_train_text, y_test_text, "w2v text")
results["random_forest"]["text"] = createDictElement(r)
r = random_forest(X_train_title, X_test_title, y_train_title, y_test_title, "w2v title")
results["random_forest"]["title"] = createDictElement(r)

In [None]:
print(results)

## Prinicipal Component Analysis

In [None]:
def pca_(X_test, y_test):
    pca = PCA()
    pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
    plt.figure(figsize=(8, 6))
    Xt = pipe.fit_transform(X_test.tolist())
    plot = plt.scatter(Xt[:,0], Xt[:,1], c=y_test.tolist(), alpha=0.6, s=0.9, cmap='plasma')
    plt.legend(handles=plot.legend_elements()[0], labels=["real news", "fake news"])
    plt.show()

pca_(X_test_text, y_test_text)
pca_(X_test_title, y_test_title)

In [None]:
def wc():
    data_pre = pd.read_csv("data_tokenized/data_token_no_stem.csv", index_col=0)
    data_pre_real = data_pre.loc[data_pre["label"] == 0]
    data_pre_fake = data_pre.loc[data_pre["label"] == 1]
    display(data_pre_real[:10])
    display(data_pre_fake[:10])
    data_pre = None
    x = ""
    for i, vals in data_pre_real.iterrows():
        for val in vals["title"]:
            x += " "
            x += val
        for val in vals["text"]:
            x += " "
            x += val
        if i % 1000 == 0:
            print(i)

    wordcloud = WordCloud().generate(x)

    # Display the generated image:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

Unnamed: 0,title,text,label
2,"['bobby', 'jindal', 'raised', 'hindu', 'uses', 'story', 'christian', 'conversion', 'woo', 'evangelicals', 'potential', '2016', 'bid']","['dozen', 'politically', 'active', 'pastors', 'came', 'private', 'dinner', 'friday', 'night', 'hear', 'conversion', 'story', 'unique', 'context', 'presidential', 'politics', 'louisiana', 'gov', 'bobby', 'jindal', 'traveled', 'hinduism', 'protestant', 'christianity', 'ultimately', 'became', 'calls', 'evangelical', 'catholic', 'two', 'hours', 'jindal', '42', 'recalled', 'talking', 'girl', 'high', 'school', 'wanted', 'save', 'soul', 'reading', 'bible', 'closet', 'parents', 'would', 'see', 'feeling', 'stir', 'watching', 'movie', 'senior', 'year', 'depicted', 'jesus', 'cross', 'struck', 'struck', 'hard', 'jindal', 'told', 'pastors', 'son', 'god', 'died', 'sins', 'jindal', 'session', 'christian', 'clergy', 'lead', 'congregations', 'early', 'presidential', 'battleground', 'states', 'iowa', 'south', 'carolina', 'part', 'behind', 'scenes', 'effort', 'louisiana', 'governor', 'find', 'political', 'base', 'could', 'help', 'propel', 'top', 'tier', 'republican', 'candidates', 'seeking', 'run', 'white', 'house', '2016', 'known', 'gop', 'circles', 'mostly', 'mastery', 'policy', 'issues', 'health', 'care', 'jindal', 'rhodes', 'scholar', 'graduate', 'ivy', 'league', 'brown', 'university', 'obvious', 'pool', 'activist', 'supporters', 'help', 'drive', 'excitement', 'outside', 'home', 'state', 'harnessing', 'religious', 'experience', 'way', 'begun', 'appeal', 'parts', 'gop', 'influential', 'core', 'religious', 'conservatives', 'many', 'yet', 'find', 'favorite', 'among', 'republicans', 'eyeing', 'presidential', 'race', 'potential', '2016', 'gop', 'candidates', 'wooing', 'evangelical', 'base', 'including', 'sens', 'rand', 'paul', 'ky', 'ted', 'cruz', 'tex', 'indiana', 'gov', 'mike', 'pence', 'weekend', 'lynchburg', '—', 'mecca', 'sorts', 'evangelicals', 'home', 'liberty', 'university', 'founded', '1970s', 'rev', 'jerry', 'falwell', '—', 'jindal', 'appeared', 'make', 'progress', 'addition', 'dinner', 'pastors', 'delivered', 'well', 'received', 'call', 'action', 'address', '40', '000', 'christian', 'conservatives', 'gathered', 'liberty', 'commencement', 'ceremony', 'talking', 'faith', 'assailing', 'said', 'president', 'obama', 'record', 'attacking', 'religious', 'liberty', 'pastors', 'came', 'meet', 'jindal', 'said', 'intimate', 'descriptions', 'experiences', 'stood', 'convictions', 'takes', 'communicate', 'said', 'brad', 'sherman', 'solid', 'rock', 'christian', 'church', 'coralville', 'iowa', 'sherman', 'helped', 'former', 'arkansas', 'governor', 'mike', 'huckabee', 'winning', '2008', 'campaign', 'delegates', 'iowa', 'another', 'huckabee', 'admirer', 'rev', 'c', 'mitchell', 'brooks', 'second', 'baptist', 'church', 'belton', 'c', 'said', 'jindal', 'commitment', 'christian', 'values', 'compelling', 'story', 'put', 'par', 'huckabee', 'baptist', 'preacher', 'entering', 'politics', 'visiting', 'pastors', 'flew', 'lynchburg', 'weekend', 'invitation', 'american', 'renewal', 'project', 'well', 'funded', 'nonprofit', 'group', 'encourages', 'evangelical', 'christians', 'engage', 'civic', 'arena', 'voter', 'guides', 'get', 'vote', 'drives', 'programs', 'train', 'pastors', 'grass', 'roots', 'activism', 'group', 'founder', 'david', 'lane', 'built', 'pastor', 'network', 'politically', 'important', 'states', 'iowa', 'missouri', 'ohio', 'south', 'carolina', 'led', 'trips', 'israel', 'paul', 'others', 'seeking', 'make', 'inroads', 'evangelical', 'activists', 'group', 'lane', 'invited', 'lynchburg', 'included', 'donald', 'wild\xadmon', 'retired', 'minister', 'founder', 'american', 'family', 'association', 'prominent', 'evangelical', 'activist', 'group', 'influence', 'network', '140', 'christian', 'radio', 'stations', 'pastors', 'lane', 'organization', 'brought', 'lynchburg', 'met', 'jindal', 'said', 'captured', 'interest', 'recently', 'stepped', 'forward', 'defend', 'phil', 'robertson', 'patriarch', 'duck', 'dynasty', 'television', 'show', 'family', 'amid', 'controversy', 'disparaging', 'remarks', 'made', 'gays', 'interview', 'gq', 'magazine', 'throughout', 'lynchburg', 'visit', 'jindal', 'presented', 'willing', 'culture', 'warrior', 'commencement', 'address', 'saturday', 'took', 'cause', 'twin', 'brothers', 'whose', 'hgtv', 'reality', 'series', 'renovating', 'reselling', 'houses', 'flip', 'forward', 'canceled', 'last', 'week', 'web', 'site', 'revealed', 'protested', 'sex', 'marriage', '2012', 'democratic', 'national', 'convention', 'charlotte', 'siblings', 'jason', 'david', 'benham', 'liberty', 'graduates', 'attended', 'graduation', 'private', 'lunch', 'jindal', 'called', 'action', 'another', 'demonstration', 'intolerance', 'entertainment', 'industry', 'guys', 'protested', 'republican', 'party', 'convention', 'instead', 'canceling', 'show', 'hgtv', 'would', 'probably', 'given', 'raise', 'jindal', 'said', 'liberty', 'crowd', 'applauded', 'cited', 'hobby', 'lobby', 'craft', 'store', 'chain', 'faced', 'legal', 'challenge', 'refusing', 'provide', 'employees', 'insurance', 'coverage', 'contraceptives', 'required', 'affordable', 'care', 'act', 'members', 'family', 'owns', 'hobby', 'lobby', 'become', 'heroes', 'many', 'religious', 'conservatives', 'said', 'morally', 'opposed', 'use', 'certain', 'types', 'birth', 'control', 'considered', 'requirement', 'violation', 'first', 'amendment', 'right', 'religious', 'freedom', 'family', 'committed', 'honor', 'lord', 'generous', 'employers', 'paying', 'well', 'minimum', 'wage', 'increasing', 'salaries', 'four', 'years', 'row', 'even', 'midst', 'enduring', 'recession', 'jindal', 'told', 'liberty', 'graduates', 'none', 'matters', 'obama', 'administration', 'pastors', 'came', 'see', 'jindal', 'action', 'governor', 'story', 'highlight', 'weekend', 'many', 'ways', 'unlike', 'aspiring', 'president', 'activists', 'met', 'piyush', 'jindal', 'born', '1971', 'four', 'months', 'parents', 'arrived', 'baton', 'rouge', 'la', 'native', 'india', 'changed', 'name', 'bobby', 'young', 'boy', 'adopting', 'name', 'character', 'favorite', 'television', 'show', 'brady', 'bunch', 'decision', 'become', 'christian', 'told', 'pastors', 'come', 'one', 'moment', 'lightning', 'epiphany', 'instead', 'said', 'happened', 'phases', 'growing', 'small', 'seeds', 'planted', 'time', 'jindal', 'recalled', 'closest', 'friend', 'grade', 'school', 'gave', 'bible', 'name', 'emblazoned', 'gold', 'cover', 'christmas', 'present', 'struck', 'initially', 'unimpressive', 'gift', 'jindal', 'told', 'pastors', 'world', 'would', 'spend', 'good', 'money', 'bible', 'everyone', 'knows', 'get', 'one', 'free', 'hotel', 'recalled', 'thinking', 'time', 'gold', 'lettering', 'meant', 'give', 'away', 'return', 'religious', 'education', 'reached', 'higher', 'plane', 'junior', 'year', 'high', 'school', 'told', 'dinner', 'audience', 'wanted', 'ask', 'pretty', 'girl', 'date', 'hallway', 'conversation', 'started', 'talking', 'faith', 'god', 'opposition', 'abortion', 'girl', 'invited', 'visit', 'church', 'jindal', 'said', 'skeptical', 'set', 'investigate', 'fanciful', 'claims', 'made', 'girl', 'friends', 'started', 'reading', 'bible', 'closet', 'home', 'unsure', 'parents', 'would', 'react', 'said', 'stirring', 'moment', 'saw', 'christ', 'depicted', 'cross', 'religious', 'movie', 'bible', 'existence', 'suddenly', 'seemed', 'clearer', 'jindal', 'told', 'pastors', 'jindal', 'dwell', 'subsequent', 'conversion', 'catholicism', 'years', 'later', 'college', 'said', 'immersed', 'traditions', 'church', 'touched', 'briefly', 'commencement', 'address', 'noting', 'passing', 'best', 'described', 'evangelical', 'catholic', 'mostly', 'sought', 'showcase', 'ways', 'shares', 'values', 'christian', 'conservatives', 'read', 'words', 'jesus', 'christ', 'realized', 'true', 'jindal', 'told', 'graduates', 'saturday', 'offering', 'less', 'detailed', 'accounting', 'conversion', 'done', 'night', 'pastors', 'used', 'think', 'found', 'god', 'believe', 'accurate', 'say', 'found']",0
9,"['may', 'brexit', 'offer', 'would', 'hurt', 'cost', 'eu', 'citizens', 'eu', 'parliament']","['brussels', 'reuters', 'british', 'prime', 'minister', 'theresa', 'may', 'offer', 'settled', 'status', 'eu', 'residents', 'flawed', 'leave', 'fewer', 'rights', 'brexit', 'european', 'parliament', 'brexit', 'coordinator', 'said', 'tuesday', 'family', 'five', 'could', 'face', 'bill', '360', 'pounds', 'acquire', 'new', 'status', 'guy', 'verhofstadt', 'told', 'may', 'brexit', 'secretary', 'david', 'davis', 'letter', 'seen', 'reuters', 'significant', 'amount', 'family', 'low', 'income', 'listing', 'three', 'concerns', 'eu', 'legislature', 'must', 'approve', 'treaty', 'march', '2019', 'exit', 'verhofstadt', 'told', 'davis', 'proposals', 'eu', 'citizens', 'definitely', 'notice', 'deterioration', 'status', 'result', 'brexit', 'parliament', 'aim', 'along', 'eu', 'citizens', 'uk', 'citizens', 'eu', '27', 'notice', 'difference', 'verhofstadt', 'former', 'belgian', 'prime', 'minister', 'wrote', 'response', 'davis', 'written', 'parliament', 'complained', 'last', 'week', 'remained', 'major', 'issues', 'settled', 'rights', '3', 'million', 'eu', 'citizens', 'britain', 'tuesday', 'told', 'reporters', 'parliament', 'determined', 'expatriates', 'become', 'victims', 'brexit', 'may', 'unveiled', 'details', 'last', 'week', 'system', 'aimed', 'giving', 'people', 'already', 'britain', 'quick', 'cheap', 'way', 'asserting', 'rights', 'stay', 'indefinitely', 'issue', 'along', 'much', 'britain', 'owes', 'new', 'eu', 'uk', 'border', 'across', 'ireland', 'one', 'eu', 'wants', 'outline', 'agreement', 'opening', 'talks', 'future', 'trade', 'verhofstadt', 'said', 'lawmakers', 'dismissing', 'british', 'efforts', 'streamline', 'applications', 'saw', 'flaws', 'nature', 'settled', 'status', 'well', 'cost', 'similar', 'acquiring', 'british', 'passport', 'cited', 'three', 'others', 'europeans', 'simply', 'declare', 'whole', 'household', 'resident', 'without', 'needing', 'application', 'process', 'burden', 'proof', 'british', 'authorities', 'deny', 'rights', 'stringent', 'conditions', 'criminal', 'records', 'could', 'mean', 'eu', 'residents', 'including', 'permanent', 'resident', 'status', 'deported', 'failing', 'gain', 'settled', 'status', 'eu', 'residents', 'would', 'lose', 'rights', 'bring', 'relatives', 'britain', 'new', 'status', 'would', 'give', 'rights', 'british', 'people', 'fewer', 'rights', 'eu', 'citizens']",0
10,"['schumer', 'calls', 'trump', 'appoint', 'official', 'oversee', 'puerto', 'rico', 'relief']","['washington', 'reuters', 'charles', 'schumer', 'top', 'democrat', 'u', 'senate', 'called', 'president', 'donald', 'trump', 'sunday', 'name', 'single', 'official', 'oversee', 'coordinate', 'relief', 'efforts', 'hurricane', 'ravaged', 'puerto', 'rico', 'schumer', 'along', 'representatives', 'nydia', 'velàzquez', 'jose', 'serrano', 'said', 'ceo', 'response', 'recovery', 'needed', 'manage', 'complex', 'ongoing', 'federal', 'response', 'territory', 'millions', 'americans', 'remain', 'without', 'power', 'supplies', 'statement', 'schumer', 'said', 'current', 'federal', 'response', 'hurricane', 'maria', 'impact', 'island', 'disorganized', 'slow', 'footed', 'mismanaged', 'person', 'ability', 'bring', 'federal', 'agencies', 'together', 'cut', 'red', 'tape', 'public', 'private', 'side', 'help', 'turn', 'lights', 'back', 'get', 'clean', 'water', 'flowing', 'help', 'bring', 'recovery', 'millions', 'americans', 'gone', 'long', 'worst', 'conditions', 'said', 'white', 'house', 'immediately', 'respond', 'request', 'comment', 'democrats', 'contended', 'naming', 'lone', 'individual', 'manage', 'government', 'relief', 'efforts', 'critical', 'particularly', 'given', 'federal', 'emergency', 'management', 'agency', 'already', 'stretched', 'thin', 'dealing', 'crises', 'aftermath', 'hurricane', 'harvey', 'texas', 'wildfires', 'california', 'severity', 'puerto', 'rico', 'crisis', 'million', 'people', 'clean', 'water', 'millions', 'without', 'power', 'nearly', 'month', 'hurricane', 'maria', 'made', 'landfall', 'demand', 'single', 'person', 'focus', 'exclusively', 'relief', 'recovery', 'democrats', 'said', 'forty', 'nine', 'people', 'died', 'puerto', 'rico', 'officially', 'dozens', 'missing', 'hurricane', 'extensive', 'damage', 'island', 'power', 'grid', 'destroying', 'homes', 'roads', 'vital', 'infrastructure', 'bankrupt', 'territory', 'struggling', 'provide', 'basic', 'services', 'like', 'running', 'water', 'pay', 'bills', 'tragically', 'clear', 'administration', 'caught', 'flat', 'footed', 'maria', 'hit', 'puerto', 'rico', 'said', 'velàzquez', 'appointing', 'ceo', 'response', 'recovery', 'last', 'put', 'one', 'person', 'authority', 'charge', 'manage', 'response', 'ensure', 'finally', 'getting', 'people', 'puerto', 'rico', 'aid', 'need', 'thursday', 'trump', 'said', 'federal', 'response', '10', 'scale', 'one', '10', 'meeting', 'puerto', 'rico', 'governor', 'ricardo', 'rossello', 'governor', 'asked', 'white', 'house', 'congress', 'least', '4', '6', 'billion', 'block', 'grants', 'types', 'funding', 'senator', 'marco', 'rubio', 'called', 'congress', 'modify', '18', '7', 'billion', 'aid', 'package', 'areas', 'damaged', 'recent', 'swath', 'hurricanes', 'ensure', 'puerto', 'rico', 'quickly', 'access', 'funds']",0
12,"['change', 'expected', 'espn', 'political', 'agenda', 'despite', 'huge', 'subscriber', 'decline', 'breitbart']","['sports', 'fans', 'turn', 'espn', 'protest', 'network', 'social', 'political', 'agenda', 'parent', 'company', 'disney', 'decision', 'extend', 'current', 'ceo', 'bob', 'iger', 'contract', '2019', 'means', 'world', 'leader', 'sports', 'entertainment', 'continue', 'merry', 'way', 'alienating', 'large', 'segment', 'americans', 'breitbart', 'reported', 'november', 'espn', 'lost', '3', '2', 'million', 'subscribers', 'year', 'moreover', 'last', 'years', 'sports', 'network', 'lost', '300', '000', 'subscribers', 'month', 'october', 'alone', 'beleaguered', 'network', 'lost', '621', '000', 'subscribers', 'decision', 'keep', 'iger', 'instead', 'moving', 'another', 'direction', 'promoting', 'coo', 'thomas', 'staggs', 'spot', 'number', 'substantial', 'business', 'implications', 'espn', 'awful', 'announcing', 'reports', 'outside', 'least', 'seems', 'like', 'iger', 'quite', 'satisfied', 'espn', 'leadership', 'network', 'path', 'fronts', 'specifically', 'regularly', 'spoken', 'important', 'espn', 'skinny', 'bundles', 'including', 'streaming', 'options', 'sling', 'directv', 'espn', 'forthcoming', 'offering', 'views', 'necessarily', 'shared', 'different', 'ceo', 'staggs', 'example', 'spoke', 'much', 'importance', 'traditional', 'bundle', 'disney', 'ceo', 'views', 'might', 'focus', 'side', 'conversely', 'another', 'disney', 'ceo', 'might', 'want', 'espn', 'quickly', 'go', 'larger', 'way', 'standalone', 'access', 'espn', 'content', 'including', 'tv', 'channels', 'rather', 'starting', 'light', 'approach', 'iger', 'seems', 'promoting', 'putting', 'circumstances', 'aside', 'crux', 'espn', 'continuing', 'political', 'social', 'agenda', 'derives', 'fact', 'iger', 'strident', 'lifelong', 'democrat', 'one', 'committed', 'fact', 'disney', 'stalwart', 'hillary', 'clinton', 'fundraiser', 'last', 'summer', 'along', 'supporters', 'entertainment', 'mogul', 'haim', 'saban', 'chernin', 'group', 'chairman', 'ceo', 'peter', 'chernin', 'entertainment', 'sports', 'executive', 'casey', 'wasserman', 'get', 'idea', 'iger', 'taste', 'social', 'issues', 'one', 'proposed', 'entertainment', 'ideas', 'filmic', 'version', 'girls', 'popular', 'television', 'show', 'starring', 'lena', 'dunham', 'somewhat', 'surprisingly', 'chagrin', 'many', 'supporters', 'shareholders', 'iger', 'serving', 'president', 'donald', 'trump', 'strategic', 'policy', 'forum', 'council', 'ceos', 'iger', 'agrees', 'trump', 'shutting', 'certain', 'tax', 'loopholes', 'strongly', 'supports', 'lowering', 'corporate', 'tax', 'rates', 'surprise', 'earlier', 'month', 'iger', 'reportedly', 'defended', 'participation', 'trump', 'mastermind', 'group', 'asserting', 'think', 'opportunity', 'express', 'views', 'think', 'value', 'company', 'shareholders', 'insisting', 'iger', 'informs', 'adversarial', 'view', 'administration', 'includes', 'immigration', 'yet', 'ceo', 'argues', 'claims', 'asserting', 'espn', 'politically', 'slanted', 'complete', 'exaggeration', 'said', 'looks', 'like', 'espn', 'cable', 'subscription', 'downward', 'spiral', 'persist', 'long', 'iger', 'remains', 'disney', 'top', 'executive']",0
13,"['billionaire', 'odebrecht', 'brazil', 'scandal', 'released', 'house', 'arrest']","['rio', 'de', 'janeiro', 'sao', 'paulo', 'reuters', 'billionaire', 'marcelo', 'odebrecht', 'highest', 'profile', 'executive', 'imprisoned', 'brazil', 'massive', 'graft', 'scandal', 'released', 'jail', 'tuesday', 'continue', 'sentence', 'corruption', 'house', 'arrest', 'according', 'federal', 'court', 'former', 'chief', 'executive', 'officer', 'odebrecht', 'sa', 'odbes', 'ul', 'latin', 'america', 'largest', 'construction', 'firm', 'arrested', '2015', 'investigation', 'dubbed', 'car', 'wash', 'exposed', 'billions', 'dollars', 'kickbacks', 'politicians', 'executives', 'state', 'run', 'companies', 'exchange', 'inflated', 'contracts', 'odebrecht', 'set', 'travel', 'sao', 'paulo', 'begin', 'house', 'arrest', 'electronic', 'surveillance', 'tuesday', 'according', 'federal', 'court', 'parana', 'representative', 'former', 'executive', 'said', 'remained', 'committed', 'collaborating', 'authorities', 'leniency', 'deal', 'odebrecht', 'first', 'sentenced', '19', 'years', 'prison', 'one', 'many', 'cases', 'related', 'car', 'wash', 'reduced', '10', 'years', 'signed', 'leniency', 'deal', 'last', 'december', 'exchange', 'paying', 'nearly', '2', 'billion', 'fine', 'admitting', 'guilt', 'providing', 'evidence', 'authorities', 'already', 'served', 'two', 'half', 'years', 'prison', 'deal', 'must', 'serve', 'another', 'two', 'half', 'years', 'house', 'arrest', 'permitted', 'leave', 'home', 'work', 'another', 'two', 'half', 'years', 'required', 'community', 'service', 'rest', '10', 'year', 'sentence', 'separately', 'tuesday', 'brazil', 'antitrust', 'watchdog', 'cade', 'said', 'investigating', 'two', 'alleged', 'cartels', 'involved', 'bidding', 'sao', 'paulo', 'infrastructure', 'projects', 'receiving', 'information', 'provided', 'odebrecht', 'executives']",0
15,"['u', 'n', 'seeks', 'humanitarian', 'pause', 'sanaa', 'streets', 'battlegrounds']","['geneva', 'reuters', 'united', 'nations', 'called', 'monday', 'humanitarian', 'pause', 'yemeni', 'capital', 'sanaa', 'tuesday', 'allow', 'civilians', 'leave', 'homes', 'aid', 'workers', 'reach', 'wounded', 'get', 'medical', 'care', 'jamie', 'mcgoldrick', 'u', 'n', 'humanitarian', 'coordinator', 'yemen', 'said', 'statement', 'streets', 'sanaa', 'become', 'battlegrounds', 'aid', 'workers', 'remain', 'lockdown', 'thus', 'call', 'parties', 'conflict', 'urgently', 'enable', 'humanitarian', 'pause', 'tuesday', '5', 'december', '10', '00', '16', '00', 'p', 'allow', 'civilians', 'leave', 'homes', 'seek', 'assistance', 'protection', 'facilitate', 'movement', 'aid', 'workers', 'ensure', 'continuity', 'life', 'saving', 'programs', 'said', 'mcgoldrick', 'warned', 'warring', 'parties', 'deliberate', 'attacks', 'civilians', 'civilian', 'medical', 'infrastructure', 'clear', 'violations', 'international', 'humanitarian', 'law', 'may', 'constitute', 'war', 'crimes']",0
17,"['second', 'judge', 'says', 'clinton', 'email', 'setup', 'may', 'bad', 'faith']","['new', 'york', 'reuters', 'second', 'federal', 'judge', 'taken', 'rare', 'step', 'allowing', 'group', 'suing', 'records', 'hillary', 'clinton', 'time', 'u', 'secretary', 'state', 'seek', 'sworn', 'testimony', 'officials', 'saying', 'evidence', 'government', 'wrong', 'bad', 'faith', 'language', 'judge', 'royce', 'lamberth', 'order', 'undercut', 'democratic', 'presidential', 'contender', 'assertion', 'allowed', 'set', 'private', 'email', 'server', 'home', 'work', 'country', 'top', 'diplomat', 'arrangement', 'particularly', 'unusual', 'described', 'clinton', 'email', 'arrangement', 'extraordinary', 'order', 'filed', 'tuesday', 'federal', 'district', 'court', 'washington', 'referring', 'state', 'department', 'clinton', 'clinton', 'aides', 'said', 'constantly', 'shifting', 'admissions', 'government', 'former', 'government', 'officials', 'spokesmen', 'clinton', 'immediately', 'respond', 'request', 'comment', 'case', 'civil', 'matter', 'order', 'adds', 'legal', 'uncertainty', 'overshadowed', 'clinton', 'campaign', 'democratic', 'nominee', 'nov', '8', 'presidential', 'election', 'fbi', 'also', 'conducting', 'criminal', 'inquiry', 'arrangement', 'emerged', 'classified', 'government', 'secrets', 'ended', 'clinton', 'unsecured', 'email', 'account', 'clinton', 'said', 'think', 'charged', 'crime', 'lamberth', 'order', 'granted', 'request', 'judicial', 'watch', 'conservative', 'watchdog', 'group', 'suing', 'department', 'open', 'records', 'laws', 'gather', 'evidence', 'including', 'sworn', 'testimony', 'group', 'filed', 'several', 'lawsuits', 'including', 'one', 'seeking', 'records', '2012', 'attack', 'benghazi', 'libya', 'killed', 'u', 'ambassador', 'christopher', 'stevens', 'three', 'americans', 'evidence', 'government', 'wrong', 'bad', 'faith', 'limited', 'discovery', 'appropriate', 'even', 'though', 'exceedingly', 'rare', 'foia', 'freedom', 'information', 'cases', 'lamberth', 'noted', 'order', 'government', 'normally', 'given', 'benefit', 'doubt', 'properly', 'searched', 'produced', 'records', 'since', 'email', 'arrangement', 'came', 'public', 'knowledge', 'year', 'ago', 'state', 'department', 'found', 'defending', 'clinton', 'scores', 'lawsuits', 'groups', 'individuals', 'news', 'outlets', 'say', 'wrongly', 'denied', 'access', 'clinton', 'federal', 'records', 'clinton', 'left', 'department', '2013', 'return', 'email', 'records', 'government', 'nearly', 'two', 'years', 'later', 'last', 'month', 'judge', 'emmet', 'sullivan', 'overseeing', 'separate', 'judicial', 'watch', 'lawsuit', 'clinton', 'related', 'records', 'allowed', 'similar', 'motion', 'discovery', 'story', 'refiles', 'fix', 'date', 'presidential', 'election', 'paragraph', '6']",0
24,"['supreme', 'court', 'hear', 'appeal', 'texas', 'voter', 'id', 'case', 'new', 'york', 'times']","['washington', '—', 'supreme', 'court', 'rejected', 'monday', 'appeal', 'texas', 'officials', 'seeking', 'restore', 'state', 'strict', 'voter', 'id', 'law', 'court', 'custom', 'brief', 'order', 'case', 'abbott', 'v', 'veasey', 'gave', 'reasons', 'turning', 'appeal', 'chief', 'justice', 'john', 'g', 'roberts', 'jr', 'issued', 'unusual', 'statement', 'explaining', 'supreme', 'court', 'remains', 'free', 'consider', 'case', 'proceedings', 'lower', 'courts', 'texas', 'law', 'enacted', '2011', 'requires', 'voters', 'seeking', 'cast', 'ballots', 'polls', 'present', 'photo', 'identification', 'like', 'texas', 'driver', 'gun', 'license', 'military', 'id', 'passport', 'federal', 'courts', 'repeatedly', 'ruled', 'law', 'racially', 'discriminatory', 'texas', 'law', 'first', 'blocked', 'section', '5', 'federal', 'voting', 'rights', 'act', 'required', 'states', 'localities', 'history', 'discrimination', 'obtain', 'federal', 'permission', 'changing', 'voting', 'procedures', 'supreme', 'court', 'effectively', 'struck', 'section', '5', '2013', 'shelby', 'county', 'v', 'holder', 'alabama', 'case', 'texas', 'officials', 'announced', 'would', 'start', 'enforcing', 'id', 'law', 'trial', '2014', 'judge', 'nelva', 'gonzales', 'ramos', 'federal', 'district', 'court', 'corpus', 'christi', 'struck', 'law', 'oct', '9', 'opinion', 'said', 'adopted', 'unconstitutional', 'discriminatory', 'purpose', 'effect', 'disenfranchise', 'disproportionate', 'number', 'hispanics', '2015', 'panel', 'united', 'states', 'court', 'appeals', 'fifth', 'circuit', 'new', 'orleans', 'vacated', 'part', 'ruling', 'concerning', 'law', 'purpose', 'affirmed', 'part', 'concerning', 'effect', 'last', 'july', 'full', 'fifth', 'circuit', 'largely', 'adopted', 'panel', 'distinctions', 'reasoning', 'returned', 'case', 'trial', 'court', 'consider', 'appropriate', 'remedy', 'texas', 'officials', 'nonetheless', 'asked', 'supreme', 'court', 'review', 'appeals', 'court', 'ruling', 'immediately', 'people', 'groups', 'challenging', 'law', 'officials', 'said', 'presented', 'evidence', 'law', 'resulted', 'diminished', 'minority', 'political', 'participation', 'prevented', 'even', 'single', 'person', 'voting', 'challengers', 'responded', '600', '000', 'registered', 'voters', 'texas', '5', 'percent', 'registered', 'voters', 'state', 'lacked', 'required', 'forms', 'id', 'law', 'said', 'disproportionately', 'affects', 'minority', 'voters', 'statistically', 'significant', 'rate', 'monday', 'chief', 'justice', 'roberts', 'wrote', 'case', 'would', 'better', 'suited', 'supreme', 'court', 'review', 'lower', 'courts', 'completed', 'work', 'ken', 'paxton', 'texas', 'attorney', 'general', 'said', 'disappointed', 'court', 'immediately', 'take', 'case', 'chief', 'justice', 'roberts', 'made', 'clear', 'case', 'even', 'stronger', 'posture', 'supreme', 'court', 'review', 'proceedings', 'lower', 'courts', 'mr', 'paxton', 'said', 'statement', 'texas', 'enacted', 'common', 'sense', 'voter', 'id', 'law', 'safeguard', 'integrity', 'elections', 'continue', 'fight', 'law', 'j', 'gerald', 'hebert', 'campaign', 'legal', 'center', 'represents', 'voters', 'challenging', 'law', 'said', 'pleased', 'justices', 'recognize', 'case', 'merit', 'review', 'time', 'texas', 'ranks', 'poorly', 'voter', 'participation', 'work', 'ensure', 'every', 'eligible', 'voter', 'state', 'able', 'cast', 'ballot', 'going', 'forward', 'mr', 'hebert', 'said', 'statement']",0
25,"['islamic', 'state', 'driven', 'last', 'stronghold', 'northern', 'iraq']","['baghdad', 'reuters', 'iraqi', 'forces', 'announced', 'thursday', 'captured', 'islamic', 'state', 'last', 'stronghold', 'northern', 'iraq', 'leaving', 'militant', 'group', 'holed', 'near', 'syrian', 'border', 'self', 'proclaimed', 'caliphate', 'shrinks', 'town', 'hawija', 'surrounding', 'areas', 'fell', 'offensive', 'u', 'backed', 'iraqi', 'government', 'troops', 'iranian', 'trained', 'armed', 'shi', 'ite', 'paramilitary', 'groups', 'known', 'popular', 'mobilisation', 'fighting', 'took', 'place', 'north', 'east', 'town', 'militants', 'surrounded', 'fall', 'hawija', 'lies', 'near', 'kurdish', 'held', 'oil', 'city', 'kirkuk', 'area', 'remains', 'control', 'islamic', 'state', 'iraq', 'stretch', 'alongside', 'western', 'border', 'syria', 'militant', 'group', 'also', 'retreat', 'capture', 'hawija', 'boost', 'iraqi', 'government', 'faces', 'separate', 'crisis', 'north', 'country', 'kurdish', 'minority', 'last', 'month', 'voted', 'overwhelmingly', 'support', 'independence', 'autonomous', 'region', 'hawija', 'state', 'tv', 'showed', 'footage', 'iraqi', 'forces', 'raising', 'flags', 'one', 'town', 'squares', 'humvees', 'patrolled', 'empty', 'streets', 'littered', 'car', 'wrecks', 'houses', 'riddled', 'bullets', 'shattered', 'storefronts', 'graphics', 'iraqi', 'kurds', 'independence', 'push', 'army', '9th', 'armored', 'division', 'federal', 'police', 'emergency', 'response', 'division', 'popular', 'mobilisation', 'liberated', 'hawija', 'said', 'statement', 'joint', 'operations', 'commander', 'lieutenant', 'general', 'abdul', 'ameer', 'rasheed', 'yarallah', 'thick', 'black', 'smoke', 'rose', 'areas', 'surrounding', 'hawija', 'oil', 'wells', 'set', 'fire', 'militants', 'prevent', 'air', 'detection', 'capture', 'hawija', 'brings', 'iraqi', 'forces', 'direct', 'contact', 'kurdish', 'peshmerga', 'fighters', 'control', 'kirkuk', 'multi', 'ethnic', 'region', 'claimed', 'baghdad', 'kurdistan', 'regional', 'government', 'krg', 'kirkuk', 'shaped', 'flashpoint', 'last', 'month', 'krg', 'included', 'city', 'referendum', 'kurdish', 'independence', 'northern', 'iraq', 'want', 'aggression', 'confrontations', 'federal', 'authority', 'must', 'imposed', 'disputed', 'areas', 'iraqi', 'prime', 'minister', 'haider', 'al', 'abadi', 'told', 'news', 'conference', 'paris', 'french', 'president', 'emmanuel', 'macron', 'macron', 'offered', 'mediate', 'iraqi', 'government', 'kurdish', 'regional', 'authorities', 'promised', 'maintain', 'military', 'presence', 'islamic', 'state', 'defeated', 'iraqi', 'government', 'imposed', 'sanctions', 'response', 'independence', 'vote', 'abadi', 'said', 'could', 'understand', 'kurds', 'aspirations', 'provided', 'expressed', 'accordance', 'constitution', 'outcome', 'referendum', 'raised', 'concerns', 'iraq', 'abroad', 'conflict', 'might', 'break', 'along', 'ethnic', 'lines', 'potentially', 'weakening', 'campaign', 'islamic', 'state', 'turkey', 'battling', 'three', 'decade', 'old', 'insurgency', 'kurdish', 'militants', 'led', 'regional', 'opposition', 'iraqi', 'kurds', 'independence', 'hopes', 'stepping', 'rhetoric', 'issue', 'president', 'tayyip', 'erdogan', 'said', 'turkey', 'would', 'soon', 'close', 'border', 'northern', 'iraq', 'shut', 'air', 'space', 'response', 'referendum', 'added', 'turkey', 'iran', 'iraq', 'would', 'jointly', 'decide', 'closing', 'flow', 'oil', 'northern', 'iraq', 'vital', 'source', 'earnings', 'iraqi', 'kurdistan', 'wednesday', 'russian', 'president', 'vladimir', 'putin', 'whose', 'country', 'big', 'energy', 'interests', 'kurdistan', 'said', 'one', 'interest', 'cut', 'oil', 'supplies', 'territory', 'erdogan', 'said', 'decision', 'made', 'closing', 'oil', 'taps', 'region', 'made', 'us', 'turkey', 'iran', 'iraq', 'central', 'government', 'together', 'erdogan', 'stance', 'backed', 'iran', 'meeting', 'erdogan', 'tehran', 'wednesday', 'supreme', 'leader', 'ayatollah', 'ali', 'khamenei', 'said', 'iran', 'turkey', 'prevent', 'iraqi', 'kurds', 'declaring', 'independence', 'relations', 'cool', 'shi', 'ite', 'iran', 'mainly', 'sunni', 'turkey', 'alarmed', 'independence', 'vote', 'fearing', 'encourage', 'separatism', 'among', 'kurdish', 'populations', 'kurds', 'region', 'fourth', 'largest', 'ethnic', 'group', 'spread', 'across', 'iran', 'turkey', 'syria', 'iraq', 'oppose', 'moves', 'towards', 'kurdish', 'state', 'iraq', 'launched', 'offensive', 'sept', '21', 'dislodge', 'islamic', 'state', 'hawija', 'area', '78', '000', 'people', 'estimated', 'trapped', 'according', 'united', 'nations', 'militants', 'control', 'border', 'town', 'al', 'qaim', 'region', 'surrounding', 'also', 'hold', 'parts', 'syrian', 'side', 'border', 'area', 'control', 'shrinking', 'retreat', 'face', 'two', 'different', 'sets', 'hostile', 'forces', 'u', 'backed', 'kurdish', 'led', 'coalition', 'syrian', 'government', 'troops', 'foreign', 'shi', 'ite', 'militias', 'backed', 'iran', 'russia', 'islamic', 'state', 'cross', 'border', 'caliphate', 'effectively', 'collapsed', 'july', 'u', 'backed', 'iraqi', 'forces', 'captured', 'mosul', 'group', 'de', 'facto', 'capital', 'iraq', 'grueling', 'battle', 'lasted', 'nine', 'months', 'militants', 'leader', 'abu', 'bakr', 'al', 'baghdadi', 'declared', 'caliphate', 'mosul', 'mid', '2014', 'released', 'audio', 'recording', 'last', 'week', 'indicated', 'alive', 'several', 'reports', 'killed', 'urged', 'followers', 'keep', 'fight', 'despite', 'setbacks']",0
26,"['senators', 'propose', 'giving', 'states', 'option', 'keep', 'affordable', 'care', 'act', 'new', 'york', 'times']","['washington', '—', 'several', 'republican', 'senators', 'monday', 'proposed', 'partial', 'replacement', 'affordable', 'care', 'act', 'would', 'allow', 'states', 'continue', 'operating', 'law', 'choose', 'proposal', 'meant', 'appeal', 'critics', 'supporters', 'former', 'president', 'barack', 'obama', 'signature', 'health', 'law', 'plan', 'attacked', 'democrats', 'step', 'back', 'affordable', 'care', 'act', 'protections', 'unlikely', 'win', 'acceptance', 'conservative', 'republicans', 'want', 'get', 'rid', 'law', 'tax', 'increases', 'soon', 'possible', 'anything', 'proposal', '—', 'senators', 'bill', 'cassidy', 'louisiana', 'medical', 'doctor', 'susan', 'collins', 'maine', 'moderate', 'republican', '—', 'may', 'show', 'difficult', 'republicans', 'enact', 'replacement', 'affordable', 'care', 'act', 'legislation', 'pass', 'muster', 'conservative', 'house', 'may', 'win', 'enough', 'support', 'senate', 'bill', 'broad', 'appeal', 'senate', 'may', 'fail', 'house', 'proposal', 'states', 'could', 'stay', 'affordable', 'care', 'act', 'could', 'receive', 'similar', 'amount', 'federal', 'money', 'consumers', 'could', 'use', 'pay', 'medical', 'care', 'health', 'insurance', 'moving', 'locus', 'repeal', 'state', 'government', 'mr', 'cassidy', 'said', 'states', 'right', 'choose', 'proposal', 'shares', 'features', 'house', 'republican', 'proposals', 'would', 'encourage', 'greater', 'use', 'health', 'savings', 'accounts', 'eliminate', 'requirement', 'americans', 'insurance', 'pay', 'tax', 'penalty', 'option', 'states', 'keep', 'affordable', 'care', 'act', 'alive', 'rankle', 'conservative', 'republicans', 'trying', 'nearly', 'seven', 'years', 'blow', 'obamacare', 'flawed', 'failing', 'fixable', 'needs', 'fully', 'repealed', 'said', 'representative', 'mark', 'meadows', 'north', 'carolina', 'chairman', 'house', 'freedom', 'caucus', 'stalemate', 'house', 'senate', 'would', 'leave', 'place', 'mr', 'obama', 'health', 'law', 'efforts', 'president', 'trump', 'congress', 'undermine', 'could', 'send', 'health', 'insurance', 'markets', 'tailspin', 'friday', 'one', 'first', 'official', 'acts', 'president', 'mr', 'trump', 'signed', 'executive', 'order', 'could', 'allow', 'officials', 'ease', 'enforcement', 'mandate', 'requiring', 'americans', 'insurance', 'supporters', 'affordable', 'care', 'act', 'panned', 'proposal', 'millions', 'americans', 'would', 'kicked', 'plans', 'costs', 'deductibles', 'consumers', 'would', 'skyrocket', 'protections', 'people', 'conditions', 'cancer', 'would', 'gutted', 'said', 'senate', 'democratic', 'leader', 'chuck', 'schumer', 'new', 'york', 'ronald', 'f', 'pollack', 'executive', 'director', 'families', 'usa', 'consumer', 'group', 'said', 'bill', 'falls', 'way', 'short', 'providing', 'protections', 'coverage', 'people', 'affordable', 'care', 'act', 'ms', 'collins', 'said', 'bill', 'would', 'allow', 'states', 'keep', 'affordable', 'care', 'act', 'working', 'residents', 'predicted', 'states', 'would', 'choose', 'something', 'different', 'bill', 'states', 'could', 'enroll', 'people', 'would', 'otherwise', 'uninsured', 'health', 'plans', 'providing', 'basic', 'coverage', 'health', 'plans', 'intended', 'protect', 'consumers', 'catastrophic', 'medical', 'expenses', 'would', 'cover', 'generic', 'versions', 'prescription', 'drugs', 'would', 'also', 'cover', 'recommended', 'childhood', 'immunizations', 'without', 'states', 'would', 'contract', 'one', 'insurers', 'offer', 'coverage', 'consumers', 'could', 'buy', 'robust', 'coverage', 'want', 'mr', 'cassidy', 'said', 'could', 'automatically', 'enrolled', 'default', 'health', 'plans', 'providing', 'basic', 'coverage', 'state', 'could', 'say', 'eligible', 'enrolled', 'unless', 'choose', 'explained', 'passive', 'enrollment', 'would', 'provide', 'insurers', 'large', 'pool', 'customers', 'including', 'many', 'healthy', 'people', 'without', 'coercion', 'individual', 'mandate', 'mr', 'cassidy', 'said', 'think', 'could', 'cover', 'people', 'obamacare', 'mr', 'cassidy', 'said', 'although', 'acknowledged', 'effects', 'bill', 'analyzed', 'congressional', 'budget', 'office', 'serves', 'capitol', 'hill', 'official', 'scorekeeper', 'state', 'opts', 'affordable', 'care', 'act', 'many', 'federal', 'insurance', 'standards', 'established', 'law', 'would', 'longer', 'apply', 'bill', 'would', 'repeal', 'federal', 'benefit', 'mandates', 'often', 'force', 'americans', 'pay', 'coverage', 'need', 'afford', 'mr', 'cassidy', 'said', 'protections', 'would', 'remain', 'place', 'parents', 'would', 'still', 'allowed', 'keep', 'children', 'insurance', 'age', '26', 'insurers', 'could', 'impose', 'annual', 'lifetime', 'limits', 'benefits', 'bill', 'called', 'patient', 'freedom', 'act', 'would', 'eliminate', 'unpopular', 'individual', 'mandate', 'also', 'federal', 'requirement', 'larger', 'employers', 'offer', 'coverage', 'employees', 'mr', 'cassidy', 'said', 'senators', 'shelley', 'moore', 'capito', 'west', 'virginia', 'johnny', 'isakson', 'georgia', 'republicans', 'also', 'sponsors', 'bill', 'senate', 'majority', 'leader', 'mitch', 'mcconnell', 'republican', 'kentucky', '2', 'senate', 'republican', 'john', 'cornyn', 'texas', 'sponsors', 'similar', 'bill', 'mr', 'cassidy', 'introduced', '2015', 'legislative', 'landscape', 'different', 'republicans', 'congress', 'repeal', 'affordable', 'care', 'act', 'support', 'mr', 'trump', 'senate', 'need', 'help', 'democrats', 'adopt', 'replacement', 'republicans', 'eight', 'votes', 'shy', '60', 'needed', 'stop', 'filibuster']",0


Unnamed: 0,title,text,label
0,"['law', 'enforcement', 'high', 'alert', 'following', 'threats', 'cops', 'whites', '9', '11by', 'blacklivesmatter', 'fyf911', 'terrorists', 'video']","['comment', 'expected', 'barack', 'obama', 'members', 'fyf911', 'fukyoflag', 'blacklivesmatter', 'movements', 'called', 'lynching', 'hanging', 'white', 'people', 'cops', 'encouraged', 'others', 'radio', 'show', 'tuesday', 'night', 'turn', 'tide', 'kill', 'white', 'people', 'cops', 'send', 'message', 'killing', 'black', 'people', 'america', 'one', 'f', 'yoflag', 'organizers', 'called', 'sunshine', 'radio', 'blog', 'show', 'hosted', 'texas', 'called', 'sunshine', 'f', 'ing', 'opinion', 'radio', 'show', 'snapshot', 'fyf911', 'lolatwhitefear', 'twitter', 'page', '9', '53', 'p', 'shows', 'urging', 'supporters', 'call', 'fyf911', 'tonight', 'continue', 'dismantle', 'illusion', 'white', 'snapshot', 'twitter', 'radio', 'call', 'invite', 'fyf911the', 'radio', 'show', 'aired', '10', '00', 'p', 'eastern', 'standard', 'time', 'show', 'callers', 'clearly', 'call', 'lynching', 'killing', 'white', 'people', '2', '39', 'minute', 'clip', 'radio', 'show', 'heard', 'provided', 'breitbart', 'texas', 'someone', 'would', 'like', 'referred', 'hannibal', 'already', 'received', 'death', 'threats', 'result', 'interrupting', 'fyf911', 'conference', 'calls', 'unidentified', 'black', 'man', 'said', 'mother', 'f', 'kers', 'start', 'f', 'ing', 'like', 'us', 'bunch', 'ni', 'ers', 'takin', 'one', 'us', 'roll', 'said', 'cause', 'already', 'roll', 'gangs', 'anyway', 'six', 'seven', 'black', 'mother', 'f', 'ckers', 'see', 'white', 'person', 'lynch', 'ass', 'let', 'turn', 'tables', 'conspired', 'cops', 'started', 'losing', 'people', 'state', 'emergency', 'speculated', 'one', 'two', 'things', 'would', 'happen', 'big', 'ass', 'r', 'war', 'ni', 'ers', 'going', 'start', 'backin', 'already', 'getting', 'killed', 'f', 'k', 'got', 'lose', 'sunshine', 'could', 'heard', 'saying', 'yep', 'true', 'f', 'king', 'true', 'said', 'need', 'turn', 'tables', 'kids', 'getting', 'shot', 'somebody', 'needs', 'become', 'sacrifice', 'side', 'said', 'everybody', 'whatever', 'like', 'say', 'everybody', 'different', 'position', 'war', 'continued', 'give', 'f', 'k', 'anyway', 'said', 'might', 'well', 'utilized', 'turn', 'tables', 'n', 'ers', 'said', 'way', 'start', 'lookin', 'like', 'havin', 'many', 'casualties', 'causalities', 'side', 'instead', 'killing', 'black', 'people', 'black', 'lives', 'matter', 'mother', 'f', 'kers', 'got', 'make', 'matter', 'find', 'mother', 'f', 'ker', 'alone', 'snap', 'ass', 'f', 'hang', 'damn', 'tree', 'take', 'picture', 'send', 'mother', 'f', 'kers', 'need', 'one', 'example', 'people', 'start', 'watchin', 'turn', 'tables', 'said', 'said', 'start', 'trickle', 'effect', 'said', 'one', 'white', 'person', 'hung', 'flat', 'hanging', 'start', 'trickle', 'effect', 'continued', 'black', 'people', 'good', 'starting', 'trends', 'said', 'get', 'upper', 'hand', 'another', 'black', 'man', 'spoke', 'saying', 'needed', 'kill', 'cops', 'killing', 'us', 'first', 'black', 'male', 'said', 'best', 'method', 'right', 'breitbart', 'texas', 'previously', 'reported', 'sunshine', 'upset', 'racist', 'white', 'people', 'infiltrated', 'disrupted', 'one', 'conference', 'calls', 'subsequently', 'released', 'phone', 'number', 'one', 'infiltrators', 'veteran', 'immediately', 'started', 'receiving', 'threatening', 'calls', 'one', 'f', 'yoflag', 'movement', 'supporters', 'allegedly', 'told', 'veteran', 'infiltrated', 'publicly', 'posted', 'conference', 'call', 'going', 'rape', 'gut', 'pregnant', 'wife', 'f', 'ing', 'piece', 'sh', 'unborn', 'creature', 'hung', 'tree', 'breitbart', 'texas', 'previously', 'encountered', 'sunshine', 'sandra', 'bland', 'protest', 'waller', 'county', 'jail', 'texas', 'said', 'white', 'people', 'killed', 'told', 'journalists', 'photographers', 'see', 'nappy', 'ass', 'hair', 'head', 'means', 'one', 'militant', 'negroes', 'said', 'protest', 'redneck', 'mother', 'f', 'kers', 'murdered', 'sandra', 'bland', 'nappy', 'hair', 'like', 'fyf911', 'black', 'radicals', 'say', 'holding', 'imperial', 'powers', 'actually', 'responsible', 'terrorist', 'attacks', 'september', '11th', 'accountable', 'day', 'reported', 'breitbart', 'texas', 'several', 'websites', 'twitter', 'handles', 'movement', 'palmetto', 'star', 'describes', 'one', 'head', 'organizers', 'said', 'youtube', 'video', 'supporters', 'burning', 'symbols', 'illusion', 'superiority', 'false', 'white', 'supremacy', 'like', 'american', 'flag', 'british', 'flag', 'police', 'uniforms', 'ku', 'klux', 'klan', 'hoods', 'sierra', 'mcgrone', 'nocturnus', 'libertus', 'posted', 'help', 'young', 'afrikan', 'clean', 'rag', 'oppression', 'posted', 'two', 'photos', 'one', 'appears', 'photo', 'black', 'man', 'wiping', 'naked', 'butts', 'american', 'flag', 'entire', 'story', 'breitbart', 'news']",1
1,"['unbelievable', 'obama', 'attorney', 'general', 'says', 'charlotte', 'rioters', 'peaceful', 'protesters', 'home', 'state', 'north', 'carolina', 'video']","['demonstrators', 'gathered', 'last', 'night', 'exercising', 'constitutional', 'protected', 'right', 'peaceful', 'protest', 'order', 'raise', 'issues', 'create', 'change', 'loretta', 'lynch', 'aka', 'eric', 'holder', 'skirt']",1
3,"['satan', '2', 'russia', 'unvelis', 'image', 'terrifying', 'new', 'supernuke', '–', 'western', 'world', 'takes', 'notice']","['rs', '28', 'sarmat', 'missile', 'dubbed', 'satan', '2', 'replace', 'ss', '18', 'flies', '4', '3', 'miles', '7km', 'per', 'sec', 'range', '6', '213', 'miles', '10', '000km', 'weapons', 'perceived', 'part', 'increasingly', 'aggressive', 'russia', 'could', 'deliver', 'warhead', '40', 'megatons', '–', '2', '000', 'times', 'powerful', 'atom', 'bombs', 'dropped', 'hiroshima', 'nagasaki', '1945', 'libby', 'plummer', 'gareth', 'davie', 'russia', 'unveiled', 'chilling', 'pictures', 'largest', 'ever', 'nuclear', 'missile', 'capable', 'destroying', 'area', 'size', 'france', 'rs', '28', 'sarmat', 'missile', 'dubbed', 'satan', '2', 'nato', 'top', 'speed', '4', '3', 'miles', '7km', 'per', 'second', 'designed', 'outfox', 'anti', 'missile', 'shield', 'systems', 'new', 'sarmat', 'missile', 'could', 'deliver', 'warheads', '40', 'megatons', '–', '2', '000', 'times', 'powerful', 'atom', 'bombs', 'dropped', 'hiroshima', 'nagasaki', '1945', 'scroll', 'video', 'russian', 'president', 'vladimir', 'putin', 'reportedly', 'planning', 'replace', 'country', 'older', 'ss', '18', 'satan', 'weapons', 'new', 'missiles', 'amid', 'string', 'recent', 'disagreements', 'west', 'kremlin', 'stepped', 'rhetoric', 'west', 'carried', 'series', 'manoeuvres', 'infuriated', 'politicians', 'us', 'uk', 'pictures', 'revealed', 'online', 'chief', 'designers', 'makeyev', 'rocket', 'design', 'bureau', 'message', 'posted', 'alongside', 'picture', 'said', 'accordance', 'decree', 'russian', 'government', 'state', 'defense', 'order', '2010', 'planning', 'period', '2012', '2013', 'makeyev', 'rocket', 'design', 'bureau', 'instructed', 'start', 'design', 'development', 'work', 'sarmat', 'rs', '28', 'sarmat', 'missile', 'said', 'contain', '16', 'nuclear', 'warheads', 'capable', 'destroying', 'area', 'size', 'france', 'texas', 'according', 'russian', 'news', 'network', 'zvezda', 'owned', 'russia', 'ministry', 'defence', 'weapon', 'also', 'able', 'evade', 'radar', 'expected', 'range', '6', '213', 'miles', '10', '000', 'km', 'would', 'allow', 'moscow', 'attack', 'london', 'entire', 'article', 'click', 'link']",1
4,"['time', 'christian', 'group', 'sues', 'amazon', 'splc', 'designation', 'hate', 'group']","['say', 'one', 'time', 'someone', 'sued', 'southern', 'poverty', 'law', 'center', 'tuesday', 'james', 'kennedy', 'ministries', 'djkm', 'filed', 'lawsuit', 'southern', 'poverty', 'law', 'center', 'splc', 'charity', 'navigation', 'organization', 'guidestar', 'amazon', 'defamation', 'religious', 'discrimination', 'trafficking', 'falsehood', 'splc', 'listed', 'djkm', 'hate', 'group', 'guidestar', 'also', 'categorized', 'terms', 'amazon', 'kept', 'ministry', 'charity', 'donation', 'program', 'amazon', 'smile', 'embarked', 'today', 'journey', 'right', 'terrible', 'wrong', 'dr', 'frank', 'wright', 'president', 'ceo', 'djkm', 'said', 'statement', 'tuesday', 'knowingly', 'label', 'christian', 'ministries', 'hate', 'groups', 'solely', 'subscribing', 'historic', 'christian', 'faith', 'either', 'woefully', 'uninformed', 'willfully', 'deceitful', 'case', 'southern', 'poverty', 'law', 'center', 'lawsuit', 'alleges', 'latter', 'splc', 'labeled', 'djkm', 'anti', 'lgbt', 'hate', 'group', 'opposition', 'sex', 'marriage', 'transgenderism', 'false', 'illegal', 'characterizations', 'chilling', 'effect', 'free', 'exercise', 'religion', 'religious', 'free', 'speech', 'people', 'faith', 'wright', 'declared', 'given', 'splc', 'opportunity', 'retract', 'undertaken', 'legal', 'action', 'seeking', 'trial', 'jury', 'peers', 'preserve', 'rights', 'law', 'defend', 'religious', 'free', 'speech', 'rights', 'americans', 'djkm', 'president', 'concluded', 'lawsuit', 'laid', 'charges', 'splc', 'guidestar', 'amazon', 'read', 'pjm']",1
5,"['dr', 'ben', 'carson', 'targeted', 'irs', 'never', 'audit', 'spoke', 'national', 'prayer', 'breakfast']","['dr', 'ben', 'carson', 'tells', 'story', 'happened', 'spoke', 'obama']",1
6,"['sports', 'bar', 'owner', 'bans', 'nfl', 'games', 'show', 'true', 'american', 'sports', 'like', 'speak', 'rural', 'america', 'video']","['owner', 'ringling', 'bar', 'located', 'south', 'white', 'sulphur', 'springs', 'standing', 'behind', 'facebook', 'post', 'criticizes', 'nfl', 'players', 'take', 'knee', 'national', 'anthem', 'protest', 'police', 'brutality', 'post', 'made', 'ringling', 'bar', 'facebook', 'page', 'tuesday', 'night', 'since', 'received', 'hundreds', 'comments', 'shares', 'post', 'reads', 'ringling', 'bar', 'longer', 'show', 'nfl', 'games', 'allow', 'us', 'air', 'pbr', 'rodeo', 'nascar', 'events', 'whose', 'competitors', 'true', 'americans', 'sorry', 'inconvenience', 'ringling', 'bar', 'co', 'owner', 'kurt', 'bekemans', 'grew', 'paradise', 'valley', 'published', 'post', 'said', 'care', 'post', 'turns', 'customers', 'away', 'seriously', 'would', 'care', 'non', 'americans', 'patronize', 'place', 'said', 'bekemans', 'like', 'speak', 'rural', 'america', 'bet', 'see', 'farmers', 'ranchers', 'whole', 'country', 'take', 'knee', 'guys', 'bet', 'find', 'one', 'appreciate', 'great', 'nation', 'given', 'think', 'least', 'could', 'give', 'thanks', 'country', 'stand', 'flag', 'anthem', 'wednesday', 'morning', 'majority', 'comments', 'support', 'bar', 'love', 'ya', 'many', 'people', 'round', 'said', 'one', 'commenter', 'another', 'person', 'critical', 'writing', 'ha', 'ha', 'course', 'nascar', 'protesting', 'treatment', 'minority', 'care', 'read', 'krtv']",1
7,"['latest', 'pipeline', 'leak', 'underscores', 'dangers', 'dakota', 'access', 'pipeline']","['file', '–', 'sept', '15', '2005', 'file', 'photo', 'marker', 'welcomes', 'commuters', 'cushing', 'okla', 'seen', 'ap', 'photo', 'oklahoman', 'matt', 'strasen', 'file', 'underscoring', 'dangers', 'america', 'unreliable', 'fossil', 'fuel', 'infrastructure', 'significant', 'u', 'oil', 'pipeline', 'shut', 'leak', 'reported', 'monday', 'morning', 'enterprise', 'products', 'partners', 'said', 'monday', 'shut', 'seaway', 'crude', 'pipeline', '400', '000', 'barrel', 'per', 'day', 'conduit', 'transports', 'crude', 'oil', 'cushing', 'oklahoma', 'gulf', 'coast', 'refineries', 'leak', 'occurred', 'sunday', 'night', 'industrial', 'area', 'cushing', 'company', 'provide', 'estimate', 'volume', 'spilled', 'said', 'danger', 'public', 'seaway', 'personnel', 'continue', 'make', 'progress', 'cleaning', 'spill', 'substantially', 'contained', 'retention', 'pond', 'enbridge', 'facility', 'company', 'said', 'news', 'release', 'pdf', 'explaining', 'pipeline', '50', '50', 'joint', 'venture', 'enterprise', 'enbridge', 'inc', 'vacuum', 'trucks', 'used', 'recover', 'crude', 'oil', 'return', 'storage', 'tanks', 'site', 'impacted', 'segment', 'legacy', 'pipeline', 'capacity', '50', '000', 'barrels', 'release', 'added', 'however', 'actual', 'amount', 'crude', 'oil', 'released', 'significantly', 'less', 'determined', 'recovery', 'efforts', 'complete', 'incident', 'comes', 'another', 'pipeline', 'rupture', 'pennsylvania', 'early', 'friday', '55', '000', 'gallons', 'gasoline', 'poured', 'susquehanna', 'river', 'one', 'month', 'major', 'gasoline', 'pipeline', 'run', 'colonial', 'pipeline', 'co', 'halt', 'pumping', 'couple', 'weeks', 'due', 'spill', 'alabama', 'meanwhile', 'upi', 'reports', 'release', 'seaway', 'pipeline', 'second', 'associated', 'cushing', 'storage', 'hub', 'less', 'month', 'plains', 'american', 'pipeline', 'reported', 'problems', 'infrastructure', 'colorado', 'city', 'texas', 'cushing', 'earlier', 'month', 'environmentalists', 'indigenous', 'people', 'energy', 'companies', 'midst', 'heated', 'debate', 'pipeline', 'safety', 'water', 'protectors', 'allies', 'along', 'proposed', 'route', 'dakota', 'access', 'pipeline', 'dapl', 'saying', 'months', 'project', 'threatens', 'right', 'safe', 'drinking', 'water', 'oil', 'pipelines', 'break', 'spill', 'leak—it', 'question', 'question', '13', 'year', 'old', 'anna', 'lee', 'rain', 'yellowhammer', 'member', 'standing', 'rock', 'sioux', 'tribe', 'wrote', 'recent', 'appeal', 'high', 'chance', 'pipeline', 'leak', 'wrote', 'enbridge', 'backed', 'dapl', 'guess', 'oil', 'industry', 'keeps', 'pushing', 'care', 'health', 'safety', 'industry', 'seems', 'think', 'lives', 'expendable', 'others', 'indeed', 'referring', 'cushing', 'leak', 'one', 'observer', 'tweeted', 'monday', 'screaming', 'nodapl', 'always', 'break', 'seaway', 'crude', 'pipeline', 'system', 'shut', 'cushing', 'ok', 'spill', 'screaming', 'nodapl', 'always', 'break', 'https', 'co', 'oxilxcbrly', '—', 'deanna', 'rilling', 'deannarilling', 'october', '24', '2016']",1
8,"['gop', 'senator', 'smacked', 'punchable', 'alt', 'right', 'nazi', 'internet']","['punchable', 'alt', 'right', 'nazi', 'internet', 'got', 'thorough', 'beatdown', 'sen', 'ben', 'sasse', 'r', 'neb', 'twitter', 'epic', 'tweetstorm', 'richard', 'spencer', 'alt', 'right', 'leader', 'become', 'human', 'punching', 'bag', 'got', 'racism', 'smacked', 'republican', 'senator', 'thursday', 'white', 'nationalist', 'tweeted', 'goober', 'conservatives', 'blame', 'russia', 'racial', 'divisions', 'united', 'states', 'spencer', 'responding', 'tweet', 'sasse', 'sent', 'wednesday', 'sen', 'ben', 'sasse', 'shared', 'article', 'regarding', 'sen', 'james', 'lankford', 'r', 'okla', 'explained', 'russian', 'internet', 'trolls', 'helped', 'fuel', 'divisions', 'controversy', 'donald', 'trump', 'ignited', 'nfl', 'athletes', 'choose', 'kneel', 'rather', 'stand', 'national', 'anthem', 'protest', 'racial', 'inequality', 'police', 'brutality', 'one', 'loves', 'american', 'vs', 'american', 'fighting', 'putin', 'intel', 'agencies', 'stoke', 'sides', 'every', 'divide', 'https', 'co', 'h6bwjhzokh', 'ben', 'sasse', 'bensasse', 'september', '28', '2017spencer', 'responded', 'writing', 'minds', 'goober', 'conservatives', 'russians', 'blame', 'racial', 'divisions', 'minds', 'goober', 'conservatives', 'russians', 'blame', 'racial', 'divisions', 'https', 'co', 'czpgfl6u4m', 'richard', 'spencer', 'richardbspencer', 'september', '28', '2017sasse', 'tore', 'spencer', 'calling', 'clown', 'one', 'brown', 'shirt', 'pajama', 'boy', 'nazis', '1', 'oh', 'let', 'goobers', 'nongoobers', 'agree', 'racists', 'like', 'blame', 'putin', 'agencies', 'also', 'love', 'using', 'divisive', 'tool', 'https', 'co', 'dad4xanvi5', 'ben', 'sasse', 'bensasse', 'september', '28', '20172', 'get', 'wrong', 'always', 'brown', 'shirt', 'pajama', 'boy', 'nazis', 'like', 'lonely', 'pals', 'stoking', 'division', 'america', '101', 'https', 'co', 'sbovzmouu2', 'ben', 'sasse', 'bensasse', 'september', '28', '20173', 'get', 'america', 'said', 'human', 'right', 'abstract', 'thing', 'given', 'god', 'something', 'like', 'https', 'co', 'scxdgfcbgp', 'ben', 'sasse', 'bensasse', 'september', '28', '20174', 'actually', 'exactly', 'america', 'declares', 'people', 'image', 'bearers', 'god', 'created', 'dignity', 'inalienable', 'rights', 'https', 'co', 'd4orbrhjmw', 'ben', 'sasse', 'bensasse', 'september', '28', '20175', 'sadly', 'understand', 'human', 'dignity', 'person', 'skin', 'ancestry', 'bank', 'balance', 'nothing', 'intrinsic', 'value', 'https', 'co', '5jsyvakqrl', 'ben', 'sasse', 'bensasse', 'september', '28', '20176', 'declaration', 'universal', 'dignity', 'america', 'madison', 'called', 'constitution', 'greatest', 'reflection', 'human', 'nature', 'https', 'co', 'nqluvs1kva', 'ben', 'sasse', 'bensasse', 'september', '28', '20177', 'talk', 'culture', 'know', 'squat', 'western', 'heritage', 'sees', 'people', 'tribes', 'individuals', 'limitless', 'worth', 'https', 'co', 'vknvduxltt', 'ben', 'sasse', 'bensasse', 'september', '28', '20178', 'celebration', 'universal', 'dignity', 'culture', 'rejects', 'white', 'culture', 'crybaby', 'politics', 'rejects', 'identity', 'politics', 'https', 'co', 'adlj9avnpr', 'ben', 'sasse', 'bensasse', 'september', '28', '20179', 'sometime', 'moving', 'back', 'parents', 'basement', 'knock', 'nazis', 'fell', 'love', 'reheated', '20th', 'century', 'power', 'garbage', 'https', 'co', 'xdteatvgse', 'ben', 'sasse', 'bensasse', 'september', '28', '201710', 'ideas', 'hateful', 'un', 'american', 'poison', 'also', 'dang', 'boring', 'future', 'belong', 'stupid', 'memes', 'https', 'co', 'bnhslf1uox', 'ben', 'sasse', 'bensasse', 'september', '28', '201711', '11get', 'real', 'job', 'clown', 'find', 'actual', 'neighbor', 'serve', 'happier', 'nice', 'day', 'https', 'co', 'tored7vwdm', 'ben', 'sasse', 'bensasse', 'september', '28', '2017jake', 'tapper', 'weighed', 'say', 'wherein', 'piece', 'garbage', 'thrown', 'receptacle', 'grace', 'artistry', 'wherein', 'piece', 'garbage', 'thrown', 'receptacle', 'grace', 'artistry', 'https', 'co', 'l09bby8ghh', 'jake', 'tapper', 'jaketapper', 'september', '28', '2017this', 'donald', 'trump', 'responded', 'violent', 'rally', 'charlottesville', 'virginia', 'way', 'organized', 'richard', 'spencer', 'spencer', 'previously', 'said', 'felt', 'proud', 'trump', 'blamed', 'many', 'sides', 'violence', 'hate', 'rally', 'resulted', 'murder', 'heather', 'heyer', 'left', 'least', '19', 'others', 'injured', 'spencer', 'one', 'trump', 'fine', 'people', 'got', 'smacked', 'twitter', 'republican', 'like', 'politics', 'admire', 'sasse', 'among', 'republicans', 'joined', 'civil', 'rights', 'leaders', 'democrats', 'reacted', 'angrily', 'trump', 'said', 'condemned', 'egregious', 'display', 'hatred', 'bigotry', 'violence', 'many', 'sides', 'many', 'sides', 'course', 'sasse', 'would', 'call', 'roy', 'moore', 'bigoted', 'remarks', 'would', 'nice', 'see', 'photo', 'chip', 'somodevilla', 'via', 'getty', 'images']",1
11,"['watch', 'hilarious', 'ad', 'calls', 'question', 'health', 'aging', 'clinton', 'crime', 'family', 'bosses']","['watching', 'telling', 'video', 'wonder', 'instead', 'working', 'hard', 'get', 'back', 'white', 'house', 'hillary', 'time', 'would', 'better', 'spent', 'looking', 'assisted', 'living', 'situation', 'bill']",1
14,"['british', 'woman', 'loses', 'virginity', 'asylum', 'seeking', 'rapist', 'way', 'church']","['europe', 'likely', 'going', 'top', 'destination', 'families', 'young', 'daughters', 'one', 'blame', 'idiocy', 'political', 'correctness', 'death', 'europe', 'know', 'americans', 'fortitude', 'stop', 'bleeding', 'invasion', 'foreigners', 'soil', 'officially', 'control', 'predatory', 'asylum', 'seeker', 'chillingly', 'raped', '21', 'year', 'old', 'devout', 'christian', 'woman', 'next', 'church', 'starting', '10', 'year', 'jail', 'sentence', 'today', 'eritrean', 'born', 'mebrehtom', 'abrha', '25', 'stalked', 'vulnerable', 'virgin', '10', 'minutes', 'walked', 'liverpool', 'city', 'centre', 'boyfriend', 'house', '6am', 'last', 'july', '19', 'liverpool', 'crown', 'court', 'heard', 'dragged', 'pavement', 'grassy', 'area', 'raping', 'twice', 'harrowing', 'four', 'minute', 'ordeal', 'terrifying', 'attack', 'left', 'devout', 'christian', 'angry', 'god', 'fearing', 'contracted', 'hiv', 'appalled', 'judge', 'told', 'abrha', 'lived', 'liverpool', 'fleeing', 'birmingham', 'arrested', 'following', 'bbc', 'crimewatch', 'appeal', 'october', '12', 'today', 'started', 'extended', 'sentence', '10', 'years', 'imprisonment', 'five', 'years', 'licence', 'victim', 'told', 'court', 'felt', 'ashamed', 'dirty', 'unclean', 'going', 'church', 'always', 'important', 'part', 'life', 'since', 'attack', 'able', 'go', 'church', 'many', 'months', 'angry', 'god', 'angry', 'feeling', 'way', 'victim', 'also', 'admitted', 'attack', 'caused', 'end', 'relationship', 'boyfriend', 'adding', 'lost', 'desire', 'anything', 'life', 'feel', 'terrified', 'shower', 'get', 'feeling', 'someone', 'going', 'get', 'start', 'panic', 'victim', 'named', 'legal', 'reasons', 'clubbing', 'saturday', 'july', '18', 'last', 'year', 'agreed', 'meet', 'boyfriend', 'following', 'morning', 'prosecutor', 'david', 'mclachlan', 'told', 'court', 'could', 'get', 'taxi', 'opted', 'walk', 'mile', 'half', 'route', 'despite', 'warnings', 'cut', 'park', 'boyfriend', 'confronted', 'east', 'african', 'man', 'dragged', 'wooded', 'area', 'nearby', 'subjected', 'horrific', 'attack', 'ripping', 'open', 'dress', 'raping', 'chillingly', 'walking', 'away', 'afterwards', 'woman', 'covered', 'dirt', 'injuries', 'back', 'neck', 'ran', 'boyfriend', 'house', 'raised', 'alarm', 'michael', 'brien', 'defending', 'abhra', 'read', 'part', 'letter', 'apology', 'client', 'went', 'un', 'christian', 'act', 'horrible', 'thing', 'woman', 'ask', 'forgiveness', 'ahbra', 'granted', 'asylum', 'five', 'years', 'june', '2014', 'claiming', 'forced', 'flee', 'native', 'country', 'conscripted', 'eritrean', 'army', 'aged', '18', 'ahbra', 'previous', 'convictions', 'spoke', 'tigrignan', 'interpreter', 'claimed', 'memory', 'attack', 'inebriated', 'ahbra', 'held', 'hand', 'eyes', 'david', 'aubrey', 'qc', 'sentenced', 'made', 'sign', 'cross', 'led', 'custody', 'judge', 'aubrey', 'told', 'watched', 'followed', 'pursued', 'threatened', 'raped', 'somewhat', 'chillingly', 'walking', 'away', 'calmly', 'ignore', 'tragic', 'irony', 'case', 'attacked', 'next', 'church', 'afterwards', 'merseyside', 'police', 'det', 'insp', 'terry', 'davies', 'said', 'doubt', 'significant', 'impact', 'young', 'victim', 'live', 'rest', 'life', 'via', 'express', 'uk']",1


1000
4000
9000
10000
14000
16000
18000
19000
23000


## Plotting Results
### Compare Scores between algorithms (F1-Score and Accuracy)

In [None]:
matplotlib.rcParams.update({'font.size': 8})
fig, ax = plt.subplots(2, figsize=(8,8))
plt.suptitle("F1-score and accuracy of the different algorithms")
width = 0.25  # the width of the bars

def setSubplots1(input, axis_x):
    ax[axis_x].set_ylim(0.8, 1.0)
    algorithms = []
    f1s = []
    accs = []
    for element in results:
        algorithms.append(element)
        f1s.append(round(results[element][input]["f1_score"],4))
        accs.append(round(results[element][input]["accuracy"],4))

    x = np.arange(len(algorithms))  # the label locations

    rects1 = ax[axis_x].bar(x - width/2, f1s, width, label='F1-Score')
    rects2 = ax[axis_x].bar(x + width/2, accs, width, label='Accuracy')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax[axis_x].set_ylabel('Scores')
    ax[axis_x].set_title(input + ' input')
    ax[axis_x].set_xticks(x, algorithms)
    ax[axis_x].legend()

    ax[axis_x].bar_label(rects1, padding=3)
    ax[axis_x].bar_label(rects2, padding=3)

setSubplots1("text", 0)
setSubplots1("title", 1)

fig.tight_layout()

plt.show()

### Compare scores between different inputs (text and title)

In [None]:
scores = ["f1_score", "accuracy", "precision", "recall"]
x = np.arange(len(scores))  # the label locations
width = 0.25  # the width of the bars
fig, ax = plt.subplots(2, 2, figsize=(16,8))
plt.suptitle("Scores for tet and title input")

def setSubplot(algorithm, axis_x, axis_y):
    title_values = []
    text_values = []
    for element in results[algorithm]["text"]:
        text_values.append(round(results[algorithm]["text"][element],4))
    for element in results[algorithm]["title"]:
        title_values.append(round(results[algorithm]["title"][element],4))
    title_values.pop()
    text_values.pop()
    title_values.pop(0)
    text_values.pop(0)

    rects1 = ax[axis_x][axis_y].bar(x - width/2, text_values, width, label='Text')
    rects2 = ax[axis_x][axis_y].bar(x + width/2, title_values, width, label='Title')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax[axis_x][axis_y].set_ylabel('Scores')
    ax[axis_x][axis_y].set_title(algorithm)
    ax[axis_x][axis_y].set_xticks(x, scores)
    ax[axis_x][axis_y].legend()

    ax[axis_x][axis_y].bar_label(rects1, padding=3)
    ax[axis_x][axis_y].bar_label(rects2, padding=3)
    ax[axis_x][axis_y].set_ylim(0.7, 1.0)

setSubplot("simple_linear_model", 0, 0)
setSubplot("simple_convolution_model", 0, 1)
setSubplot("support_vector_machine", 1, 0)
setSubplot("random_forest", 1, 1)

fig.tight_layout()

plt.show()