# AIR Project
## Group 17

We use a labeled dataset that contains both real and fake news articles:
https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification


## Imports and specific settings

In [556]:
import ast
import os.path
import sys
from collections import OrderedDict
import torch
from torch import nn
from torch.utils.data import DataLoader
from gensim.models import Word2Vec
import multiprocessing
import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torchmetrics.classification import BinaryF1Score
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from rank_bm25 import BM25Okapi
import math
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier


In [557]:
# Show DataFrames in full when they are displayed: no row/column truncation,
# no line wrapping and no clipping of long cell contents.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# Allow cuDNN to benchmark and pick kernels for the encountered input shapes.
torch.backends.cudnn.benchmark = True
print("running models with {}".format(device))

# Intermediate artifacts (cleaned CSVs, pickles) are written below data/.
# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs("data", exist_ok=True)

running models with cuda:0


## Loading of dataset and nan-removal
Preprocessing runs automatically when data/data.csv does not exist; delete that file to redo it.
Result is saved in data/data.csv

In [558]:
def pre_process():
    """Clean the raw WELFake dataset and store it as ``data/data.csv``.

    Reads ``WELFake_Dataset.csv`` from the working directory, marks every
    ``text`` or ``title`` whose string form has at most 10 characters as
    missing, drops all rows that contain a missing value, renumbers the
    index from zero and writes the result to ``data/data.csv``. The first
    300 cleaned rows are displayed afterwards.
    """
    data = pd.read_csv('WELFake_Dataset.csv', index_col=0)
    # Vectorised length check. The previous row-by-row ``iterrows`` loop with
    # ``data.loc[i, ...]`` writes was slow and, because ``loc`` is label
    # based, would have blanked every row sharing a duplicated index label.
    for column in ("text", "title"):
        too_short = data[column].astype(str).str.len() <= 10
        data.loc[too_short, column] = np.nan

    data.dropna(inplace=True)
    data.reset_index(drop=True, inplace=True)
    data.to_csv("data/data.csv")
    # ``display`` is provided by the IPython/Jupyter environment.
    display(data[:300])

In [559]:
# Run the cleaning step only when its cached output is missing; delete
# data/data.csv to force a rerun.
if not os.path.exists("data/data.csv"):
    pre_process()

## Tokenization
Tokenization of data/data.csv runs automatically when data/data_token.csv does not exist; delete that file to redo it.
Result is saved in data/data_token.csv

In [560]:
def _stem_tokens(raw, table, stop_words, stemmer):
    """Return the lower-cased, stop-word-free, stemmed tokens of *raw*.

    *table* is a ``str.translate`` table that maps punctuation to spaces so
    that punctuation never ends up inside or between tokens.
    """
    tokens = nltk.tokenize.word_tokenize(raw.translate(table))
    lowered = (token.lower() for token in tokens)
    return [stemmer.stem(token) for token in lowered if token not in stop_words]


def tokenize():
    """Tokenise ``data/data.csv`` and write ``data/data_token.csv``.

    For every row, the title and the text are stripped of punctuation
    (ASCII punctuation plus a few typographic quote/dot characters, each
    replaced by a space), split into words with NLTK, lower-cased, filtered
    against the English stop-word list and stemmed with the Snowball
    stemmer. The output CSV has the columns ``title`` and ``text`` (token
    lists) and the unchanged ``label`` column. Every 5000th row index is
    printed as a progress indicator.
    """
    # A set gives O(1) membership tests; the NLTK list is scanned linearly.
    stop = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    punc = [u'\u201c',u'\u201d',u'\u2018',u'\u2019',u'\u2024',u'\u2025',u'\u2026',u'\u2027']
    # One translation table replaces the per-character string concatenation.
    table = str.maketrans({c: " " for c in string.punctuation + "".join(punc)})
    data = pd.read_csv('data/data.csv', index_col=0)
    titles = []
    texts = []
    for i, row in data.iterrows():
        titles.append(_stem_tokens(str(row["title"]), table, stop, stemmer))
        texts.append(_stem_tokens(str(row["text"]), table, stop, stemmer))
        if i % 5000 == 0:
            print(i)
    d = {"title": titles, "text": texts, "label": data["label"]}
    data_cleaned = pd.DataFrame(data=d)
    data_cleaned.to_csv("data/data_token.csv")


In [561]:
# Tokenise only when the cached result is missing; delete
# data/data_token.csv to force a rerun.
if not os.path.exists("data/data_token.csv"):
    tokenize()

## Bag of Words


In [562]:
def make_bow(data_path):
    """Build per-document bag-of-words counts from a tokenised CSV.

    The ``title`` and ``text`` columns are expected to hold stringified
    token lists. Each cell is split on commas and every piece is stripped
    of quotes, spaces and square brackets to recover the tokens.

    Returns a list of three lists, each with one ``{token: count}`` dict per
    row: title counts, text counts, and the combined title + text counts.
    """
    strip_markup = str.maketrans("", "", "' []")

    def count_into(words, *counters):
        # Add one occurrence of every word to each of the given counters.
        for word in words:
            for counter in counters:
                counter[word] = counter.get(word, 0) + 1

    frame = pd.read_csv(data_path, index_col=0)
    per_title, per_text, per_both = [], [], []
    for _, row in frame.iterrows():
        title_words = [piece.translate(strip_markup) for piece in row["title"].split(",")]
        text_words = [piece.translate(strip_markup) for piece in row["text"].split(",")]
        title_counts, text_counts, both_counts = {}, {}, {}
        count_into(title_words, title_counts, both_counts)
        count_into(text_words, text_counts, both_counts)
        per_text.append(text_counts)
        per_title.append(title_counts)
        per_both.append(both_counts)
    return [per_title, per_text, per_both]

In [563]:
# bow = [bow_title[],bow_text[],bow_both[]]
# bow = make_bow('data_tokenized/data_token.csv') # was uncommented

## TFIDF with Cosine

In [564]:
def tf(bow_):
    """Return max-normalised term frequencies for every document.

    ``bow_`` is a list of ``{term: count}`` dicts. Each count is divided by
    the largest count of its document; an empty document yields an empty
    dict.
    """
    normalised = []
    for counts in bow_:
        # The floor of 0 mirrors a running maximum that starts at zero.
        peak = max([0, *counts.values()])
        normalised.append({term: count / peak for term, count in counts.items()})
    return normalised

def idf(bow_):
    """Return the inverse document frequency of every term in ``bow_``.

    ``bow_`` is a list of ``{term: count}`` dicts, one per document. The
    result maps each term to ``log10(N / df)`` where ``N`` is the number of
    documents in ``bow_`` and ``df`` the number of documents containing the
    term. An empty corpus yields an empty dict.
    """
    df_ = {}
    for dic in bow_:
        for word in dic:
            df_[word] = df_.get(word, 0) + 1
    # The corpus size must come from the argument. Reading a module-level
    # ``bow`` either fails with NameError or uses an unrelated length.
    n_docs = len(bow_)
    return {word: math.log10(n_docs / count) for word, count in df_.items()}

def tf_idf(bow_):
    """Return one ``{term: tf * idf}`` dict per document of ``bow_``.

    Term frequencies come from ``tf`` and inverse document frequencies from
    ``idf``, both computed over the same corpus ``bow_``.
    """
    term_freqs = tf(bow_)
    inverse_doc_freqs = idf(bow_)
    return [
        {term: freq * inverse_doc_freqs[term] for term, freq in doc.items()}
        for doc in term_freqs
    ]

def cosineSim(dic_a, dic_b):
    """Return the cosine similarity of two sparse ``{term: weight}`` vectors.

    Terms missing from a vector count as weight 0. Neither argument is
    modified: padding the inputs with explicit zeros made a dict that is
    reused across many calls grow with every comparison. If either vector
    has zero length the similarity is defined as ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    norm_a = math.sqrt(sum(weight * weight for weight in dic_a.values()))
    norm_b = math.sqrt(sum(weight * weight for weight in dic_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # Only terms present in both vectors contribute to the dot product.
    dot = sum(weight * dic_b[term] for term, weight in dic_a.items() if term in dic_b)
    return dot / (norm_a * norm_b)

def tfidf_cosine_ranking(word_, bow_):
    """Rank the documents of ``bow_`` by TF-IDF cosine similarity to a term.

    ``word_`` is a single query term (in the same normalised form as the
    corpus tokens) and ``bow_`` a list of ``{term: count}`` dicts. Returns a
    DataFrame with the columns ``article`` (position of the document in
    ``bow_``) and ``value`` (cosine similarity), sorted by descending value.
    """
    tfidf_all = tf_idf(bow_)
    # Cosine similarity is invariant to positive scaling, so a single-term
    # query can carry unit weight. Running the query through tf_idf as its
    # own one-document corpus gives it no meaningful idf.
    query = {word_: 1.0}
    scores = []
    for doc_vector in tfidf_all:
        try:
            # Pass a copy so the query can never accumulate foreign terms.
            scores.append(cosineSim(doc_vector, dict(query)))
        except ZeroDivisionError:
            # A document whose weights are all zero matches nothing.
            scores.append(0.0)
    ranking = pd.DataFrame({'article': list(range(len(scores))), 'value': scores})
    return ranking.sort_values(by=['value'], ascending=False)



In [565]:
#cos_rank = tfidf_cosine_ranking('obama',bow[2])
#print(cos_rank.head(5))

## bm25

In [566]:
def bm25_ranking(query_,index_):
    """Rank all articles against ``query_`` with Okapi BM25.

    ``query_`` is split on single spaces into query terms. The terms are
    not lower-cased or stemmed here, so they should already be in the same
    form as the stored tokens. ``index_`` selects the indexed field:
    ``0`` = titles, ``1`` = texts, anything else = title and text combined.

    Returns a DataFrame with the columns ``article`` (row position in the
    token file) and ``value`` (BM25 score), sorted by descending score.
    """
    # Same location the tokenisation step writes to.
    data = pd.read_csv('data/data_token.csv', index_col=0)

    # BM25Okapi expects one token list per document. A single flat list of
    # words makes every word a "document" that is scored character by
    # character. The cells hold stringified token lists, parsed here the
    # same way as elsewhere in this notebook.
    if index_ == 0:
        corpus = [ast.literal_eval(doc) for doc in data["title"]]
    elif index_ == 1:
        corpus = [ast.literal_eval(doc) for doc in data["text"]]
    else:
        corpus = [
            ast.literal_eval(title) + ast.literal_eval(text)
            for title, text in zip(data["title"], data["text"])
        ]

    print("Starting bm25")
    bm25 = BM25Okapi(corpus)
    bm25_scores = bm25.get_scores(query_.split(" "))

    article_index = list(range(len(bm25_scores)))
    return pd.DataFrame({'article': article_index ,'value': bm25_scores }).sort_values(by=['value'], ascending=False)


In [567]:
# bm25_rank = bm25_ranking('sunday',bow[2]) # was uncommented
# print(bm25_rank.head(5)) # was uncommented

## Creating and Training Word2Vec Model

In [568]:
# Set to False to force retraining even if a saved model exists.
load_model_from_disc = True
w2v_model = None
# Tokenised corpus; the title/text cells are stringified token lists.
data = pd.read_csv('data/data_token.csv', index_col=0)
if load_model_from_disc:
    try:
        w2v_model = Word2Vec.load("word2vec.model")
    except Exception as load_error:
        # Loading is best effort: fall through to training, but report why.
        # (A bare ``except`` would also swallow KeyboardInterrupt.)
        print("Loading word2vec.model failed: {}".format(load_error))

if w2v_model is None:
    if load_model_from_disc:
        print("Could not load model from disc. Training model...")
    else:
        print("Loading from disc deactivated. Training model...")

    class MySentences(object):
        """Re-iterable stream of token lists: all texts, then all titles."""

        def __init__(self, data):
            self.data = data

        def __iter__(self):
            # Iterate the frame this object was built with, not the global.
            for doc in pd.concat([self.data["text"], self.data["title"]]):
                yield ast.literal_eval(doc)

    sentences = MySentences(data)

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         # Keep one core free, but never request 0 workers.
                         workers=max(1, cores - 1))

    w2v_model.build_vocab(sentences, progress_per=10000)
    t = time.time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=3, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))
    w2v_model.save("word2vec.model")
else:
    print("Model loaded from disc.")

Model loaded from disc.


In [569]:
# Similarity of the embeddings of two (stemmed) tokens that are expected to
# be unrelated.
w2v_model.wv.similarity("amazon", 'nazi')

-0.11904364

In [570]:
# Similarity of the embeddings of two tokens that are expected to be related.
w2v_model.wv.similarity("obama", 'trump')

0.60530704

In [571]:
# Find the token that fits least with the others in the list.
w2v_model.wv.doesnt_match(['amazon', 'obama', 'trump'])

'amazon'

In [572]:
# Analogy query: which word is to "obama" as "georg" is to "bush"?
# Tokens are stemmed, hence "georg" instead of "george".
w2v_model.wv.most_similar(positive=["obama", "georg"], negative=["bush"], topn=3)

[('barack', 0.607993483543396),
 ('presid', 0.4728606641292572),
 ('behest', 0.4636874198913574)]

In [573]:
# Tokens whose embeddings are most similar to "obama".
w2v_model.wv.most_similar(positive=["obama"])

[('barack', 0.832769513130188),
 ('administr', 0.6584988832473755),
 ('presid', 0.6397863626480103),
 ('predecessor', 0.625878632068634),
 ('trump', 0.6053071022033691),
 ('bush', 0.557529628276825),
 ('outgo', 0.5535241961479187),
 ('undo', 0.5471165180206299),
 ('holdov', 0.5300476551055908),
 ('clinton', 0.5224902629852295)]

In [574]:
# Tokens whose embeddings are most similar to "presid" (the stem of
# "president").
w2v_model.wv.most_similar(positive=["presid"])

[('barack', 0.7051833868026733),
 ('45th', 0.6659713387489319),
 ('successor', 0.6399291157722473),
 ('obama', 0.6397863626480103),
 ('administr', 0.6288126111030579),
 ('trump', 0.6117547750473022),
 ('donald', 0.6109979748725891),
 ('predecessor', 0.6096833348274231),
 ('pres', 0.594482958316803),
 ('presidenti', 0.5868942737579346)]

## Creating Doc2Vec
Each document is represented by the average of the word2vec vectors of its words.

In [575]:
# creates w2v representation for all documents and titles
# builds averaged word2vec representations for every title and text
def doc2vec():
    """Embed every title and text as the mean of its word vectors.

    Iterates over the module-level ``data`` frame, parses the stringified
    token lists, and averages the ``w2v_model`` vectors of all tokens that
    are in the model's vocabulary (a document without any known token stays
    the zero vector). The result, with the columns ``title``, ``text``
    (vectors as lists) and ``label``, is pickled to ``data/data_w2v.pkl``
    and its first 100 rows are displayed.
    """

    def mean_embedding(tokens):
        # Sum the vectors of known tokens, then divide by how many were found.
        total = np.zeros(shape=w2v_model.vector_size)
        hits = 0
        for token in tokens:
            try:
                total += w2v_model.wv[token]
            except KeyError:
                # token is not part of the trained vocabulary
                continue
            hits += 1
        if hits > 0:
            total /= hits
        return total.tolist()

    title_vectors = []
    text_vectors = []
    started = time.time()
    for i, row in data.iterrows():
        title_tokens = ast.literal_eval(row["title"])
        text_tokens = ast.literal_eval(row["text"])
        title_vectors.append(mean_embedding(title_tokens))
        text_vectors.append(mean_embedding(text_tokens))
        if i % 5000 == 0:
            print("[{}/{}] - {:.1f}s".format(i, len(data.index), time.time() - started))
    finished = time.time()
    print("creating doc2vec took {:.1f}s".format(finished - started))
    columns = {"title": title_vectors, "text": text_vectors, "label": data["label"]}
    data_w2v = pd.DataFrame(data=columns)
    data_w2v.to_pickle("data/data_w2v.pkl")
    display(data_w2v[:100])

In [576]:
# Build the document vectors only when the cached pickle is missing; delete
# data/data_w2v.pkl to force a rerun.
if not os.path.exists("data/data_w2v.pkl"):
    doc2vec()

## Train-Test-split and Dataloader Creation

In [577]:
def collate_batch(batch):
    """Stack a list of ``(vector, label)`` samples into two tensors.

    Returns ``(features, labels)`` where ``features`` holds the vectors in
    batch order and ``labels`` the matching labels.
    """
    features = [sample_vector for sample_vector, _ in batch]
    targets = [sample_label for _, sample_label in batch]
    return torch.tensor(features), torch.tensor(targets)

In [578]:
batch_size = 100
# Keyword arguments shared by every DataLoader below.
params = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_batch,
    drop_last=True,
)


# NOTE: read_pickle executes pickled code; only load files produced locally.
data_d2v = pd.read_pickle("data/data_w2v.pkl")
titles = []
texts = []

# Separate (feature, label) frames for the title and the text embeddings.
data_d2v_title = data_d2v[["title", "label"]].copy()
data_d2v_text = data_d2v[["text", "label"]].copy()

# Both splits use the same seed and test fraction (15 %).
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    data_d2v_title["title"],
    data_d2v_title["label"],
    test_size=0.15,
    random_state=42,
    shuffle=True,
)
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    data_d2v_text["text"],
    data_d2v_text["label"],
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

# Renumber every split from zero so samples can be addressed 0..len-1.
for _series in (
    X_train_title, X_test_title, y_train_title, y_test_title,
    X_train_text, X_test_text, y_train_text, y_test_text,
):
    _series.reset_index(drop=True, inplace=True)
del _series


class data_set(Dataset):
    """Map-style dataset over a pandas Series of features and one of labels.

    ``X`` and ``y`` must have the same length. Item ``i`` is the pair of the
    i-th feature entry and the i-th label, addressed by position, so the
    Series do not need a fresh 0..n-1 index.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length.
    """

    def __init__(self, X, y):
        # ``super(Dataset, self)`` would start the MRO lookup *after*
        # Dataset and skip its initialiser.
        super().__init__()
        if len(X.index) != len(y.index):
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(
                    len(X.index), len(y.index)))
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X.index)

    def __getitem__(self, index):
        # Positional access: ``Series[index]`` is label based and breaks (or
        # returns the wrong row) when the index was not reset.
        return self.X.iloc[index], self.y.iloc[index]

# Wrap each split in a dataset ...
train_dataset_title = data_set(X_train_title, y_train_title)
test_dataset_title = data_set(X_test_title, y_test_title)
train_dataset_text = data_set(X_train_text, y_train_text)
test_dataset_text = data_set(X_test_text, y_test_text)

# ... and each dataset in a DataLoader configured by the shared ``params``.
train_dataloader_title = DataLoader(train_dataset_title, **params)
test_dataloader_title = DataLoader(test_dataset_title, **params)
train_dataloader_text = DataLoader(train_dataset_text, **params)
test_dataloader_text = DataLoader(test_dataset_text, **params)

# Smoke test: fetch a single batch to check that loading and collation work,
# then stop. Leaves ``batch``, ``X`` and ``y`` bound to that first batch.
for batch, (X, y) in enumerate(train_dataloader_title):
    # print(X)
    # print(X.shape)
    # print(y.shape)
    break

## Helper Functions

In [579]:
def add_metrics_to_log(log, metrics, y_true, y_pred, prefix=''):
    """Evaluate every metric and store the result in ``log`` in place.

    Each callable in ``metrics`` is called as ``metric(y_true, y_pred)`` and
    its result is stored under ``prefix + metric.__name__``. Returns None.
    """
    for scorer in metrics:
        value = scorer(y_true, y_pred)
        key = prefix + scorer.__name__
        log[key] = value

def log_to_message(log, precision=4):
    """Render ``log`` as ``"key: value"`` pairs separated by four spaces.

    Every value is formatted as a fixed-point number with ``precision``
    decimal places.
    """
    parts = [f"{name}: {value:.{precision}f}" for name, value in log.items()]
    return "    ".join(parts)

class ProgressBar(object):
    """Cheers @ajratner

    Minimal text progress bar that redraws one line on ``sys.stdout``.

    ``n`` is the number of steps and ``length`` the width of the bar in
    characters. ``n`` is rounded up to a whole number and clamped to at
    least 1, so a step count of zero or a fractional count (for example
    ``samples / batch_size``) neither divides by zero nor misses the final
    tick.
    """

    def __init__(self, n, length=40):
        # Protect against division by zero and against fractional counts:
        # every derived value below uses the normalised ``self.n``.
        self.n      = max(1, int(np.ceil(n)))
        self.nf     = float(self.n)
        self.length = length
        # Precalculate the i values that should trigger a write operation
        self.ticks = set([round(i/100.0 * self.n) for i in range(101)])
        self.ticks.add(self.n-1)
        self.bar(0)

    def bar(self, i, message=""):
        """Assumes i ranges through [0, n-1]"""
        if i in self.ticks:
            b = int(np.ceil(((i+1) / self.nf) * self.length))
            sys.stdout.write("\r[{0}{1}] {2}%\t{3}".format(
                "="*b, " "*(self.length-b), int(100*((i+1) / self.nf)), message
            ))
            sys.stdout.flush()

    def close(self, message=""):
        # Move the bar to 100% before closing
        self.bar(self.n-1)
        sys.stdout.write("\n{0}\n\n".format(message))
        sys.stdout.flush()

## Training Loop

In [580]:
epochs = 100


def train(dataloader, model, loss_fn, optimizer, threshold=0.5):
    """Train ``model`` for one pass over ``dataloader``.

    The model is expected to output one probability per sample (e.g. a
    sigmoid output) for a binary label; ``loss_fn`` receives the squeezed
    predictions and the labels cast to float.

    Args:
        dataloader: yields ``(features, labels)`` batches.
        model: module to train; inputs are moved to the global ``device``.
        loss_fn: loss taking ``(predictions, float_labels)``.
        optimizer: optimizer over the model's parameters.
        threshold: probability from which a prediction counts as class 1,
            used for both accuracy and F1.

    Returns:
        ``(total_loss, mean_batch_f1, accuracy, seconds)``, all floats.
        ``total_loss`` is the sum of the batch losses and ``accuracy`` the
        share of correctly classified samples among those seen.
    """
    start_time = time.time()
    num_batches = len(dataloader)
    total_loss = 0.0
    f1_sum = 0.0
    correct = 0
    seen = 0
    f1_metric = BinaryF1Score(threshold=threshold).to(device)
    # The loader knows its exact number of batches; samples / batch_size is
    # fractional and overestimates when the last batch is dropped.
    pb = ProgressBar(max(1, num_batches))
    log = OrderedDict()
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction error
        X = X.to(device)
        y = y.to(device)
        pred = model(X).squeeze()
        loss = loss_fn(pred, y.float())
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping only: keep it out of the autograd graph.
        with torch.no_grad():
            probs = pred.detach()
            total_loss += loss.item()
            f1_sum += f1_metric(probs, y).item()
            # Threshold the probabilities. Truncating them with ``.int()``
            # maps every value below 1.0 to class 0.
            correct += ((probs >= threshold).long() == y).sum().item()
            seen += y.numel()

        # Running means over the batches / samples processed so far.
        log['loss'] = total_loss / (batch + 1)
        log['f1'] = f1_sum / (batch + 1)
        log['accuracy'] = correct / max(1, seen)
        log['time'] = time.time() - start_time
        pb.bar(batch, log_to_message(log))
    pb.close(log_to_message(log))
    return total_loss, f1_sum / max(1, num_batches), correct / max(1, seen), time.time() - start_time

## Simple Model for testing of training loop

In [581]:
class SimpleLinearModel(nn.Module):
    """Four-layer fully connected binary classifier.

    Maps an ``input_dim``-dimensional vector through hidden layers of 128,
    64 and 32 units (ReLU) to a single sigmoid output.

    Args:
        input_dim: size of the input vectors. Defaults to 100, the width
            this notebook used before the size became configurable.
    """

    def __init__(self, input_dim=100):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, vec):
        """Return one value in (0, 1) per input vector."""
        x = self.relu(self.fc1(vec))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return torch.sigmoid(self.fc4(x))

# Smoke test of the training loop: one pass over the title embeddings.
model = SimpleLinearModel().to(device)

# The model ends in a sigmoid, so binary cross-entropy on probabilities fits.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

train(train_dataloader_title, model, loss_fn, optimizer)

## Next steps:

Save tfidf in csv file @Freddy
Use tfidf as input for Dataloader creation (in comparison to w2v as input) @Freddy
Design new models, e.g.: using convolution/decision-trees #TODO @everybody who wants to
Presentation/Visualization of data and results @everybody who wants to

In [582]:
class SimpleConvolutionModel(nn.Module):
    """Toy binary classifier: linear layer, 2-D convolution, linear head.

    Expects a batch of 100-dimensional vectors of shape ``(batch, 100)`` and
    returns one sigmoid output per sample, shape ``(batch, 1)``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 100)
        self.conv = nn.Conv2d(in_channels=100, out_channels=100, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=1, stride=1)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, x):
        x = self.fc1(x)
        # Feed the 100 features as channels of a 1x1 "image":
        # (batch, 100) -> (batch, 100, 1, 1). Inserting the new axis at
        # position 1 gave (batch, 1, 100), which Conv2d reads as an
        # unbatched input with the batch axis as channels. That only runs
        # for a batch of exactly 100 and mixes samples with each other.
        x = x.unsqueeze(-1).unsqueeze(-1)
        x = self.conv(x)
        x = self.relu(x)
        x = self.maxpool(x)
        # (batch, 100, 1, 1) -> (batch, 100); unlike squeeze() this keeps
        # the batch axis even for a batch of one.
        x = x.flatten(start_dim=1)
        x = torch.sigmoid(self.fc2(x))
        return x


# The training loop moves every batch to ``device``, so the model has to
# live there as well.
model = SimpleConvolutionModel().to(device)

# The model outputs a single sigmoid probability per sample, which calls for
# binary cross-entropy; CrossEntropyLoss expects per-class logits.
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
train(train_dataloader_title, model, loss_fn, optimizer)

## Support Vector Machine

Using a support vector machine (SVM) to label between fake and real news.
First we use the article text as input, afterwards only the titles.

In [583]:
def svg(X_train_data, y_train_data, X_test_data, y_test_data):
    """Fit a linear-kernel SVC and score it on the test split.

    The feature arguments must offer ``tolist()`` (e.g. pandas Series of
    vectors). Returns ``(accuracy, f1, precision, recall)`` computed on the
    test data.
    """
    train_features = X_train_data.tolist()
    test_features = X_test_data.tolist()

    classifier = SVC(kernel='linear')
    print("fitting...")
    classifier.fit(train_features, y_train_data)
    print("predicting...")
    predictions = classifier.predict(test_features)

    scorers = (accuracy_score, f1_score, precision_score, recall_score)
    return tuple(scorer(y_test_data, predictions) for scorer in scorers)


In [584]:
# SVM on the averaged text embeddings; scores are reported as percentages.
accuracy_text, f1_score_text, precision_text, recall_text = svg(X_train_text, y_train_text, X_test_text, y_test_text)
print("Accuracy for text: {:.2f}%\nF1 score for text: {:.2f}%\nPrecision score for text: {:.2f}%\nRecall score for text: {:.2f}%".format(accuracy_text*100, f1_score_text*100, precision_text*100, recall_text*100))


fitting...


KeyboardInterrupt: 

In [None]:
# SVM on the averaged title embeddings; scores are reported as percentages.
accuracy_title, f1_score_title, precision_title, recall_title = svg(X_train_title, y_train_title, X_test_title, y_test_title)
print("Accuracy for title: {:.2f}%\nF1 score for title: {:.2f}%\nPrecision score for title: {:.2f}%\nRecall score for title: {:.2f}%".format(accuracy_title*100, f1_score_title*100, precision_title*100, recall_title*100))


## Random Forest Algorithm

Using the random forest algorithm (RF) to label between fake and real news.
First we use the article text as input, afterwards only the titles.

In [None]:
def random_forest(X_train, X_test, y_train, y_test, name, crit='gini'):
    """Fit a 1000-tree random forest and print its test scores.

    Args:
        X_train, X_test: features offering ``tolist()``.
        y_train, y_test: labels offering ``tolist()``.
        name: label for the input variant, used only in the printed report.
        crit: split criterion passed to ``RandomForestClassifier``.

    Prints accuracy, F1, precision and recall on the test data as
    percentages. Returns None.
    """
    rf = RandomForestClassifier(n_estimators = 1000, random_state = 42, verbose=0, n_jobs=-1, criterion=crit)
    rf.fit(X_train.tolist(), y_train.tolist())
    y_pred = rf.predict(X_test.tolist())
    # Convert the true labels once instead of once per metric.
    y_true = y_test.tolist()
    # One shared template keeps the four report lines consistent; the
    # hand-written variants had a missing space and a missing "for".
    report = "{} for random forest with {} criterion for {}: {:.2f}%"
    print(report.format("accuracy", crit, name, accuracy_score(y_true, y_pred)*100))
    print(report.format("f1 score", crit, name, f1_score(y_true, y_pred)*100))
    print(report.format("precision score", crit, name, precision_score(y_true, y_pred)*100))
    print(report.format("recall score", crit, name, recall_score(y_true, y_pred)*100))


# Evaluate the random forest on the text embeddings, then on the title
# embeddings (default "gini" criterion).
random_forest(X_train_text, X_test_text, y_train_text, y_test_text, "w2v text")
random_forest(X_train_title, X_test_title, y_train_title, y_test_title, "w2v title")