# AIR Project

## Imports and specific settings

In [144]:
import ast
import string

import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Word2Vec
import multiprocessing
import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [145]:
# Lift every pandas truncation limit so DataFrames are rendered in full.
for _display_option in ('display.max_rows',
                        'display.max_columns',
                        'display.width',
                        'display.max_colwidth'):
    pd.set_option(_display_option, None)

# Use the first CUDA device when torch reports one, otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Enable cuDNN benchmark mode for the whole session.
torch.backends.cudnn.benchmark = True
print("running models with {}".format(device))

running models with cuda:0


## Loading of dataset and nan-removal

In [146]:
def pre_process(source_path='WELFake_Dataset.csv', output_path="data/data.csv", min_length=10):
    """Drop rows with a missing or too-short title/text and save the cleaned dataset.

    A "title" or "text" value whose string form has ``min_length`` characters or
    fewer is treated as missing (this includes NaN, whose string form is "nan").
    Every row with any missing value is then dropped, the index is renumbered
    from 0, and the result is written to ``output_path``.

    Parameters
    ----------
    source_path : str
        CSV file to read; its first column is used as the index.
    output_path : str
        CSV file to write. Its parent directory is created when missing.
    min_length : int
        Longest string length that is still considered "too short".

    Returns
    -------
    None
        The shapes before/after cleaning are printed and the first 300 cleaned
        rows are shown with ``display`` (expected to be provided by the
        IPython/Jupyter kernel).
    """
    import os

    data = pd.read_csv(source_path, index_col=0)
    print(data.shape)

    # One mask per column instead of a row-by-row iterrows()/.loc write loop.
    for column in ("text", "title"):
        too_short = data[column].map(lambda value: len(str(value)) <= min_length).astype(bool)
        data.loc[too_short, column] = np.nan

    data = data.dropna().reset_index(drop=True)
    print(data.shape)

    # to_csv does not create missing directories, so make sure the target exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    data.to_csv(output_path)
    display(data[:300])

## Tokenization

In [147]:
def tokenize():
    """Tokenize, stop-word-filter and stem every title and text of the cleaned dataset.

    Reads ``data/data.csv`` and, for each row, turns "title" and "text" into a
    list of stemmed lower-case tokens:

    1. every character in ``string.punctuation`` or in the extra typographic
       punctuation list is replaced by a space,
    2. the result is split with ``nltk.tokenize.word_tokenize``,
    3. tokens are lower-cased and English stop words are dropped,
    4. the remaining tokens are stemmed with the English Snowball stemmer.

    The token lists and the original "label" column are written to
    ``data/data_token.csv``. The row index is printed every 5000 rows as a
    progress indicator.

    NOTE(review): the Word2Vec cell reads 'data_tokenized/data_token.csv',
    not the path written here — confirm how the file gets there.
    """
    # A set gives constant-time membership tests for every token.
    stop = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    # Typographic quotes, dot leaders / ellipsis and the hyphenation point,
    # none of which are part of string.punctuation.
    punc = [u'\u201c', u'\u201d', u'\u2018', u'\u2019', u'\u2024', u'\u2025', u'\u2026', u'\u2027']
    # Translation table that maps every punctuation character to a space.
    punctuation_to_space = str.maketrans({c: " " for c in string.punctuation + "".join(punc)})

    def _stem_tokens(raw):
        """Return the stemmed, stop-word-free lower-case tokens of one document."""
        cleaned = str(raw).translate(punctuation_to_space)
        lowered = (w.lower() for w in nltk.tokenize.word_tokenize(cleaned))
        return [stemmer.stem(w) for w in lowered if w not in stop]

    data = pd.read_csv('data/data.csv', index_col=0)
    titles = list()
    texts = list()
    for i, row in data.iterrows():
        titles.append(_stem_tokens(row["title"]))
        texts.append(_stem_tokens(row["text"]))
        if i % 5000 == 0:
            print(i)

    d = {"title": titles, "text": texts, "label": data["label"]}
    data_cleaned = pd.DataFrame(data=d)
    data_cleaned.to_csv("data/data_token.csv")
# tokenize()

## Creating and Training Word2Vec Model

In [148]:
# Set to False to force retraining even when a saved model exists.
load_model_from_disc = True
w2v_model = None
# NOTE(review): tokenize() writes "data/data_token.csv", while this cell reads
# 'data_tokenized/data_token.csv' — confirm the file is moved between the two steps.
data = pd.read_csv('data_tokenized/data_token.csv', index_col=0)

if load_model_from_disc:
    try:
        w2v_model = Word2Vec.load("word2vec.model")
    except Exception as load_error:
        # Best effort: any loading problem falls through to training below.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit
        # propagate and reports why loading failed.
        print("Loading word2vec.model failed: {!r}".format(load_error))

if w2v_model is None or not load_model_from_disc:
    if load_model_from_disc:
        print("Could not load model from disc. Training model...")
    else:
        print("Loading from disc deactivated. Training model...")

    class MySentences(object):
        """Iterable over the token lists of all texts followed by all titles.

        The "text" and "title" columns hold token lists serialized as strings,
        so each entry is parsed with ast.literal_eval when it is yielded.
        build_vocab() and train() below each iterate over the object; every
        __iter__ call starts a fresh pass over the data.
        """

        def __init__(self, data):
            self.data = data

        def __iter__(self):
            # Use the frame handed to the constructor, not the notebook global.
            for doc in pd.concat([self.data["text"], self.data["title"]]):
                yield ast.literal_eval(doc)

    sentences = MySentences(data)

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         # Keep one core free, but never request zero workers.
                         workers=max(1, cores - 1))

    w2v_model.build_vocab(sentences, progress_per=10000)
    t = time.time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=3, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))
    w2v_model.save("word2vec.model")
else:
    print("Model loaded from disc.")

Model loaded from disc.


In [149]:
# Sanity check: similarity score between the embeddings of two tokens that
# should be unrelated. Queries use stemmed, lower-case tokens, as produced by tokenize().
w2v_model.wv.similarity("amazon", 'nazi')

-0.11904364

In [150]:
# Sanity check: similarity score between the embeddings of two tokens that
# should be related (both are names of politicians).
w2v_model.wv.similarity("obama", 'trump')

0.60530704

In [151]:
# Ask the model which of the three tokens does not fit with the other two.
w2v_model.wv.doesnt_match(['amazon', 'obama', 'trump'])

'amazon'

In [152]:
# Analogy query: which word is to "obama" as "georg" is to "bush"?
# ("georg" is a stemmed token; the three best candidates are returned.)
w2v_model.wv.most_similar(positive=["obama", "georg"], negative=["bush"], topn=3)

[('barack', 0.607993483543396),
 ('presid', 0.4728606641292572),
 ('behest', 0.4636874198913574)]

In [153]:
# Tokens whose embeddings are most similar to "obama" (default number of results).
w2v_model.wv.most_similar(positive=["obama"])

[('barack', 0.832769513130188),
 ('administr', 0.6584988832473755),
 ('presid', 0.6397863626480103),
 ('predecessor', 0.625878632068634),
 ('trump', 0.6053071022033691),
 ('bush', 0.557529628276825),
 ('outgo', 0.5535241961479187),
 ('undo', 0.5471165180206299),
 ('holdov', 0.5300476551055908),
 ('clinton', 0.5224902629852295)]

In [154]:
# Tokens whose embeddings are most similar to "presid" (a stemmed token).
w2v_model.wv.most_similar(positive=["presid"])

[('barack', 0.7051833868026733),
 ('45th', 0.6659713387489319),
 ('successor', 0.6399291157722473),
 ('obama', 0.6397863626480103),
 ('administr', 0.6288126111030579),
 ('trump', 0.6117547750473022),
 ('donald', 0.6109979748725891),
 ('predecessor', 0.6096833348274231),
 ('pres', 0.594482958316803),
 ('presidenti', 0.5868942737579346)]

## Creating Doc2Vec
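Each title and each text is represented by the average of the Word2Vec vectors of its in-vocabulary tokens (an averaged Word2Vec embedding, not gensim's `Doc2Vec` model).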

In [155]:
# creates w2v representation for all documents and titles
def doc2vec():
    """Embed every title and text as the mean of its known Word2Vec vectors.

    Works on the notebook-level ``data`` frame (token lists serialized as
    strings) and the notebook-level ``w2v_model``. Tokens missing from the
    model vocabulary are skipped; a document without any known token keeps
    the all-zero vector. The resulting vectors (as Python lists) plus the
    "label" column are written to ``data/data_w2v.csv`` and the first 100
    rows are displayed. Progress is printed every 5000 rows.
    """
    def _mean_embedding(serialized_tokens):
        """Average the vectors of all in-vocabulary tokens of one document."""
        tokens = ast.literal_eval(serialized_tokens)
        total = np.zeros(shape=w2v_model.vector_size)
        known = 0
        for token in tokens:
            try:
                total += w2v_model.wv[token]
            except KeyError:
                # Token is not in the model vocabulary.
                continue
            known += 1
        if known > 0:
            total /= known
        return total.tolist()

    start = time.time()
    titles = []
    texts = []
    for i, row in data.iterrows():
        titles.append(_mean_embedding(row["title"]))
        texts.append(_mean_embedding(row["text"]))
        if i % 5000 == 0:
            print("[{}/{}] - {:.1f}s".format(i, len(data.index), time.time() - start))
    end = time.time()
    print("creating doc2vec took {:.1f}s".format(end - start))

    data_w2v = pd.DataFrame(data={"title": titles, "text": texts, "label": data["label"]})
    data_w2v.to_csv("data/data_w2v.csv")
    display(data_w2v[:100])
# doc2vec()

## Train-Test-split and Dataloader Creation

In [156]:
# Keyword arguments shared by the DataLoaders, and the number of training epochs.
params = dict(batch_size=10, shuffle=True, num_workers=0)
max_epochs = 100

# Averaged Word2Vec vectors written by doc2vec(). The vectors stay serialized
# as strings here and are parsed per sample in data_set.__getitem__.
data_d2v = pd.read_csv("data/data_w2v.csv", index_col=0)
titles = []
texts = []

# Separate (features, label) frames for the title-based and the text-based model.
data_d2v_title = data_d2v[["title", "label"]].copy()
data_d2v_text = data_d2v[["text", "label"]].copy()

# 85/15 split with a fixed seed; each part is renumbered from 0 afterwards.
X_train_title, X_test_title, y_train_title, y_test_title = (
    part.reset_index(drop=True)
    for part in train_test_split(
        data_d2v_title["title"],
        data_d2v_title["label"],
        test_size=0.15,
        random_state=42,
        shuffle=True,
    )
)
X_train_text, X_test_text, y_train_text, y_test_text = (
    part.reset_index(drop=True)
    for part in train_test_split(
        data_d2v_text["text"],
        data_d2v_text["label"],
        test_size=0.15,
        random_state=42,
        shuffle=True,
    )
)


class data_set(Dataset):
    """Map-style dataset over a Series of feature vectors and a Series of labels.

    Parameters
    ----------
    X : pandas.Series
        One feature vector per sample, either as a string holding a Python
        list literal (as read back from CSV) or as an already parsed object.
    y : pandas.Series
        One label per sample; must have the same length as ``X``.
    """

    def __init__(self, X, y):
        # Zero-argument super() initializes Dataset itself; the previous
        # super(Dataset, self) call skipped it in the MRO.
        super().__init__()
        assert len(X.index) == len(y.index)
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X.index)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the sample at position ``index``."""
        # Positional access: works whether or not the Series index was reset.
        features = self.X.iloc[index]
        if isinstance(features, str):
            # Vectors loaded from CSV are serialized list literals.
            features = ast.literal_eval(features)
        return features, self.y.iloc[index]

def collate_batch(batch):
    """Stack a list of ``(features, label)`` samples into two tensors.

    Returns ``(features_tensor, labels_tensor)``: the first is built from the
    list of all feature vectors, the second from the list of all labels, both
    in batch order.
    """
    features = [sample for sample, _ in batch]
    targets = [target for _, target in batch]
    return torch.tensor(features), torch.tensor(targets)

# One dataset per (feature kind, split) combination.
train_dataset_title = data_set(X_train_title, y_train_title)
test_dataset_title = data_set(X_test_title, y_test_title)
train_dataset_text = data_set(X_train_text, y_train_text)
test_dataset_text = data_set(X_test_text, y_test_text)

# Every loader must use collate_batch. Samples are (list of floats, label)
# pairs, and the default collate function would turn the nested lists into a
# list of per-feature tensors instead of one (batch, dim) tensor.
train_dataloader_title = DataLoader(train_dataset_title, collate_fn=collate_batch, **params)
test_dataloader_title = DataLoader(test_dataset_title, collate_fn=collate_batch, **params)
train_dataloader_text = DataLoader(train_dataset_text, collate_fn=collate_batch, **params)
test_dataloader_text = DataLoader(test_dataset_text, collate_fn=collate_batch, **params)

# for batch, (X, y) in enumerate(train_dataloader_title):
#     print(X)
#     print(X.shape)
#     print(y.shape)
#     break


bf
10
tensor([[-4.1686e-01,  3.5883e-01,  3.4760e-02, -6.3746e-02,  2.4935e-02,
         -9.5040e-02,  3.5321e-01,  1.0483e-01,  1.9505e-01,  2.8922e-01,
         -3.6285e-02, -3.2419e-01, -7.1020e-01,  1.8390e-01,  8.2401e-03,
         -6.8013e-01, -1.0447e-01,  1.6210e-01,  9.3961e-01, -7.0248e-01,
          2.8800e-01,  4.5711e-01, -2.7507e-01, -4.0530e-02, -1.2603e-01,
          5.4919e-01, -2.1448e-01, -3.0206e-01,  3.0307e-02,  1.1947e-02,
          4.3648e-01, -2.5218e-02, -2.8998e-01, -5.0695e-01,  9.8252e-02,
          2.4983e-01, -1.5599e-01, -9.4060e-01, -2.9541e-01, -1.6763e-01,
         -1.7932e-01,  1.3420e-01, -1.8323e-02,  1.3620e-01,  4.6321e-01,
         -4.4259e-01, -3.7899e-01, -5.4660e-01,  9.8054e-01,  1.1294e-01,
         -1.4967e-01, -3.6480e-03,  8.3331e-01, -3.5666e-02,  1.8833e-01,
         -1.1445e-01,  1.5629e-01,  2.3504e-01, -3.4132e-01, -1.0942e-02,
         -1.4456e-01, -4.8057e-02,  4.3837e-01, -9.8660e-03, -1.3837e-01,
          3.2350e-01, -2.3701e-0