In [33]:
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Word2Vec
import multiprocessing
from time import time
from gensim import  models
from gensim.test.utils import datapath

In [34]:
# Notebook-wide pandas display configuration: every option listed here is set
# to None so that displayed frames are not truncated by the default caps.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

def pre_process(input_path='WELFake_Dataset.csv', output_path='data/data.csv', min_length=10):
    """Drop rows with a missing or too-short title/text and save the result.

    Reads the raw CSV (first column used as index), blanks every ``text`` and
    ``title`` value whose string form has ``min_length`` characters or fewer,
    drops all rows containing any missing value, renumbers the index from 0
    and writes the cleaned frame to ``output_path``. The shape before and
    after cleaning is printed and the first 300 cleaned rows are displayed.

    Parameters
    ----------
    input_path : str
        CSV file to read. Defaults to the path the notebook originally used.
    output_path : str
        CSV file to write. Missing parent directories are created.
    min_length : int
        Values whose ``str()`` form is at most this long are treated as
        missing (a NaN cell stringifies to ``"nan"`` and is caught as well
        with the default threshold).
    """
    import os  # local import keeps this cell independent of the imports cell

    data = pd.read_csv(input_path, index_col=0)
    print(data.shape)

    # One boolean mask per column instead of a per-row iterrows()/.loc loop:
    # faster, and a duplicated index label can no longer blank rows that did
    # not themselves fail the length check.
    for column in ("text", "title"):
        too_short = data[column].map(lambda value: len(str(value)) <= min_length).astype(bool)
        data.loc[too_short, column] = np.nan

    data = data.dropna().reset_index(drop=True)
    print(data.shape)

    # to_csv does not create directories, so make sure the target exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    data.to_csv(output_path)

    # `display` is the IPython/Jupyter rich-output helper available in notebook cells.
    display(data[:300])

def _stem_tokens(raw, table, stop, stemmer):
    """Convert one raw string into a list of lower-cased, stop-word-free stems.

    ``table`` is a ``str.translate`` table that maps punctuation to spaces,
    ``stop`` is a container of lower-case stop words and ``stemmer`` is an
    object exposing ``stem(word)``.
    """
    tokens = nltk.tokenize.word_tokenize(raw.translate(table))
    lowered = (token.lower() for token in tokens)
    return [stemmer.stem(token) for token in lowered if token not in stop]


def tokenize(input_path='data/data.csv', output_path='data/data_token.csv'):
    """Tokenize, stop-word-filter and stem the ``title`` and ``text`` columns.

    Reads ``input_path`` (first column used as index). For every row, each
    ASCII punctuation character and each of the extra Unicode quote/dot
    characters listed in ``punc`` is replaced by a space; the result is split
    with NLTK's ``word_tokenize``, lower-cased, stripped of English stop words
    and stemmed with the English Snowball stemmer. The two columns are
    replaced by the resulting token lists and the frame is written to
    ``output_path``; pandas serialises each list as its ``str()`` form.

    The row index is printed whenever it is a multiple of 5000 as a coarse
    progress indicator.

    Parameters
    ----------
    input_path : str
        CSV file to read. Defaults to the path the notebook originally used.
    output_path : str
        CSV file to write. Defaults to the path the notebook originally used.
    """
    # A frozenset gives constant-time membership tests; the list returned by
    # NLTK would be scanned linearly for every single token.
    stop = frozenset(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    punc = [u'\u201c', u'\u201d', u'\u2018', u'\u2019', u'\u2024', u'\u2025', u'\u2026', u'\u2027']
    # One translation table replaces the per-character string concatenation
    # and is shared by both columns.
    table = str.maketrans({char: " " for char in string.punctuation + "".join(punc)})

    data = pd.read_csv(input_path, index_col=0)
    data_cleaned = data.copy()
    titles = []
    texts = []
    for i, title, text in zip(data.index, data["title"], data["text"]):
        titles.append(_stem_tokens(str(title), table, stop, stemmer))
        texts.append(_stem_tokens(str(text), table, stop, stemmer))
        if i % 5000 == 0:
            print(i)
    data_cleaned["title"] = titles
    data_cleaned["text"] = texts
    data_cleaned.to_csv(output_path)

In [35]:
# Set to False to force retraining even if a saved model exists.
load_model_from_disc = True

# Only touch the disc when loading is enabled; otherwise go straight to training.
w2v_model = None
if load_model_from_disc:
    try:
        w2v_model = Word2Vec.load("word2vec.model")
    except Exception as load_error:
        # Deliberate best-effort: any failure to load falls back to training
        # below, but the reason is reported instead of being silently dropped
        # (and KeyboardInterrupt / SystemExit are no longer swallowed).
        print("Loading word2vec.model failed: {!r}".format(load_error))

if w2v_model is None:
    if load_model_from_disc:
        print("Could not load model from disc. Training model...")
    else:
        print("Loading from disc deactivated. Training model...")
    # NOTE(review): tokenize() in this notebook writes "data/data_token.csv",
    # while this cell reads from "data_tokenized/". Confirm which location is
    # intended; the path is left unchanged here.
    data = pd.read_csv('data_tokenized/data_token.csv', index_col=0, dtype=str)

    class MySentences(object):
        """Re-iterable corpus yielding one token list per row of the frame.

        Each cell of the ``text`` column is expected to hold the string form
        of a Python list of tokens (e.g. ``"['foo', 'bar']"``). The string is
        split on commas and the list syntax characters are stripped from
        every piece.
        """

        # Deletes the quote, space and bracket characters left over from the
        # stringified list.
        _LIST_SYNTAX = str.maketrans("", "", "' []")

        def __init__(self, data):
            self.data = data

        def __iter__(self):
            # Iterate the frame handed to the constructor, not the global one.
            for doc in self.data["text"]:  # change to "title" or combine both
                if not isinstance(doc, str):
                    # Missing cells are read as float NaN and cannot be split.
                    continue
                pieces = (word.translate(self._LIST_SYNTAX) for word in doc.split(","))
                # An empty list ("[]") would otherwise yield a bogus '' token.
                yield [piece for piece in pieces if piece]

    sentences = MySentences(data)

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         # Leave one core free, but never ask for zero workers
                         # on a single-core machine.
                         workers=max(1, cores - 1))

    # The corpus is streamed twice: once to build the vocabulary, then to train.
    w2v_model.build_vocab(sentences, progress_per=10000)
    t = time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=3, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    w2v_model.save("word2vec.model")
else:
    print("Model loaded from disc.")


Model loaded from disc.


In [36]:
# Similarity score between the learned word vectors for "amazon" and "nazi"
# (gensim's KeyedVectors.similarity, i.e. cosine similarity).
w2v_model.wv.similarity("amazon", 'nazi')

0.026135037

In [37]:
# Similarity score between the learned word vectors for "obama" and "trump"
# (gensim's KeyedVectors.similarity, i.e. cosine similarity).
w2v_model.wv.similarity("obama", 'trump')

0.61256015

In [38]:
# Find the word in the list that fits least well with the others
# according to the model's word vectors.
w2v_model.wv.doesnt_match(['amazon', 'obama', 'trump'])

'amazon'

In [39]:
# Analogy query: which word is to "obama" as "georg" is to "bush"?
# ("obama" and "georg" contribute positively, "bush" negatively; top 3 hits.)
# "georg" is presumably the stemmed form of "george" produced by tokenize().
w2v_model.wv.most_similar(positive=["obama", "georg"], negative=["bush"], topn=3)

[('barack', 0.6386590600013733),
 ('presid', 0.5025283098220825),
 ('predecessor', 0.45077192783355713)]

In [40]:
# Words whose vectors are most similar to "obama", best match first.
w2v_model.wv.most_similar(positive=["obama"])

[('barack', 0.8123461604118347),
 ('predecessor', 0.6643136143684387),
 ('presid', 0.6526463627815247),
 ('administr', 0.6208781599998474),
 ('trump', 0.6125600934028625),
 ('undo', 0.5900983810424805),
 ('outgo', 0.5714931488037109),
 ('bush', 0.5544796586036682),
 ('rescind', 0.5267906785011292),
 ('holdov', 0.5253826379776001)]

In [41]:
# Words whose vectors are most similar to "presid", best match first.
# ("presid" is presumably the stemmed form of "president" - the corpus is stemmed.)
w2v_model.wv.most_similar(positive=["presid"])

[('barack', 0.6946936845779419),
 ('obama', 0.6526462435722351),
 ('45th', 0.6525393724441528),
 ('successor', 0.6197038888931274),
 ('predecessor', 0.614696741104126),
 ('administr', 0.6129357218742371),
 ('trump', 0.6105669736862183),
 ('outgo', 0.6059844493865967),
 ('donald', 0.5954946279525757),
 ('presidenti', 0.5723046064376831)]