# 1. Process txt data

In [1]:
from gensim.corpora import WikiCorpus
from gensim.corpora.wikicorpus import *
import os
import re
from datetime import datetime

In [2]:
wiki_tokenize_re = re.compile('^([A-Za-z]|\.|\!|\?\,\'){1,}')
wiki_replace_re = re.compile("([^A-Za-z\.\!\,\?]|'s)")
wiki_addspace_re = re.compile('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )')
wiki_s_re = re.compile("'")

def tokenize(content):
    content = content.lower()
    content = wiki_replace_re.sub(' ', content)
    content = wiki_addspace_re.sub(' ', content)
    #conent = wiki_s_re.sub(" '", content)
    # override original method in wikicorpus.py
    return [token for token in content.split()]

def process_article(args):
   # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid


class MyWikiCorpus(WikiCorpus):
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)

    def get_texts(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length
        
tokenize('What is this __thing_?')

['what', 'is', 'this', 'thing', '?']

In [3]:
wiki_corpus = MyWikiCorpus('enwiki-20220201-pages-articles-multistream.xml.bz2', dictionary={})
text_num = 0

if not os.path.isfile('wiki_text.txt') or 0:
    with open('wiki_text.txt', 'w', encoding='utf-8') as f:
        start_time = datetime.now()
        for text in wiki_corpus.get_texts():
            f.write(str(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n'))
            text_num += 1
            if text_num % 1000 == 0:
                use_time = datetime.now() - start_time
                est_time = use_time*5176019/(text_num+1) - use_time
                print('{}/5405081 articles processed. Time Used: {} Time EST.: {}'.format(text_num, use_time.__str__().split(".")[0], est_time.__str__().split(".")[0]), end='\r')

        print('{} articles processed.'.format(text_num))

In [4]:
# os.system('tail -n 1 wiki_text.txt')

# 2. Train word to vector model

In [5]:
from gensim.models import word2vec

In [6]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
if not os.path.isfile('word2vec.model') or 0: 
    sg = 0
    window_size = 10
    vector_size = 100
    min_count = 100
    workers = 16
    epochs = 5
    batch_words = 500000

    train_data = word2vec.LineSentence('wiki_text.txt')
    model =  word2vec.Word2Vec(
        train_data,
        min_count=min_count,
        size=vector_size,
        workers=workers,
        window=window_size,
        iter= epochs,
        sg=sg,
        batch_words=batch_words
    )

    model.save('word2vec.model')

In [8]:
model = word2vec.Word2Vec.load('word2vec.model')

2022-02-24 09:49:30,058 : INFO : loading Word2Vec object from word2vec.model
2022-02-24 09:49:30,505 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2022-02-24 09:49:30,506 : INFO : loading vectors from word2vec.model.wv.vectors.npy with mmap=None
2022-02-24 09:49:30,528 : INFO : setting ignored attribute vectors_norm to None
2022-02-24 09:49:30,529 : INFO : loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
2022-02-24 09:49:30,529 : INFO : loading trainables recursively from word2vec.model.trainables.* with mmap=None
2022-02-24 09:49:30,530 : INFO : loading syn1neg from word2vec.model.trainables.syn1neg.npy with mmap=None
2022-02-24 09:49:30,552 : INFO : setting ignored attribute cum_table to None
2022-02-24 09:49:30,553 : INFO : loaded word2vec.model


### Take a look at the model

In [61]:
for item in model.wv.most_similar('communism'):
    print(item)

('fascism', 0.8815176486968994)
('nazism', 0.860171377658844)
('socialism', 0.8568580746650696)
('anarchism', 0.8435341119766235)
('capitalism', 0.8299001455307007)
('imperialism', 0.8271135091781616)
('stalinism', 0.8257947564125061)
('militarism', 0.8055943250656128)
('zionism', 0.803575873374939)
('nationalism', 0.8000373840332031)


In [10]:
model.wv['cyberpunk'].shape

(100,)

In [11]:
print('Total distinct words:',len(model.wv.vocab))

Total distinct words: 365540


# 2 Load train data

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import random

import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence

2022-02-24 09:50:38,512 : INFO : Note: NumExpr detected 20 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-02-24 09:50:38,512 : INFO : NumExpr defaulting to 8 threads.


### Process training data

In [30]:
line_length = 100
if not os.path.isfile('politic_processed.txt') or 0:
    words = []
    files = os.listdir('politic_texts')
    files = ['politic_texts/'+f for f in files if f.endswith(".txt")]
    print(files)
    for f in files:
        with open(f, 'r') as file:
            lines = file.readlines()
            for line in lines:
                words.extend(tokenize(line))
    print('Total words:', len(words))
    lines = []
    while len(words)>0:
        if len(words)>line_length:
            lines.append(words[:line_length])
            words = words[line_length:]
        else:
            lines.append(words)
            words = []
    print('Total sample:', len(lines))
    with open('politic_processed.txt', 'w') as file:
        for line in lines:
            file.write(' '.join(line) + '\n')

['politic_texts/politic1.txt', 'politic_texts/politic13.txt', 'politic_texts/politic18.txt', 'politic_texts/politic9.txt', 'politic_texts/politic16.txt', 'politic_texts/politic19.txt', 'politic_texts/politic11.txt', 'politic_texts/politic14.txt', 'politic_texts/politic5.txt', 'politic_texts/politic6.txt', 'politic_texts/politic4.txt', 'politic_texts/politic15.txt', 'politic_texts/politic8.txt', 'politic_texts/politic12.txt', 'politic_texts/politic2.txt', 'politic_texts/politic10.txt', 'politic_texts/politic.txt', 'politic_texts/politic17.txt', 'politic_texts/politic7.txt', 'politic_texts/politic3.txt']
Total words: 613223
Total sample: 6133


In [32]:
with open('politic_processed.txt', 'r') as file:
    lines = file.readlines()
train_s = pd.Series(lines)
train_s = train_s.apply(lambda x:x.split())
train_s.head()

0    [communism, from, latin, communis, ,, common, ...
1    [,, revolutionary, spontaneity, ,, and, worker...
2    [this, system, there, are, two, major, social,...
3    [in, turn, ,, establish, social, ownership, of...
4    [s, ., criticism, of, communism, can, be, divi...
dtype: object

In [33]:
train_vec = train_s.apply(lambda x: [model.wv[word] if word in model.wv.vocab else np.zeros(model.vector_size, dtype=np.float32) for word in x])
train_vec[0][0].shape

(100,)

In [34]:
class wiki_politic_set(Dataset):
    def __init__(self, sentences):
        self.data = sentences
        
    def __getitem__(self, index):
        x = np.asarray(self.data[index])
        y = np.append(x[1:],np.zeros(x[0].shape))
        x = torch.tensor(x)
        y = torch.tensor(y).reshape(x.shape)
        return x, y
    
    def __len__(self):
        return len(self.data)


def collate_fn(batch):
    text_list, predict_list = [], []
    for text, predict in batch:
        text_list.append(text)
        predict_list.append(predict)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    predict_list = pad_sequence(predict_list, batch_first=True, padding_value=0)
    return text_list.float(), predict_list.float()

In [49]:
batch_size = 1024
wikiset = wiki_politic_set(train_vec)

wiki_data = DataLoader(wikiset, batch_size=batch_size, collate_fn=collate_fn, sampler=RandomSampler(wikiset, replacement=True, num_samples=batch_size))

### checkout the dataset

In [50]:
for text, pred in wiki_data:
    print(text.shape, pred.shape, '\n')
    text = text[0,:,:]
    pred = pred[0,:,:]
    for i in range(20):
        word = text[i].reshape(1,100)
        word = np.asarray(word)
        word, _ = model.wv.most_similar(positive = word, topn = 1)[0]
        print(word, end=' ')
    print()
    for i in range(20):
        word = pred[i].reshape(1,100)
        word = np.asarray(word)
        word, _ = model.wv.most_similar(positive = word.reshape(1,100), topn = 1)[0]
        print(word, end=' ')
    print()

torch.Size([1024, 100, 100]) torch.Size([1024, 100, 100]) 

of sanitary conditions and the sale of noxious drugs , the preservation of a just system of distribution these , 
sanitary conditions and the sale of noxious drugs , the preservation of a just system of distribution these , among 


# 3. Train RNN

In [51]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [52]:
import GRU
from importlib import reload

In [53]:
GRU = reload(GRU)

In [54]:
gru_net = GRU.GRUNet().to(device)
print('Total parameters:', sum(p.numel() for p in gru_net.parameters()))

Total parameters: 5434980


In [55]:
learning_rate = .01
optimizer = Adam(gru_net.parameters(), lr = learning_rate)
epoches = 10000
train_loss_history = []
batch_num = len(wiki_data)

In [56]:
load_checkpoint = False
if load_checkpoint:
    checkpoint = torch.load('gru.checkpoint')
    gru_net.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    loss = checkpoint['epoch']

In [58]:
start_time = datetime.now()
for epoch in range(epoches):
    total_loss_train = []
    
    for x, pred in wiki_data: 
        x = x.to(device)
        pred = pred.to(device)
        h = gru_net.init_hidden(x.shape[0]).to(device)
        optimizer.zero_grad()
        output, h = gru_net(x, h)
        loss = F.mse_loss(output, pred)
        total_loss_train.append(loss.clone().detach().cpu())
        loss.backward()
        optimizer.step()
        loss.detach();output.detach();x.detach();pred.detach();
        
    use_time = datetime.now() - start_time

    est_time = use_time*epoches/(epoch+1) - use_time
    print('Epoch {:3}/{:3} Time Used: {} Time EST.: {}'.format(
        epoch+1, epoches, use_time.__str__().split(".")[0], est_time.__str__().split(".")[0]), end='\r')
    if (epoch+1)%500==0:
        torch.save({
        'epoch': epoch,
        'model_state_dict': gru_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'time_used': use_time,
        }, 'gru.checkpoint.'+str(epoch+1))
        
        torch.save({
        'epoch': epoch,
        'model_state_dict': gru_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'time_used': use_time,
        }, 'gru.checkpoint')
        
        train_loss = np.sqrt(np.mean(total_loss_train))
        train_loss_history.append(train_loss)
        print(' '*50, end='\r');
        print('Epoch {}: Train loss = {:.5E} Time: {}'.format(epoch+1, train_loss, use_time))
    

Epoch 500: Train loss = 1.90405E+00 Time: 0:01:14.749593
Epoch 1000: Train loss = 1.88765E+00 Time: 0:02:30.286432
Epoch 1500: Train loss = 1.85220E+00 Time: 0:03:46.855830
Epoch 2000: Train loss = 1.83416E+00 Time: 0:05:03.385470
Epoch 2500: Train loss = 1.78540E+00 Time: 0:06:20.076761
Epoch 3000: Train loss = 1.73598E+00 Time: 0:07:36.633291
Epoch 3500: Train loss = 1.71617E+00 Time: 0:08:53.219149
Epoch 4000: Train loss = 1.69063E+00 Time: 0:10:11.849992
Epoch 4500: Train loss = 1.65828E+00 Time: 0:11:28.396448
Epoch 5000: Train loss = 1.67557E+00 Time: 0:12:45.963534
Epoch 5500: Train loss = 1.67496E+00 Time: 0:14:03.863957
Epoch 6000: Train loss = 1.77757E+00 Time: 0:15:22.477537
Epoch 6186/10000 Time Used: 0:15:52 Time EST.: 0:09:47

KeyboardInterrupt: 