In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import os
import time 
import glob
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [5]:
# Dataset locations. Each directory is expected to hold one sub-folder per
# category containing the *.txt files (see read_files_from_folders).
_DATASET_ROOT = './bbc-news-summary/BBC News Summary'
articles_path = _DATASET_ROOT + '/News Articles'
summaries_path = _DATASET_ROOT + '/Summaries'

# Categories to load; shrink this list (e.g. to ['tech']) for a quicker run.
categories_list = [
    'politics',
    'sport',
    'tech',
    'entertainment',
    'business',
]

In [6]:
def read_files_from_folders(articles_path, summaries_path, categories_list=['tech', 'sport'], encoding = "ISO-8859-1"):
    """Read paired article/summary text files, one sub-folder per category.

    For every category, the ``*.txt`` files under ``articles_path/<category>``
    and ``summaries_path/<category>`` are listed, sorted, and paired by
    position. Sorting matters: ``glob.glob`` returns matches in arbitrary
    order, so without it ``articles[i]`` and ``summaries[i]`` are not
    guaranteed to belong to the same document. Pairing by sorted position
    assumes both folders use the same file names.

    Args:
        articles_path: Directory containing one sub-folder of articles per category.
        summaries_path: Directory containing one sub-folder of summaries per category.
        categories_list: Names of the category sub-folders to read (not mutated).
        encoding: Text encoding used to open every file.

    Returns:
        A tuple ``(articles, summaries, categories)`` of three equal-length
        lists, where ``categories[i]`` is the category of the i-th pair.
        If a category has a different number of article and summary files,
        a message is printed and ``None`` is returned instead.
    """
    articles = []
    summaries = []
    categories = []
    for category in categories_list:
        # Sort both listings so that the i-th article and i-th summary match.
        article_paths = sorted(glob.glob(os.path.join(articles_path, category, '*.txt'), recursive=True))
        summary_paths = sorted(glob.glob(os.path.join(summaries_path, category, '*.txt'), recursive=True))

        if len(article_paths) != len(summary_paths):
            print('number of files is not equal')
            return

        for article_path, summary_path in zip(article_paths, summary_paths):
            categories.append(category)
            with open(article_path, mode='r', encoding=encoding) as file:
                articles.append(file.read())

            with open(summary_path, mode='r', encoding=encoding) as file:
                summaries.append(file.read())
    return articles, summaries, categories

In [7]:
# Load every article/summary pair for the configured categories and
# collect them into a single three-column DataFrame.
articles, summaries, categories = read_files_from_folders(
    articles_path,
    summaries_path,
    categories_list,
)
df = pd.DataFrame(
    {
        'articles': articles,
        'summaries': summaries,
        'categories': categories,
    }
)

In [8]:
# Keep only the text columns and drop rows with a missing article or summary.
df = df[['articles', 'summaries']]
df = df.dropna()

# Hold out 10% of the rows. The seed is fixed so that re-running the notebook
# reproduces the same train/test partition.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [9]:
# Bare expression: the notebook renders the (articles, summaries) DataFrame.
df

Unnamed: 0,articles,summaries
0,Labour plans maternity pay rise\n\nMaternity p...,She said her party would boost maternity pay i...
1,Watchdog probes e-mail deletions\n\nThe inform...,All e-mails are subject to the freedom of info...
2,Hewitt decries 'career sexism'\n\nPlans to ext...,Ms Hewitt also announced a new drive to help w...
3,Labour chooses Manchester\n\nThe Labour Party ...,The Labour Party will hold its 2006 autumn con...
4,Brown ally rejects Budget spree\n\nChancellor ...,"But Mr Balls, a prospective Labour MP, said he..."
...,...,...
2220,Trial begins of Spain's top banker\n\nThe tria...,Both executives helped Mr Botin orchestrate Sp...
2221,UK economy ends year with spurt\n\nThe UK econ...,"Simon Rubinsohn, chief economist at Gerrard, s..."
2222,HealthSouth ex-boss goes on trial\n\nThe forme...,Several former HealthSouth employees have alre...
2223,Euro firms miss out on optimism\n\nMore than 9...,"Possibly as a result, the worry about low-cost..."


In [10]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the 'punkt' resource (a no-op when it is already up to date).
# NOTE(review): presumably required by sent_tokenize below - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lumin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from nltk.tokenize import RegexpTokenizer

# The pattern keeps only runs of ASCII letters, so digits and punctuation
# are discarded when a sentence is re-assembled from its tokens.
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
sentences = []

for article in df['articles']:
    article_sents = sent_tokenize(article)
    if not article_sents:
        continue

    # Only the leading sentence of each article is collected.
    # NOTE(review): this yields one entry per article; confirm that skipping
    # the remaining sentences (and the summaries) is intended.
    sent = article_sents[0]
    tokens = tokenizer.tokenize(sent)
    s = ' '.join(tokens)
    print(s)
    sentences.append(s)

len(sentences)

Labour plans maternity pay rise Maternity pay for new mothers is to rise by as part of new proposals announced by the Trade and Industry Secretary Patricia Hewitt
Watchdog probes e mail deletions The information commissioner says he is urgently asking for details of Cabinet Office orders telling staff to delete e mails more than three months old
Hewitt decries career sexism Plans to extend paid maternity leave beyond six months should be prominent in Labour s election manifesto the Trade and Industry Secretary has said
Labour chooses Manchester The Labour Party will hold its autumn conference in Manchester and not Blackpool it has been confirmed
Brown ally rejects Budget spree Chancellor Gordon Brown s closest ally has denied suggestions there will be a Budget giveaway on March
Errors doomed first Dome sale The initial attempt to sell the Millennium Dome failed due to a catalogue of errors a report by the government s finance watchdog says
Fox attacks Blair s Tory lies Tony Blair lied 

2225

In [33]:
# Hold out 5% of the sentences for validation. The seed is fixed so that the
# train/validation files written from these lists are reproducible.
train_sents, test_sents = train_test_split(sentences, test_size=0.05, random_state=42)
print(len(train_sents), len(test_sents))

43745 2303


In [34]:
# Peek at the first ten training sentences.
print(train_sents[:10])

['But the subsequent months have not been easy for Tevez.', 'Jerry Yang, who remains the firm\'s "Chief Yahoo", is proud of what the company has achieved.Yahoo, one of the net\'s most iconic companies, is celebrating its 10th anniversary this week.The web portal has undergone remarkable change since it was set up by Stanford University students David Filo and Jerry Yang in a campus trailer.', "I don't go into that game.", 'But 3G will not have everything its own way.', '"The conditions for euro entry are unchanged by this new decision about the stability and growth pact," Mr. Brown said.The chancellor said that the EU\'s planned changes in the growth and stability pact - designed to ensure that countries in the euro zone do not borrow too much - would force Britain to run a budget surplus of 1% over the economic cycle.But, speaking to the Treasury Select Committee, Gordon Brown said that the new stability pact rules were not part of a binding Treaty and could be changed again the futur

In [35]:
# Dump the training sentences to disk, one sentence per line.
with open("bbc.train.txt", mode='w') as file:
    for sent in train_sents:
        file.write(sent + '\n')

In [36]:
# Dump the held-out sentences to disk, one sentence per line.
with open("bbc.valid.txt", mode='w') as file:
    for sent in test_sents:
        file.write(sent + '\n')

In [1]:
import os
import json
import torch
import argparse

from model import SentenceVAE
from utils import to_var, idx2word, interpolate

In [2]:
# Load the vocabulary: a JSON object holding the word->index ('w2i') and
# index->word ('i2w') mappings.
with open('bbc_full/bbc.vocab.json', 'r') as file:
    vocab = json.load(file)

w2i, i2w = vocab['w2i'], vocab['i2w']

# Build the model. The layer sizes must match those of the checkpoint loaded
# below, otherwise load_state_dict raises on mismatching parameter shapes.
model = SentenceVAE(
    vocab_size=len(w2i),
    sos_idx=w2i['<sos>'],
    eos_idx=w2i['<eos>'],
    pad_idx=w2i['<pad>'],
    unk_idx=w2i['<unk>'],
    max_sequence_length=50,
    embedding_size=300,
    rnn_type='gru',
    hidden_size=256,
    word_dropout=0,
    embedding_dropout=0.5,
    latent_size=16,
    num_layers=1,
    bidirectional=False
    )

checkpoint = "bin/2024-Jun-07-00-34-15/E19.pytorch"

if not os.path.exists(checkpoint):
    raise FileNotFoundError(checkpoint)

# Deserialize onto the CPU first so that a checkpoint saved from a GPU run
# can still be loaded on a machine without CUDA; the model is moved to the
# GPU afterwards when one is available.
model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
print("Model loaded from %s" % checkpoint)

if torch.cuda.is_available():
    model = model.cuda()

Model loaded from bin/2024-Jun-07-00-34-15/E19.pytorch


In [3]:
# Put the model in evaluation mode before generating text.
model.eval()

# Generate 10 samples and print them decoded to text, one per line.
samples, z = model.inference(n=10)
print('----------SAMPLES----------')
print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

# Draw two random 16-dimensional vectors (16 matches the latent_size the
# model was built with) and decode the 8-step path between them.
# NOTE(review): interpolate() is a project helper; presumably it returns the
# intermediate points between start and end - confirm against utils.
z1 = torch.randn([16]).numpy()
z2 = torch.randn([16]).numpy()
z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
samples, _ = model.inference(z=z)
print('-------INTERPOLATION-------')
print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

  sequence_idx = torch.arange(0, batch_size, out=self.tensor()).long()  # all idx of batch


----------SAMPLES----------
" the problem is that it is a good product , " said mr <unk> , chief executive of wippit , which monitors the consumer electronics show . <eos>
the european union ( ebu ) is set to grow in the uk . <eos>
" we have to make sure that we are seeing their own brand and more profitable services are not going to be more cost-effective . " <eos>
but the teamwork is not just to be kidding . <eos>
natasha bedingfield and pj harvey are shortlisted for the final stage of the night in the final three years . <eos>
he was not convinced . <eos>
the prime minister ratcheted up his side of the party in iraq and the city of the country . <eos>
the attorney general said that it was a criminal offence to ensure that the company had been designed to ensure that the system was " a criminal offence " . <eos>
however , the paris-based of the world's biggest carmaker maruti was overshadowed for a sixth try . <eos>
the research recommended the document partly based on experiments of

In [13]:
import os
import json
import time
import torch
import argparse
import numpy as np
from multiprocessing import cpu_count
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader
from collections import OrderedDict, defaultdict

from ptb import PTB
from utils import to_var, idx2word, expierment_name
from model import SentenceVAE

In [15]:
splits = ['train', 'valid']

# Options shared by both splits; only the split name differs per dataset.
ptb_options = dict(
    data_dir="bbc_full",
    create_data=False,
    max_sequence_length=60,
    min_occ=1,
)

# One PTB dataset per split, keyed by split name in insertion order.
datasets = OrderedDict()
for split in splits:
    datasets[split] = PTB(split=split, **ptb_options)

In [22]:
# Build a loader over the validation split; shuffling is enabled only when
# the selected split is 'train'.
split = 'valid'
data_loader = DataLoader(
    datasets[split],
    batch_size=10,
    shuffle=(split == 'train'),
    num_workers=cpu_count(),
    pin_memory=torch.cuda.is_available(),
)

In [23]:
# Switch to evaluation mode; as the cell's last expression, the returned
# module is displayed, which shows the layer layout.
model.eval()

SentenceVAE(
  (embedding): Embedding(26946, 300)
  (embedding_dropout): Dropout(p=0.5, inplace=False)
  (encoder_rnn): GRU(300, 256, batch_first=True)
  (decoder_rnn): GRU(300, 256, batch_first=True)
  (hidden2mean): Linear(in_features=256, out_features=16, bias=True)
  (hidden2logv): Linear(in_features=256, out_features=16, bias=True)
  (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2vocab): Linear(in_features=256, out_features=26946, bias=True)
)

In [24]:
# Take the first batch produced by the loader.
batch = next(iter(data_loader))
# Bare expression: display the batch contents.
batch

{'input': tensor([[    2,   721,  1258,    59,   724,   715,    48,   725,    94,   726,
             47,   727,   669,    36,   720,   728,    48,   729,    17,    46,
             47,    29,    59,   724,  9179,    57,   365,   730,     1,   730,
           5334,     5,  1488,    29,   302,  1893,    58,    59,   724,   715,
            445,   721,    29,   302,  8372,    58,     5,   724,    17,  6717,
            189,   859,   730,   731,    57,    59,  2649,    58,   737, 13297],
         [    2,     4,  1331,  2389,  2390,   219,    21,   179,   268,   179,
            444,  3823,    17,   262,     8,  2051,    59,  3261,   136,    66,
             14,    21,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [    2, 14196,   867

In [28]:
# Send the batch to whichever device the model's parameters live on. The
# model is only moved to the GPU when CUDA is available, so an unconditional
# .cuda() here would fail on a CPU-only machine.
device = next(model.parameters()).device
logp, mean, logv, z = model(batch['input'].to(device), batch['length'].to(device))

In [37]:
# Top-1 over the last axis: the second result holds the indices of the
# highest-scoring entries (despite the name, `logits` stores indices).
_, logits = logp.topk(k=1, dim=-1)

In [42]:
# Sanity check: the top-1 indices can be viewed as a 10 x 60 grid.
# NOTE(review): 10 and 60 presumably mirror batch_size=10 and
# max_sequence_length=60 used for the loader/dataset - confirm.
logits.reshape(10,60).shape

torch.Size([10, 60])

In [45]:
# Print the batch's input sequences converted back to words via idx2word,
# separated by blank lines.
print(*idx2word(batch['input'], i2w=i2w, pad_idx=w2i['<pad>']), sep='\n\n')

<sos> yukos says a us court was entitled to declare it bankrupt before its yugansk unit was sold , since it has a us subsidiary and local bank <unk> bank maintains the case has no place in a us court because yukos has no assets in the us , apart from two bank accounts and a house in houston owned

<sos> but told bbc news : " as far as i'm concerned , we have drawn a line under that . "

<sos> davos itself is in deep frost .

<sos> " i think it's the first productive thing we've had from bt , " he <unk> meet the growing demand for greater bandwidth , bt said it would begin trials in april with a view to launching <unk> services nationally from the autumn .

<sos> the announcement about internet explorer was made by bill gates , microsoft chairman and chief software architect , during a keynote speech at the rsa security conference currently being held in san francisco .

<sos> " the betamax principles stand as the magna carta for the technology industry and are responsible for the explo

In [46]:
# Print the model's top-1 predictions converted back to words via idx2word,
# separated by blank lines, for comparison with the inputs.
print(*idx2word(logits, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n\n')

yukos has it moscow auction in entitled to declare it bankrupt in yugansk yugansk unit yuganskneftegas acquired in yugansk yugansk was bought troubled subsidiary in yuganskneftegas yugansk accounts russian accounts yukos russian in bought jobs in yukos wide-ranging court . yukos is bought assets of yukos us . assets from yukos russian accounts , bank russian of houston . by

" he bbc radio : " i a as i concerned that and have to up consensus with immense is <eos> <eos> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>

schools is is a a and . <eos> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <un