In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [2]:
URL = 'https://en.wikipedia.org/wiki/List_of_mammals_of_Europe'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Find all wikiparts
Animals = soup.find_all('a')
# Create links 
AnimalsWikiPages = ['https://en.wikipedia.org/' + pages.get('href') for pages in Animals 
                       if pages.get('href') != None 
                       if pages.get('href').startswith('/wiki/')]
                       # Reduces the retrieved pages (does not work)
                       #if pages.span != None 
                       #if pages.span.attrs['class'][0] == 'tocnumber']

In [3]:
# init dicts
data = []

# Removes references in text
ReferenceRemover = '\[\d*\]'

for WikiPage in AnimalsWikiPages[:]:
    
    # Open the page
    page = requests.get(WikiPage, timeout=5)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    for Tags in soup.find_all('h2'):

        # Skip useless/empty stuff
        if Tags.span == None:
            continue

        # Set chapter variable    
        Chapter = Tags.span.attrs['id']

        # Check if the chapter is description (or similar)
        if Chapter == 'Characteristics'or \
           Chapter == 'Description' or \
           Chapter == 'Appearance':

            # Get the next sibling (text)
            for Text in Tags.find_next_siblings('p'):

                # Add description data to dict
                if Chapter in Text.find_previous_siblings('h2')[0].text.strip():
                    # Remove source
                    Paragraph = re.sub(ReferenceRemover, '', Text.text)
                    # Split into Sentences
                    SentenceList = Paragraph.split('. ')
                    # Add to the dict
                    data += [(1, Sentence) for Sentence in SentenceList]

                # Add non description data to dict
                elif Chapter not in Text.find_previous_siblings('h2')[0].text.strip():
                    # Remove source
                    Paragraph = re.sub(ReferenceRemover, '', Text.text)
                    # Split into Sentences
                    SentenceList = Paragraph.split('. ')
                    # Add to the dict
                    data += [(2, Sentence) for Sentence in SentenceList]

In [4]:
len(data)

18867

In [5]:
# Train test sequence
train = int(len(data) * 0.8)
test = len(data) - train

In [6]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchtext.data.functional import to_map_style_dataset

In [7]:
# Split the dataset
trainset = data[0:train]
testset = data[train:]

In [8]:
# Basic English
tokenizer = get_tokenizer('basic_english')

# Tokenize the dataset
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

# Build a vocabulary        
vocab = build_vocab_from_iterator(yield_tokens(trainset), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [9]:
# Create a pipeline
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [21]:
text_pipeline('the whale and the bear are not equal.')

[2, 77, 3, 2, 261, 10, 43, 3125, 7]

In [11]:
# Set device (CPU for macs)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [12]:
train_iter = trainset
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [13]:
from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [14]:
train_iter = trainset
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [15]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [16]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 2 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = trainset, testset
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 7169 batches | accuracy    0.792
| epoch   1 |  1000/ 7169 batches | accuracy    0.842
| epoch   1 |  1500/ 7169 batches | accuracy    0.853
| epoch   1 |  2000/ 7169 batches | accuracy    0.868
| epoch   1 |  2500/ 7169 batches | accuracy    0.865
| epoch   1 |  3000/ 7169 batches | accuracy    0.906
| epoch   1 |  3500/ 7169 batches | accuracy    0.879
| epoch   1 |  4000/ 7169 batches | accuracy    0.895
| epoch   1 |  4500/ 7169 batches | accuracy    0.871
| epoch   1 |  5000/ 7169 batches | accuracy    0.890
| epoch   1 |  5500/ 7169 batches | accuracy    0.885
| epoch   1 |  6000/ 7169 batches | accuracy    0.889
| epoch   1 |  6500/ 7169 batches | accuracy    0.907
| epoch   1 |  7000/ 7169 batches | accuracy    0.890
-----------------------------------------------------------
| end of epoch   1 | time:  4.95s | valid accuracy    0.882 
-----------------------------------------------------------
| epoch   2 |   500/ 7169 batches | accuracy    0.912
| epoch  

In [17]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.899


In [29]:
label = {1: "a description or similar.",
         2: "something else."}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = """
No pochard has a metallic coloured speculum, something that is characteristic of other ducks.\n'
"""    
    
model = model.to("cpu")

print("This is %s" %label[predict(ex_text_str, text_pipeline)])

This is a description or similar.


In [40]:
# init dicts
ExtraTesting = []

# Removes references in text
ReferenceRemover = '\[\d*\]'

URL = 'https://en.wikipedia.org/wiki/Common_goldeneye'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
    
for Tags in soup.find_all('h2'):

    # Skip useless/empty stuff
    if Tags.span == None:
        continue

    # Set chapter variable    
    Chapter = Tags.span.attrs['id']

    # Check if the chapter is description (or similar)
    if Chapter == 'Characteristics'or \
       Chapter == 'Description' or \
       Chapter == 'Appearance':

        # Get the next sibling (text)
        for Text in Tags.find_next_siblings('p'):

            # Add description data to dict
            if Chapter in Text.find_previous_siblings('h2')[0].text.strip():
                # Remove source
                Paragraph = re.sub(ReferenceRemover, '', Text.text)
                # Split into Sentences
                SentenceList = Paragraph.split('. ')
                # Add to the dict
                ExtraTesting += [(1, Sentence) for Sentence in SentenceList]

            # Add non description data to dict
            elif Chapter not in Text.find_previous_siblings('h2')[0].text.strip():
                # Remove source
                Paragraph = re.sub(ReferenceRemover, '', Text.text)
                # Split into Sentences
                SentenceList = Paragraph.split('. ')
                # Add to the dict
                ExtraTesting += [(0, Sentence) for Sentence in SentenceList]

In [41]:
ExtraTesting[0][1]

'Adult males ranges from 45–51\xa0cm (18–20\xa0in) and weigh approximately 1,000\xa0g (2.2\xa0lb), while females range from 40–50\xa0cm (16–20\xa0in) and weigh approximately 800\xa0g (1.8\xa0lb)'

In [42]:
label = {1: "a description or similar.",
         2: "something else."}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

for tests in ExtraTesting[0:100]:
    ex_text_str = tests[1]
    
    model = model.to("cpu")

    print("This is %s" %label[predict(ex_text_str, text_pipeline)])
    print("Real value was {0}".format(tests[0]))

This is a description or similar.
Real value was 1
This is a description or similar.
Real value was 1
This is a description or similar.
Real value was 1
This is a description or similar.
Real value was 1
This is a description or similar.
Real value was 1
This is a description or similar.
Real value was 1
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is a description or similar.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real value was 0
This is something else.
Real