In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [2]:
URL = 'https://en.wikipedia.org/wiki/List_of_mammals_of_Europe'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Find all wikiparts
Animals = soup.find_all('a')
# Create links 
AnimalsWikiPages = ['https://en.wikipedia.org/' + pages.get('href') for pages in Animals 
                       if pages.get('href') != None 
                       if pages.get('href').startswith('/wiki/')]
                       # Reduces the retrieved pages (does not work)
                       #if pages.span != None 
                       #if pages.span.attrs['class'][0] == 'tocnumber']

In [3]:
# init dicts
data = []

# Removes references in text
ReferenceRemover = '\[\d*\]'

for WikiPage in AnimalsWikiPages[:]:
    
    # Open the page
    page = requests.get(WikiPage, timeout=5)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    for Tags in soup.find_all('h2'):

        # Skip useless/empty stuff
        if Tags.span == None:
            continue

        # Set chapter variable    
        Chapter = Tags.span.attrs['id']

        # Check if the chapter is description (or similar)
        if Chapter == 'Characteristics'or \
           Chapter == 'Description' or \
           Chapter == 'Appearance':

            # Get the next sibling (text)
            for Text in Tags.find_next_siblings('p'):

                # Add description data to dict
                if Chapter in Text.find_previous_siblings('h2')[0].text.strip():
                    # Remove source
                    Paragraph = re.sub(ReferenceRemover, '', Text.text)
                    # Split into Sentences
                    SentenceList = Paragraph.split('. ')
                    # Add to the dict
                    data += [(1, Sentence) for Sentence in SentenceList]

                # Add non description data to dict
                elif Chapter not in Text.find_previous_siblings('h2')[0].text.strip():
                    # Remove source
                    Paragraph = re.sub(ReferenceRemover, '', Text.text)
                    # Split into Sentences
                    SentenceList = Paragraph.split('. ')
                    # Add to the dict
                    data += [(2, Sentence) for Sentence in SentenceList]

In [101]:
len(data)

18867

In [105]:
TruthData = [i for i in data if i[0] == 1]
len(TruthData)

3573

In [5]:
# Train test sequence
train = int(len(data) * 0.8)
test = len(data) - train

In [107]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchtext.data.functional import to_map_style_dataset
from torch import nn

In [7]:
# Split the dataset
trainset = data[0:train]
testset = data[train:]

In [8]:
# Basic English
tokenizer = get_tokenizer('basic_english')

# Tokenize the dataset
def yield_tokens(data_iter):
    # Drop the label (label, text)
    for _, text in data_iter:
        yield tokenizer(text)

# Build a vocabulary        
vocab = build_vocab_from_iterator(yield_tokens(trainset), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [9]:
# Create a pipeline
text_pipeline = lambda x: vocab(tokenizer(x))
# -1 could be removed if data is loaded differently
label_pipeline = lambda x: int(x) - 1

In [125]:
# Testing
proc = torch.tensor(text_pipeline('the a whale and, the bear are not equal.'), dtype=torch.int64)
print(proc)

tensor([   2,    8,   77,    3,    1,    2,  261,   10,   43, 3125,    7])


In [112]:
# Set device (CPU for macs)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    
    # Init lists
    label_list, text_list, offsets = [], [], [0]
    
    # Loop over the data
    for (_label, _text) in batch:
        # Append the labels to list
        label_list.append(label_pipeline(_label))
        # Process the text (singed 64), and convert to tensor
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        # Append the text to list 
        text_list.append(processed_text)
        # Append the offset (tensor size)   
        offsets.append(processed_text.size(0))
    
    # Convert the label list to a tensor
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # Cummulative sum the offsets (dim=0 == rowwise)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    # Concatenate the text list
    text_list = torch.cat(text_list)
    
    # Return the values
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [140]:
train_iter = trainset
dataloader = DataLoader(train_iter, batch_size=16, shuffle=True, collate_fn=collate_batch)

In [141]:
next(iter(dataloader))

(tensor([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1]),
 tensor([   19,     9,   713,     4,   953,  2420,     5,  3705,   540,   476,
          1826,     1,    14,   355,     1,  4193,  1826,  2383,     8,  1812,
             5,   582,     1,  2253,  1080,    14,     2,  1379,     5,     8,
           331,   178,    77,   304,     4,  1768,     1,     2,  5983,   312,
            22,    35,  4111,    16,   133,    20,     8,  1433,     5,  2043,
          6708,     1, 13992,     4,     8,   360,   463,  5772, 12640,  3936,
          5675,     4,   944,  1486,    23,  2790,     2,   348,   501,  1417,
          2343,     6,  4109,    42,  1193,  2118, 10718,     4, 14427,     1,
            18,   564,     1,   291,    99,    22,    35,  2895,   177,  2838,
             1,     3,    22,  3113,     4,  1066, 13754,     4, 16034,     2,
           684,  1986,     9,   118,  2638,     4,  4113,     3,  3176,    40,
           164,   292,     4,   222,     2,   232,   661,    92,     9, 

In [134]:
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [135]:
train_iter = trainset
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [136]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [137]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 16 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = trainset, testset
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/  897 batches | accuracy    0.848
-----------------------------------------------------------
| end of epoch   1 | time:  1.30s | valid accuracy    0.881 
-----------------------------------------------------------
| epoch   2 |   500/  897 batches | accuracy    0.901
-----------------------------------------------------------
| end of epoch   2 | time:  1.24s | valid accuracy    0.914 
-----------------------------------------------------------
| epoch   3 |   500/  897 batches | accuracy    0.922
-----------------------------------------------------------
| end of epoch   3 | time:  1.24s | valid accuracy    0.889 
-----------------------------------------------------------
| epoch   4 |   500/  897 batches | accuracy    0.935
-----------------------------------------------------------
| end of epoch   4 | time:  1.22s | valid accuracy    0.914 
-----------------------------------------------------------
| epoch   5 |   500/  897 batches | accuracy    0.941
------

In [138]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.900


In [142]:
label = {1: "a description or similar.",
         2: "something else."}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = """
One of the largest of living carnivores, grizzly bears are 1 to 2.8 meters in length from head to rump and\
their tails are 65 to 210 mm long. They are 90 to 150 cm tall at the shoulder and can tower at an intimidating\
height of 8 feet when standing upright on their hind legs. They range in weight from 80 to more than 600 kg. \
On average, adult males are 8 to 10% larger than females. Ursus arctos is largest along the the coast of southern\
Alaska and on nearby islands where males average 389 kg and females average 207 kg, though some males have been weighed \
at as much as 780 kg. Distance between the canines is from 6 to 8 cm. Size rapidly declines to the north and east, with\
individuals in southwestern Yukon weighing only 140 kg on average. Fur is usually dark brown, but varies from cream to almost black.\
Individuals in the Rocky Mountains have long hairs along the shoulders and back which are frosted with white, giving a grizzled appearance,\
hence the common name grizzly bear in that region. Brown bears are extremely strong and have good endurance; they\
can kill a cow with one blow, outrun a horse, outswim an Olympian, and drag a dead elk uphill. (Wilson and Ruff, 1999)"""    
model = model.to("cpu")

print("This is %s" %label[predict(ex_text_str, text_pipeline)])

This is a description or similar.


In [65]:
# init dicts
ExtraTesting = []

# Removes references in text
ReferenceRemover = '\[\d*\]'

URL = 'https://en.wikipedia.org/wiki/Eurasian_wigeon'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
    
for Tags in soup.find_all('h2'):

    # Skip useless/empty stuff
    if Tags.span == None:
        continue

    # Set chapter variable    
    Chapter = Tags.span.attrs['id']

    # Check if the chapter is description (or similar)
    if Chapter == 'Characteristics'or \
       Chapter == 'Description' or \
       Chapter == 'Appearance':

        # Get the next sibling (text)
        for Text in Tags.find_next_siblings('p'):

            # Add description data to dict
            if Chapter in Text.find_previous_siblings('h2')[0].text.strip():
                # Remove source
                Paragraph = re.sub(ReferenceRemover, '', Text.text)
                # Split into Sentences
                SentenceList = Paragraph.split('. ')
                # Add to the dict
                ExtraTesting += [(1, Sentence) for Sentence in SentenceList]

            # Add non description data to dict
            elif Chapter not in Text.find_previous_siblings('h2')[0].text.strip():
                # Remove source
                Paragraph = re.sub(ReferenceRemover, '', Text.text)
                # Split into Sentences
                SentenceList = Paragraph.split('. ')
                # Add to the dict
                ExtraTesting += [(0, Sentence) for Sentence in SentenceList]

In [111]:
label = {1: "a description or similar.",
         2: "something else."}

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

for tests in ExtraTesting:
    ex_text_str = tests[1]
    
    model = model.to("cpu")

    print("This is %s" %label[predict(ex_text_str, text_pipeline)])
    print("Real value was {0}".format(tests[0]))
    print(tests[1])
    print('\n')

This is a description or similar.
Real value was 1
This dabbling duck is 42–52 cm (17–20 in) long with a 71–80 cm (28–31 in) wingspan, and a weight of 500–1,073 g (1.102–2.366 lb)


This is a description or similar.
Real value was 1
The breeding male has grey flanks and back, with a black rear end, a dark green speculum and a brilliant white patch on upper wings, obvious in flight or at rest


This is a description or similar.
Real value was 1
It has a pink breast, white belly, and a chestnut head with a creamy crown


This is something else.
Real value was 1
In non-breeding (eclipse) plumage, the drake looks more like the female


This is a description or similar.
Real value was 1
The female is light brown, with plumage much like a female American wigeon


This is a description or similar.
Real value was 1
It can be distinguished from most other ducks, apart from American wigeon, on shape


This is a description or similar.
Real value was 1
However, that species has a paler head and w