In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import torch
import pickle
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchtext.data.functional import to_map_style_dataset
from torch import nn
from itertools import chain
from collections import Counter

In [2]:
# Load data
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load "dataBOW.pkl" if it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("dataBOW.pkl", "rb") as pickle_in:
    # Later cells call data.values() and read (label, text) pairs from the
    # flattened values, so the pickle presumably holds a dict of such lists.
    data = pickle.load(pickle_in)

In [6]:
# Flatten every per-key list of the loaded dict into one list of entries
TotalValues = [entry for group in data.values() for entry in group]
print('{0} values'.format(len(TotalValues)))

# Report how many entries carry label 1, then how many carry label 0
# (the label is the first element of each entry)
for wanted in (1, 0):
    print(Counter(entry[0] for entry in TotalValues if entry[0] == wanted))

124 values
Counter({1: 32})
Counter({0: 92})


In [None]:
# Disabled exploratory code: everything below sits inside a bare string
# literal, so none of it is executed. It counts the entries and the
# label-1 / label-0 totals with nested list comprehensions.
'''
# Species
len(data)
# Total values
Total = sum(len(v) for v in list(data.values()))
print(Total)
# Total Truths
TotalValues = [v for v in list(data.values())]
Truths = [[ones[0] for ones in Values if ones[0] == 1] for Values in TotalValues]
AmountofTruths = sum(len(x) for x in Truths)
# Total False
AmountofFalse = Total - AmountofTruths
print(AmountofTruths)
print(AmountofFalse)
'''

In [7]:
# Extract values from the dict.
# The isinstance guard keeps this cell re-runnable: after the first run `data`
# is already the flat list, and calling .values() on it would raise.
if isinstance(data, dict):
    data = list(chain.from_iterable(data.values()))

# Train test sequence: 80% of the entries for training, the remainder for testing
# NOTE(review): the name `train` is rebound later by the `train(dataloader)`
# function, so the split cell that slices with `train` must run before that one.
train = int(len(data) * 0.8)
test = len(data) - train

In [8]:
# Split the dataset: the first `train` entries are used for training,
# everything after them for testing
trainset, testset = data[:train], data[train:]

In [9]:
# Sanity check: show how many entries ended up in the training split
len(trainset)

99

In [10]:
# Tokenizer from torchtext's built-in 'basic_english' scheme
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Yield the token list of each (label, text) pair; the label is ignored."""
    yield from (tokenizer(text) for _, text in data_iter)


# Build the vocabulary from the training texts only; "<unk>" is registered as a
# special token and then set as the default index for lookups
vocab = build_vocab_from_iterator(yield_tokens(trainset), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [11]:
# Create the pipelines used when batching and predicting
def text_pipeline(x):
    """Tokenize the string `x` and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a raw label to a plain int."""
    return int(x)

In [12]:
# Testing: encode a sample sentence and show the resulting index tensor
sample_ids = text_pipeline('the a whale and, the bear are not equal.')
proc = torch.tensor(sample_ids, dtype=torch.int64)
print(proc)

tensor([ 6,  7,  0,  3,  2,  6,  0, 20, 51,  0,  1])


In [13]:
# Set device: use CUDA when it is available, otherwise the CPU (e.g. on macs)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def collate_batch(batch):
    """
    Turn a batch of (label, text) pairs into the tensors the model consumes.

    Each text is converted to vocabulary indices with `text_pipeline`, and all
    texts of the batch are concatenated into one flat tensor. `offsets` holds
    the start position of every text inside that flat tensor.

    Returns a tuple (labels, flat_token_ids, offsets), each moved to `device`.
    """
    labels, token_tensors, lengths = [], [], []

    for _label, _text in batch:
        labels.append(label_pipeline(_label))
        # Token ids of this text as a signed 64-bit tensor
        ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        token_tensors.append(ids)
        lengths.append(ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Start of text i = total length of texts 0..i-1: prepend 0, drop the last
    # length, then take the cumulative sum along dim 0
    starts = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    flat_ids = torch.cat(token_tensors)

    return label_tensor.to(device), flat_ids.to(device), starts.to(device)

In [14]:
# Preview loader over the training split: shuffled batches of 16, collated
# into (labels, token ids, offsets) by collate_batch
train_iter = trainset
dataloader = DataLoader(
    train_iter,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_batch,
)

In [15]:
# Peek at one collated batch: (labels, flat token ids, offsets)
next(iter(dataloader))

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]),
 tensor([530,   1,   7,  ..., 774,   1, 915]),
 tensor([   0,    7,  318,  333,  434,  476,  625,  739,  772,  798,  909, 1102,
         1109, 1157, 1191, 1215]))

In [16]:
class TextClassificationModel(nn.Module):
    """Text classifier made of an nn.EmbeddingBag followed by one linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the embedding bag and the linear layer, then initialise them.

        vocab_size: number of rows in the embedding table.
        embed_dim:  size of each embedding vector.
        num_class:  number of output scores produced per text.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        with torch.no_grad():
            self.embedding.weight.uniform_(-bound, bound)
            self.fc.weight.uniform_(-bound, bound)
            self.fc.bias.zero_()

    def forward(self, text, offsets):
        """Embed the flat token ids per bag (delimited by `offsets`) and score them."""
        return self.fc(self.embedding(text, offsets))

In [17]:
train_iter = trainset
# One output unit per distinct label found in the training split
num_class = len({label for label, _ in train_iter})
vocab_size = len(vocab)
emsize = 64  # embedding dimension
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [18]:
import time

def train(dataloader, log_interval=500):
    """Run one training pass over `dataloader`.

    Uses the notebook-level `model`, `optimizer` and `criterion`. The progress
    line also reads the notebook-level `epoch`, so that name must exist
    whenever a log line is due.

    dataloader:   yields (label, text, offsets) batches.
    log_interval: print the running accuracy every `log_interval` batches
                  (must be a positive integer); the counters restart after
                  each printed line.
    """
    model.train()
    total_acc, total_count = 0, 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the gradient norm to 0.1 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the notebook-level `model` over `dataloader`.

    dataloader: yields (label, text, offsets) batches.

    Returns the fraction of examples whose arg-max prediction equals the label.
    Raises ZeroDivisionError when the dataloader yields no examples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    # No gradients are needed for scoring
    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [22]:
# Hyperparameters
EPOCHS = 8 # epoch
LR = 5  # learning rate
BATCH_SIZE = 2 # batch size for training
SEED = 0  # seed for the train/validation split and the batch shuffling

# Seed torch's global RNG so the random split and the shuffled batch order are
# the same on every run of this cell. The model weights are created in an
# earlier cell and are not re-initialised here.
torch.manual_seed(SEED)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Each scheduler.step() multiplies the learning rate by gamma
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
total_accu = None
train_iter, test_iter = trainset, testset
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Hold out 5% of the training data for validation
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
# Accuracy does not depend on batch order, so the evaluation loaders are not shuffled
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Lower the learning rate when validation accuracy dropped below the
    # reference value; otherwise the current accuracy becomes the new reference
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time:  0.06s | valid accuracy    0.800 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.06s | valid accuracy    0.800 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  0.06s | valid accuracy    0.800 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  0.04s | valid accuracy    0.800 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  0.05s | valid accuracy    1.000 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  0.05s |

In [23]:
# Score the trained model on the held-out test loader
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
test_report = 'test accuracy {:8.3f}'.format(accu_test)
print(test_report)

Checking the results of test dataset.
test accuracy    0.560


In [21]:
# Human-readable wording for each predicted class index
label = {
    1: "a description or similar.",
    0: "something else.",
}

def predict(text, text_pipeline):
    """Return the class index the notebook-level `model` predicts for `text`.

    text:          raw string to classify.
    text_pipeline: callable turning a string into a list of vocabulary indices.

    The whole text is passed as a single bag (offsets = [0]). Tensors are
    created on the CPU, so the model must live on the CPU as well.
    """
    with torch.no_grad():
        # Explicit int64, matching collate_batch: without a dtype an empty token
        # list would become a floating-point tensor, which is not a valid index
        # tensor for the embedding lookup.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64)
        output = model(token_ids, torch.tensor([0]))
        return output.argmax(1).item()

# Example paragraph to classify. A backslash at the end of a line joins it to
# the next one without inserting anything, so every continued line ends with a
# space; otherwise words fuse together (e.g. "andtheir") and are tokenized as
# one unknown word.
ex_text_str = """
One of the largest of living carnivores, grizzly bears are 1 to 2.8 meters in length from head to rump and \
their tails are 65 to 210 mm long. They are 90 to 150 cm tall at the shoulder and can tower at an intimidating \
height of 8 feet when standing upright on their hind legs. They range in weight from 80 to more than 600 kg. \
On average, adult males are 8 to 10% larger than females. Ursus arctos is largest along the the coast of southern \
Alaska and on nearby islands where males average 389 kg and females average 207 kg, though some males have been weighed \
at as much as 780 kg. Distance between the canines is from 6 to 8 cm. Size rapidly declines to the north and east, with \
individuals in southwestern Yukon weighing only 140 kg on average. Fur is usually dark brown, but varies from cream to almost black. \
Individuals in the Rocky Mountains have long hairs along the shoulders and back which are frosted with white, giving a grizzled appearance, \
hence the common name grizzly bear in that region. Brown bears are extremely strong and have good endurance; they \
can kill a cow with one blow, outrun a horse, outswim an Olympian, and drag a dead elk uphill. (Wilson and Ruff, 1999)"""
# predict() builds its input tensors on the CPU, so the model has to be there too
model = model.to("cpu")

print("This is %s" %label[predict(ex_text_str, text_pipeline)])

This is something else.


In [None]:
# Collected (label, sentence) pairs: label 1 for sentences taken from a
# description-like chapter, 0 for sentences from any later chapter
ExtraTesting = []

# Matches bracketed reference markers such as "[12]". Raw string, so the
# backslashes reach the regex engine instead of being read as string escapes.
ReferenceRemover = r'\[\d*\]'

# Heading ids that count as a description chapter
DescriptionChapters = {'Characteristics', 'Description', 'Appearance'}

URL = 'https://en.wikipedia.org/wiki/Eurasian_wigeon'
# Time out instead of hanging forever, and fail loudly on an HTTP error page
# rather than silently parsing it
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

for Tags in soup.find_all('h2'):

    # Skip headings without a <span> child
    if Tags.span is None:
        continue

    # Set chapter variable (None when the span carries no id attribute)
    Chapter = Tags.span.attrs.get('id')

    # Only continue for a description chapter (or similar)
    if Chapter not in DescriptionChapters:
        continue

    # Walk over every paragraph that follows this heading as a sibling
    for Text in Tags.find_next_siblings('p'):

        # Text of the closest preceding <h2>: tells which chapter this paragraph belongs to
        Heading = Text.find_previous_siblings('h2')[0].text.strip()
        # 1 while still inside the description chapter, 0 once a later chapter started
        Label = 1 if Chapter in Heading else 0

        # Remove reference markers
        Paragraph = re.sub(ReferenceRemover, '', Text.text)
        # Split into sentences
        SentenceList = Paragraph.split('. ')
        # Keep only sentences with actual content; whitespace-only fragments
        # produce no tokens and cannot be classified
        ExtraTesting += [(Label, Sentence) for Sentence in SentenceList
                         if Sentence.strip()]

In [None]:
# `label` and `predict` are defined in the earlier single-example prediction
# cell and are reused here rather than being defined a second time.

# predict() builds its input tensors on the CPU, so move the model there once
# before the loop instead of on every iteration
model = model.to("cpu")

# Print the predicted class, the scraped label and the sentence itself for
# every (label, sentence) pair collected from the web page
for tests in ExtraTesting:
    ex_text_str = tests[1]

    print("This is %s" %label[predict(ex_text_str, text_pipeline)])
    print("Real value was {0}".format(tests[0]))
    print(tests[1])
    print('\n')