In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import torch
import pickle
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchtext.data.functional import to_map_style_dataset
from torch import nn
from itertools import chain
from collections import Counter

In [2]:
# Load the two pickled corpora. The context managers guarantee the file
# handles are closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code while loading;
# only use it on files from a trusted source.
with open("data_withMeasurements.pkl", "rb") as pickle_in:
    dataWIKI = pickle.load(pickle_in)

with open("dataBOW_withMeasurements.pkl", "rb") as pickle_in:
    dataBOW = pickle.load(pickle_in)

In [20]:
# Select the corpus used for the rest of the notebook.
# Alternatives tried (kept commented out): the union of both corpora,
# or the WIKI corpus on its own.
#data = dataWIKI | dataBOW
#data = dataWIKI
data = dataBOW

In [21]:
# Flatten every per-key list of samples into one list
TotalValues = [sample for group in data.values() for sample in group]

# Tally the samples by label (the label is the first element of each sample)
ones = Counter(sample[0] for sample in TotalValues if sample[0] == 1)
zeros = Counter(sample[0] for sample in TotalValues if sample[0] == 0)

print('{0} values.'.format(len(TotalValues)))
print('{0} labels with 1 (true).'.format(ones[1]))
print('{0} labels with 0 (false).'.format(zeros[0]))

6890 values.
2511 labels with 1 (true).
4379 labels with 0 (false).


In [22]:
# NOTE(review): dead code. This earlier version of the label tally is kept as a
# bare string literal, so running the cell only echoes the string back.
'''
# Species
len(data)
# Total values
Total = sum(len(v) for v in list(data.values()))
print(Total)
# Total Truths
TotalValues = [v for v in list(data.values())]
Truths = [[ones[0] for ones in Values if ones[0] == 1] for Values in TotalValues]
AmountofTruths = sum(len(x) for x in Truths)
# Total False
AmountofFalse = Total - AmountofTruths
print(AmountofTruths)
print(AmountofFalse)
'''

'\n# Species\nlen(data)\n# Total values\nTotal = sum(len(v) for v in list(data.values()))\nprint(Total)\n# Total Truths\nTotalValues = [v for v in list(data.values())]\nTruths = [[ones[0] for ones in Values if ones[0] == 1] for Values in TotalValues]\nAmountofTruths = sum(len(x) for x in Truths)\n# Total False\nAmountofFalse = Total - AmountofTruths\nprint(AmountofTruths)\nprint(AmountofFalse)\n'

In [23]:
# Flatten the mapping into one list of samples.
# `data` is rebound from a mapping to a list here, so the flattening is guarded:
# re-running this cell would otherwise fail on `list.values()`.
if hasattr(data, "values"):
    data = list(chain.from_iterable(data.values()))

# Sizes of the 80/20 train/test split
# NOTE(review): the name `train` is rebound later by `def train(dataloader)`;
# slicing with it after that definition has run will raise a TypeError.
train = int(len(data) * 0.8)
test = len(data) - train

In [24]:
# Split the dataset: the first `train` samples are for training, the rest for testing.
# NOTE(review): the samples are sliced in their stored order, without shuffling.
trainset, testset = data[:train], data[train:]

In [25]:
# Size of the training split
len(trainset)

5512

In [26]:
# torchtext's built-in 'basic_english' tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize the dataset
def yield_tokens(data_iter):
    """Lazily yield one token list per ``(label, text)`` pair.

    The label of each pair is ignored; only the text is passed to the
    notebook-level ``tokenizer``.
    """
    texts = (text for _label, text in data_iter)
    yield from map(tokenizer, texts)

# Build the vocabulary from the training texts only; "<unk>" is the single special token
vocab = build_vocab_from_iterator(yield_tokens(trainset), specials=["<unk>"])
# Tokens that are not in the vocabulary resolve to the "<unk>" index
vocab.set_default_index(vocab["<unk>"])

In [27]:
# Pipelines that turn a raw (label, text) sample into model inputs
def text_pipeline(x):
    """Tokenize the string ``x`` and return its list of vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Cast a raw label to ``int`` (labels are used as-is, with no offset)."""
    return int(x)

In [28]:
# Smoke test: run one sentence through the text pipeline and show the index tensor
sample_ids = text_pipeline('the a bird birds, the bear are not equal.')
proc = torch.tensor(sample_ids, dtype=torch.int64)
print(proc)

tensor([  10,   13,  345,   40,    1,   10,  705,   25,   55, 4512,    2])


In [29]:
# Use CUDA when it is available, otherwise fall back to the CPU (e.g. on Macs)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate ``(label, text)`` samples into the tensors the model consumes.

    Every text is converted to vocabulary indices with ``text_pipeline`` and
    all index sequences are concatenated into one flat tensor. ``offsets``
    holds the position in that flat tensor at which each sample starts.

    Returns
    -------
    tuple of (labels, flat_text, offsets), all int64 tensors moved to ``device``.
    """
    labels, token_tensors, lengths = [], [], []

    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        # Signed 64-bit indices, one tensor per sample
        ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_tensors.append(ids)
        lengths.append(ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Start positions are the running total of the preceding lengths
    # (the first sample starts at 0, the last length is not needed)
    start_positions = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    flat_text = torch.cat(token_tensors)

    return (
        label_tensor.to(device),
        flat_text.to(device),
        start_positions.to(device),
    )

In [30]:
# Demo loader over the training samples: shuffled batches of 16, collated by collate_batch
train_iter = trainset
dataloader = DataLoader(train_iter, batch_size=16, shuffle=True, collate_fn=collate_batch)

In [31]:
# Peek at one collated batch: (labels, concatenated token ids, start offsets)
next(iter(dataloader))

(tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]),
 tensor([551,  91,  71,  ...,  27, 113,   2]),
 tensor([   0,   12,   18,  114,  287,  384,  973, 1007, 1610, 1711, 1976, 2036,
         2039, 2059, 2086, 2111]))

In [51]:
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` pools the token embeddings of each text into one
    vector, and a single linear layer maps that vector to class scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        initrange = 0.5
        with torch.no_grad():
            self.embedding.weight.uniform_(-initrange, initrange)
            self.fc.weight.uniform_(-initrange, initrange)
            self.fc.bias.zero_()

    def forward(self, text, offsets):
        """Return one row of class scores per bag described by ``offsets``."""
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

In [52]:
train_iter = trainset
# One output unit per distinct label found in the training samples
num_class = len({label for label, _text in train_iter})
vocab_size = len(vocab)
# Embedding dimension
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [53]:
import time

def train(dataloader, log_interval=500):
    """Run one training pass over ``dataloader``.

    Uses the notebook-level globals ``model``, ``optimizer`` and
    ``criterion``. The progress line additionally reads the global ``epoch``,
    so that name must exist before the first progress line is printed.

    Parameters
    ----------
    dataloader : sized iterable of ``(label, text, offsets)`` batches.
    log_interval : int, default 500
        Print the running accuracy every ``log_interval`` batches (batch 0 is
        never logged); the running counters are reset after each print.
    """
    model.train()
    total_acc, total_count = 0, 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the total gradient norm to 0.1 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the global ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Only the
    predictions are needed, so no loss is computed.

    Parameters
    ----------
    dataloader : iterable of ``(label, text, offsets)`` batches.

    Returns
    -------
    float : correctly classified samples divided by the total number of samples.

    Raises
    ------
    ZeroDivisionError : if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [54]:
# Hyperparameters
EPOCHS = 10 # number of passes over the training split
LR = 5  # initial learning rate for SGD
BATCH_SIZE = 8 # samples per batch for the train/valid/test loaders

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# StepLR with step_size 1.0: each scheduler.step() scales the learning rate by gamma
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
# Reference validation accuracy for the epoch loop; None until the first epoch is evaluated
total_accu = None
train_iter, test_iter = trainset, testset
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Hold out 5% of the training samples for validation
# NOTE(review): no seed or generator is set in the visible cells, so this split
# presumably differs between runs — confirm if reproducibility matters.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# All three loaders share the batch size and the collate function
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

# Train for EPOCHS epochs. `epoch` is also read as a global by train() for its progress line.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Lower the learning rate when validation accuracy fell below the best value
    # seen so far; otherwise record the new best value.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/  655 batches | accuracy    0.871
-----------------------------------------------------------
| end of epoch   1 | time:  1.22s | valid accuracy    0.917 
-----------------------------------------------------------
| epoch   2 |   500/  655 batches | accuracy    0.935
-----------------------------------------------------------
| end of epoch   2 | time:  1.14s | valid accuracy    0.888 
-----------------------------------------------------------
| epoch   3 |   500/  655 batches | accuracy    0.951
-----------------------------------------------------------
| end of epoch   3 | time:  1.14s | valid accuracy    0.935 
-----------------------------------------------------------
| epoch   4 |   500/  655 batches | accuracy    0.953
-----------------------------------------------------------
| end of epoch   4 | time:  1.13s | valid accuracy    0.942 
-----------------------------------------------------------
| epoch   5 |   500/  655 batches | accuracy    0.956
------

In [55]:
# Final check: accuracy on the test split, which was not used during training
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.987


In [19]:
# Human-readable text for each predicted class index
label = {1: "a description or similar.",
         0: "something else."}

def predict(text, text_pipeline):
    """Classify a single piece of text with the global ``model``.

    Parameters
    ----------
    text : str
        The sentence or paragraph to classify.
    text_pipeline : callable
        Maps a string to a list of vocabulary indices.

    Returns
    -------
    int : index of the highest-scoring class.
    """
    with torch.no_grad():
        # Force an integer index tensor: for text that tokenizes to nothing,
        # torch.tensor([]) would otherwise default to float32.
        text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        # The whole text is one bag that starts at position 0
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()

# Hand-written sanity checks, one text per category
ExampleTexts = [
    # Measurements
    'They are 90 to 150 cm tall at the shoulder and can tower at an intimidating height of 8 feet when standing upright on their hind legs.',
    # Random
    'Hi I am GIS student, this is a random sentence!',
    # Bird stuff
    'The bill is long and orange.',
    # Something about bears
    '''Brown bears are often not fully brown. 
                They have long, thick fur, with a moderately long mane at the back of the neck which varies somewhat across the types. 
                In India, brown bears can be reddish with silver-tipped hairs, while in China brown bears are bicolored, 
                with a yellowish-brown or whitish collar across the neck, chest and shoulders.''',
    # Something about Robins
    '''The upperparts are brownish, or olive-tinged in British birds, and the belly whitish, while the legs and feet are brown. 
                The bill and eyes are black.''',
    # Random difficult sentence
    '''While I am very tan from the sun, I can be pale within a few days.
                I have blonde hair''',
]

# predict() builds its tensors on the CPU, so the model is moved there once
model = model.to("cpu")

for ex_text_str in ExampleTexts:
    print("This is %s" %label[predict(ex_text_str, text_pipeline)])

This is a description or similar.
This is something else.
This is a description or similar.
This is a description or similar.
This is a description or similar.
This is a description or similar.


In [None]:
# Labelled sentences scraped from one Wikipedia article:
# (1, sentence) for paragraphs under a description-like section,
# (0, sentence) for paragraphs under any later section.
ExtraTesting = []

# Matches inline reference markers such as "[12]".
# Raw string: "\[" and "\d" are regex escapes, not Python string escapes.
ReferenceRemover = r'\[\d*\]'

# Section ids that count as "description (or similar)"
DescriptionChapters = ('Characteristics', 'Description', 'Appearance')

URL = 'https://en.wikipedia.org/wiki/Eurasian_wigeon'
# Fail fast on a hanging connection or an HTTP error instead of parsing an error page
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# NOTE(review): assumes every section <h2> wraps its title in a <span id="...">;
# confirm against the current page markup.
for Tags in soup.find_all('h2'):

    # Skip headings without a <span> child
    if Tags.span is None:
        continue

    # Section id; None when the span carries no id attribute
    Chapter = Tags.span.attrs.get('id')

    # Only description-like sections are used as a starting point
    if Chapter not in DescriptionChapters:
        continue

    # Walk over every paragraph that follows this heading
    for Text in Tags.find_next_siblings('p'):
        # Title of the section this paragraph belongs to (nearest preceding <h2>)
        Heading = Text.find_previous_siblings('h2')[0].text.strip()
        # 1 while still inside the description section, 0 for later sections
        Label = 1 if Chapter in Heading else 0

        # Remove reference markers, then split naively into sentences on '. '
        Paragraph = re.sub(ReferenceRemover, '', Text.text)
        SentenceList = Paragraph.split('. ')
        ExtraTesting += [(Label, Sentence) for Sentence in SentenceList]

In [None]:
# Human-readable text for each predicted class index
# NOTE(review): identical to the `label` dict defined in an earlier cell.
label = {1: "a description or similar.",
         0: "something else."}

def predict(text, text_pipeline):
    """Classify a single piece of text with the global ``model``.

    Parameters
    ----------
    text : str
        The sentence or paragraph to classify.
    text_pipeline : callable
        Maps a string to a list of vocabulary indices.

    Returns
    -------
    int : index of the highest-scoring class.
    """
    with torch.no_grad():
        # Force an integer index tensor: scraped sentences can be empty, and
        # torch.tensor([]) would otherwise default to float32.
        text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        # The whole text is one bag that starts at position 0
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()

# predict() builds its tensors on the CPU; the move does not depend on the
# loop variable, so it is done once up front.
model = model.to("cpu")

# Print the prediction, the true label and the sentence for every scraped sample
for tests in ExtraTesting:
    true_label, ex_text_str = tests

    print("This is %s" %label[predict(ex_text_str, text_pipeline)])
    print("Real value was {0}".format(true_label))
    print(ex_text_str)
    print('\n')