In [14]:
# SyferText imports
import syfertext
from syfertext.pipeline import SimpleTagger

# Import useful utility functions for this tutorial
from utils import download_dataset

# PySyft and PyTorch import
import syft as sy
from syft.generic.string import String
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.optim as optim

import os
import codecs
import re

Download the dataset

In [7]:
# The URL template to all dataset files
# NOTE(review): these are github.com `blob` URLs, which normally serve an HTML
# page rather than the raw file; confirm that `download_dataset` ends up with
# the raw CoNLL text.
url_template = 'https://github.com/Nilanshrajput/ner_dataset/blob/master/conll2003/%s'

# Names of the files to download using the URL template above
files = ['eng.train','eng.testa','eng.testb']

# Construct the list of urls
urls = [url_template % file for file in files]


# The dataset name and its root folder
dataset_name = 'conll2003'
root_path = './conll2003'

# Create the dataset folder if it is not already there.
# Uses `root_path` (instead of repeating the literal) so that changing the
# root folder above is enough to relocate the dataset.
os.makedirs(root_path, exist_ok = True)

# Start downloading
download_dataset(dataset_name = dataset_name, 
                 urls = urls, 
                 root_path = root_path
                )


Preparing to download dataset: `conll2003` ...


conll2003: 4.86MB [00:00, 6.48MB/s]                            


In [8]:
# Hook PyTorch so that PySyft can extend it
hook = sy.TorchHook(torch)

# The local worker represents the deep learning company
me = hook.local_worker

# Remote virtual workers, created in this order:
#   bob             -> owns the first dataset
#   alice           -> owns the second dataset
#   crypto_provider -> provides encryption primitives for SMPC
bob, alice, crypto_provider = (
    sy.VirtualWorker(hook, id = worker_id)
    for worker_id in ('bob', 'alice', 'crypto_provider')
)



Pre-process

In [9]:
def zero_digits(s):
    """
    Replace every digit in a string by a zero.

    Args:
        s: The input string.

    Returns:
        A copy of `s` in which every character matched by the regex `\\d`
        has been replaced by the character '0'.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\d', '0', s)

def load_sentences(path, zeros):
    """
    Load sentences from a file with one token per line.

    Every non-empty line must contain at least a word and its tag, separated
    by whitespace. Sentences are separated by empty lines. A sentence whose
    first field contains 'DOCSTART' is skipped.

    Args:
        path: Path to a UTF-8 encoded file.
        zeros: If truthy, every digit of a line is replaced by '0'
            (through `zero_digits`) before the line is split.

    Returns:
        A list of sentences. Each sentence is a list of lines, and each line
        is the list of its whitespace-separated fields.

    Raises:
        AssertionError: If a non-empty line has fewer than two fields.
    """
    sentences = []
    sentence = []

    # The context manager guarantees the file handle is closed, even when the
    # assertion below fails half-way through the file.
    with codecs.open(path, 'r', 'utf8') as handle:
        for line in handle:
            line = zero_digits(line.rstrip()) if zeros else line.rstrip()
            if not line:
                # An empty line closes the current sentence (if there is one).
                if len(sentence) > 0:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
            else:
                word = line.split()
                assert len(word) >= 2
                sentence.append(word)

    # The file may not end with an empty line: flush the last sentence.
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences

In [15]:
# Load the three splits (digits replaced by zeros), in the order
# train -> test (eng.testb) -> dev (eng.testa).
train_sentences, test_sentences, dev_sentences = (
    load_sentences('conll2003/' + file_name, zeros=True)
    for file_name in ('eng.train', 'eng.testb', 'eng.testa')
)

In [46]:
# Display the second training sentence: a list of per-token field lists
train_sentences[1]

[['Peter', 'NNP', 'I-NP', 'I-PER'], ['Blackburn', 'NNP', 'I-NP', 'I-PER']]

In [20]:
# Preview the first training sentence only: first as a list of words
# (the first field of every line), then as a single space-joined string.
for s in train_sentences[:1]:
    str_words = [w[0] for w in s]
    print(str_words)
    print(" ".join(str_words))

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
EU rejects German call to boycott British lamb .


In [50]:


def tag_mapping(sentences):
    """
    Create a mapping of tags (item to ID / ID to item).

    The tag of a token is the last field of its line. IDs are assigned in
    sorted tag order, so the mapping is identical from one run to the next
    (iterating directly over a set of strings is not stable across
    interpreter restarts).

    Args:
        sentences: A list of sentences, each a list of per-token field lists.

    Returns:
        A tuple `(tags, tag_to_id, id_to_tag)` where `tags` is the sorted list
        of unique tags, `tag_to_id` maps each tag to its integer ID and
        `id_to_tag` is the inverse mapping.
    """
    unique_tags = set()

    for s in sentences:
        unique_tags.update(word[-1] for word in s)

    # Sort before numbering to get a deterministic tag <-> ID assignment.
    tags = sorted(unique_tags)

    tag_to_id, id_to_tag = {}, {}
    for tag_id, tag in enumerate(tags):
        tag_to_id[tag] = tag_id
        id_to_tag[tag_id] = tag
    print("Found %i unique named entity tags" % len(tags))
    return tags, tag_to_id, id_to_tag

In [51]:
# Build the tag <-> ID mappings from the tags found in all three splits
tags, tag_to_id, id_to_tag = tag_mapping(train_sentences+test_sentences+dev_sentences)

Found 8 unique named entity tags


In [54]:
def lower_case(x,lower=False):
    """
    Return `x.lower()` when `lower` is truthy, otherwise return `x` unchanged.
    """
    return x.lower() if lower else x

def prepare_dataset(sentences, tag_to_id, lower=False):
    """
    Prepare the dataset.

    Args:
        sentences: A list of sentences, each a list of per-token field lists
            (word first, named entity tag last).
        tag_to_id: A dictionary mapping each tag to its integer ID.
        lower: If truthy, every word is lower-cased before the sentence
            string is built.

    Returns:
        A list with one dictionary per sentence, containing:
            'string': the words of the sentence joined by single spaces,
            'tags': the list of tag IDs, one per word.

    Raises:
        KeyError: If a tag is missing from `tag_to_id`.
    """
    data = []
    for s in sentences:
        # The first field of every line is the word itself
        str_words = [w[0].lower() if lower else w[0] for w in s]
        # The last field of every line is the named entity tag
        tags = [tag_to_id[w[-1]] for w in s]
        data.append({
            'string': " ".join(str_words),
            'tags': tags,
        })

    return data

# Convert every split into a list of {'string', 'tags'} dictionaries
train_data = prepare_dataset(train_sentences, tag_to_id, lower=True)
dev_data = prepare_dataset(dev_sentences, tag_to_id, lower=True)
test_data = prepare_dataset(test_sentences, tag_to_id, lower=True)

# Report the number of sentences per split
split_sizes = [len(split) for split in (train_data, dev_data, test_data)]
print("{} / {} / {} sentences in train / dev / test.".format(*split_sizes))

14041 / 3250 / 3453 sentences in train / dev / test.
