In [1]:
%cd /content/drive/MyDrive/Stat Software/Project

/content/drive/MyDrive/Stat Software/Project


## Imports and Directories

In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# command line
import plac
import argparse

# data conversion
import csv
import json
import logging
import sys
import pickle

# sklearn
from sklearn.model_selection import train_test_split

# spacy
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

# misc
import random
import warnings
from tqdm.auto import tqdm
from pprint import pprint

In [3]:
# define directories
# Everything is resolved relative to the current working directory, which the
# `%cd` magic in the first cell sets; running from another directory changes these paths.
FINAL_DATA_DIR = os.path.join(os.getcwd(), 'final-data')
# CSV inputs read by the "Read Data and Conversion" cells.
# NOTE(review): the "Train Model" cell later rebinds TRAIN_DATA to the unpickled
# training examples, so cells that use TRAIN_DATA as a path only work before that
# cell has run (or after re-running this cell).
TRAIN_DATA = os.path.join(FINAL_DATA_DIR, 'ner-spacy-train.csv')
TEST_DATA = os.path.join(FINAL_DATA_DIR, 'ner-spacy-test.csv')

## Read Data and Conversion

In [4]:
df_train = pd.read_csv(TRAIN_DATA, header=None)
df_test = pd.read_csv(TEST_DATA, header=None)

In [5]:
df_train.shape, df_test.shape

((875422, 2), (173153, 2))

In [6]:
df_train.iloc[:, 1].value_counts(normalize=True)

O        0.847069
B-geo    0.035926
B-tim    0.019307
B-org    0.019126
I-per    0.016485
B-per    0.016228
I-org    0.015883
B-gpe    0.014979
I-geo    0.007165
I-tim    0.006198
B-art    0.000374
B-eve    0.000305
I-art    0.000292
I-eve    0.000249
B-nat    0.000188
I-gpe    0.000180
I-nat    0.000047
Name: 1, dtype: float64

In [7]:
df_test.iloc[:, 1].value_counts(normalize=True)

O        0.845293
B-geo    0.035772
B-tim    0.019815
B-org    0.019636
I-org    0.016633
I-per    0.016286
B-per    0.016078
B-gpe    0.015922
I-geo    0.006595
I-tim    0.006364
B-art    0.000433
I-art    0.000237
B-eve    0.000237
I-gpe    0.000231
B-nat    0.000208
I-eve    0.000202
I-nat    0.000058
Name: 1, dtype: float64

In [None]:
# convert train csv to tsv for converting into required spaCy format
# Both files are opened in a `with` block so the TSV is flushed and closed before the
# next cell reads it; newline='' is what the csv module expects for files it reads/writes.
with open(TRAIN_DATA, newline='') as csv_in, \
        open(FINAL_DATA_DIR+'/ner-train.tsv', 'w', newline='') as tsv_out:
    csv.writer(tsv_out, delimiter='\t').writerows(csv.reader(csv_in))

In [8]:
# read train tsv
df_train = pd.read_csv(FINAL_DATA_DIR+'/ner-train.tsv', sep='\t', encoding='utf-8', header=None)

In [9]:
df_train.head()

Unnamed: 0,0,1
0,Thousands,O
1,of,O
2,demonstrators,O
3,have,O
4,marched,O


In [None]:
# Convert .tsv file to dataturks json format. 

def _group_points_by_label(label_dict):
    """Turn {label: [point, ...]} into the list stored under a record's 'annotation' key.

    Within each label, points that share the same text are merged into a single
    annotation, ordered by first occurrence. Points whose text is empty are skipped.
    """
    annotations = []
    for label, points in label_dict.items():
        points_by_text = {}
        for point in points:
            text = point['text']
            if text == '':
                continue
            if text not in points_by_text:
                points_by_text[text] = [point]
            else:
                # Repeated occurrences keep the start/end/text key order that earlier
                # versions of this converter emitted, so regenerated files diff cleanly.
                points_by_text[text].append(
                    {'start': point['start'], 'end': point['end'], 'text': text})
        for grouped_points in points_by_text.values():
            annotations.append({'label': [label], 'points': grouped_points})
    return annotations


def _write_sentence(fp, content, label_dict):
    """Write one sentence and its grouped annotations to `fp` as a single JSON line."""
    record = {'content': content, 'annotation': _group_points_by_label(label_dict)}
    json.dump(record, fp)
    fp.write('\n')


def tsv_to_json_format(input_path,output_path,unknown_label):
    """Convert a `word<TAB>tag` file into one JSON record per sentence.

    Words are joined with single spaces (each word is followed by a space) to build
    the sentence text; a line equal to ``.<TAB>O`` ends the sentence and is not
    itself added to the text. Every word whose tag is neither `unknown_label` nor a
    single character (e.g. ``O``) is recorded with inclusive character offsets
    ``start``/``end`` into that text.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated input file (two columns: word, tag).
    output_path : str
        Path of the JSON-lines file to write. Each line has the keys
        ``content`` and ``annotation``.
    unknown_label : str
        Tag value to ignore when collecting annotations.

    Returns
    -------
    None
        Any exception (including a non-blank line that does not split into exactly
        two tab-separated fields) is logged via ``logging.exception`` and swallowed.

    Notes
    -----
    * Blank lines are skipped.
    * A final sentence that is not closed by ``.<TAB>O`` is still written.
    * NOTE(review): lines are split on raw tabs, so CSV-style quoting in the input
      (e.g. a lone ``"`` written by ``csv.writer`` as ``\"\"\"\"``) is passed through
      verbatim into the sentence text - confirm whether that is intended.
    """
    try:
        # Context managers guarantee both files are flushed and closed, also on error.
        with open(input_path, 'r') as f, open(output_path, 'w') as fp:
            content = ''
            start = 0
            label_dict = {}
            for raw_line in f:
                # Strip only the newline; the last line of a file may not have one.
                line = raw_line.rstrip('\n')
                if line == '':
                    continue
                if line == '.\tO':
                    _write_sentence(fp, content, label_dict)
                    content = ''
                    start = 0
                    label_dict = {}
                    continue
                word, entity = line.split('\t')
                content += word + " "
                # len(entity) != 1 filters out single-character tags such as 'O'.
                if entity != unknown_label and len(entity) != 1:
                    label_dict.setdefault(entity, []).append({
                        'text': word,
                        'start': start,
                        'end': start + len(word) - 1,  # inclusive end offset
                    })
                start += len(word) + 1
            # Do not silently drop a trailing sentence that lacks a terminator.
            if content:
                _write_sentence(fp, content, label_dict)
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

In [None]:
tsv_to_json_format(FINAL_DATA_DIR+'/ner-train.tsv', FINAL_DATA_DIR+'/ner-spacy-train.json','abc')

In [10]:
# check
with open(FINAL_DATA_DIR+'/ner-spacy-train.json') as f:
  data = f.read()

In [11]:
pprint(data[:500], width=120, indent=3)

('{"content": "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the '
 'withdrawal of British troops from that country ", "annotation": [{"label": ["B-geo"], "points": [{"text": "London", '
 '"start": 48, "end": 53}]}, {"label": ["B-geo"], "points": [{"text": "Iraq", "start": 77, "end": 80}]}, {"label": '
 '["B-gpe"], "points": [{"text": "British", "start": 111, "end": 117}]}]}\n'
 '{"content": "Families of soldiers killed in the conflict joined the protesters who carrie')


In [None]:
# Convert json file to spaCy format.


#@plac.annotations(input_file=("Input file", "option", "i", str), output_file=("Output file", "option", "o", str))

def main(input_file=None, output_file=None):
    """Convert a JSON-lines annotation file into pickled spaCy-style training tuples.

    Each input line is a JSON object with ``content`` (the sentence text) and
    ``annotation`` (a list of ``{"label": ..., "points": [...]}`` objects whose
    points carry inclusive ``start``/``end`` offsets). The output is a pickled list
    of ``(text, {"entities": [(start, end_exclusive, label), ...]})`` tuples.

    Parameters
    ----------
    input_file : str
        Path of the JSON-lines file to read.
    output_file : str
        Path of the pickle file to write.

    Returns
    -------
    None
        Any exception is logged via ``logging.exception`` and swallowed.

    Notes
    -----
    * Every point of an annotation is converted, so a word that occurs several
      times in a sentence with the same label yields one entity per occurrence.
    * Records with empty ``content`` and blank lines are skipped.
    * NOTE(review): the "Train Model" cell defines another function called ``main``;
      whichever cell ran last wins in the kernel namespace.
    """
    try:
        training_data = []
        with open(input_file, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                if text == "":
                    continue

                entities = []
                for annotation in data['annotation']:
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]
                    for point in annotation['points']:
                        for label in labels:
                            # Stored 'end' is inclusive; spaCy expects an exclusive end.
                            entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)

    except Exception as e:
        # %-style arguments keep the handler from raising when input_file is None.
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None

# Script-style entry point kept from the original converter: reads the JSON-lines
# file written by tsv_to_json_format and pickles the spaCy-style training tuples
# next to it in FINAL_DATA_DIR.
if __name__ == '__main__':
    main(input_file=FINAL_DATA_DIR+'/ner-spacy-train.json', output_file=FINAL_DATA_DIR+'/ner-train-spacy.pkl')

In [12]:
# check
with open(FINAL_DATA_DIR+'/ner-train-spacy.pkl', 'rb') as f:
  data = pickle.load(f)

In [13]:
pprint(data[:2], indent=4)

[   (   'Thousands of demonstrators have marched through London to protest the '
        'war in Iraq and demand the withdrawal of British troops from that '
        'country ',
        {   'entities': [   (48, 54, 'B-geo'),
                            (77, 81, 'B-geo'),
                            (111, 118, 'B-gpe')]}),
    (   'Families of soldiers killed in the conflict joined the protesters who '
        'carried banners with such slogans as """" Bush Number One Terrorist '
        '"""" and """" Stop the Bombings ',
        {'entities': [(112, 116, 'B-per')]})]


In [14]:
data[-2:]

[('The blast caused a massive fire that firefighters were still trying to bring under control more than 12 hours after the explosion ',
  {'entities': []}),
 ('The refinery is owned by Imperial Sugar Company ',
  {'entities': [(25, 33, 'B-org'), (34, 39, 'I-org'), (40, 47, 'I-org')]})]

## Train Model

In [None]:
# Tags registered with the NER component: B-/I- prefixed entity types plus 'O'.
LABEL = [
    'I-geo', 'B-geo', 'I-art', 'B-art', 'B-tim', 'B-nat', 'B-eve', 'O', 'I-per',
    'I-tim', 'I-nat', 'I-eve', 'B-per', 'I-org', 'B-gpe', 'B-org', 'I-gpe',
]

# Legend for the tag suffixes:
#   geo = Geographical Entity
#   org = Organization
#   per = Person
#   gpe = Geopolitical Entity
#   tim = Time indicator
#   art = Artifact
#   eve = Event
#   nat = Natural Phenomenon

# Loading training data
# NOTE(review): this rebinds TRAIN_DATA, which the directories cell defines as the
# path of the training CSV; after this cell it holds the list of training examples.
# pickle.load executes arbitrary code from the file - only load pickles you created.
with open(FINAL_DATA_DIR+'/ner-train-spacy.pkl', mode='rb') as fp:
    TRAIN_DATA = pickle.load(fp)

def main(model=None, new_model_name='spacy_ner_11212020', output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, train it, and optionally save it.

    Parameters
    ----------
    model : str or None
        Passed to ``spacy.load`` to continue from an existing pipeline; when None a
        blank 'en' pipeline is created.
    new_model_name : str
        Written to ``nlp.meta['name']`` before saving.
    output_dir : str, Path or None
        Directory the trained pipeline is saved to (created if missing, including
        parents). Nothing is saved when None.
    n_iter : int
        Number of passes over the training examples.

    Notes
    -----
    * Reads the module-level ``LABEL`` and ``TRAIN_DATA``. The examples are copied
      before shuffling, so the module-level list keeps its order across runs.
    * No random seed is set here, so the shuffling differs between runs.
    * NOTE(review): ``create_pipe`` / ``begin_training`` / ``nlp.update(texts,
      annotations, ...)`` look like the spaCy 2.x training API - confirm the
      installed spaCy version before re-running.
    * NOTE(review): the "Read Data and Conversion" section defines another function
      called ``main``; whichever cell ran last wins in the kernel namespace.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)   # Add new entity labels to entity recognizer

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Shuffle a private copy so repeated calls do not reorder the shared TRAIN_DATA list.
    train_examples = list(TRAIN_DATA)

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in tqdm(range(n_iter)):
            random.shuffle(train_examples)
            losses = {}

            # Batch size grows from 4 towards 32 by a factor of 1.001 per batch.
            batches = minibatch(train_examples, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Test the trained model
    test_text = 'Gianni Infantino is the president of FIFA.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: works for nested targets and for an already existing directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


# Train from a blank 'en' pipeline with the default number of iterations and save
# the result directly into FINAL_DATA_DIR - the same directory that holds the data
# files, and the location the Inference section loads the model from.
if __name__ == '__main__':
    main(output_dir=FINAL_DATA_DIR)

Created blank 'en' model


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Losses {'ner': 72298.94619296206}
Losses {'ner': 56868.940236335344}
Losses {'ner': 52432.60245189024}
Losses {'ner': 49835.23851474888}
Losses {'ner': 48124.861916189926}
Losses {'ner': 46250.34075868717}
Losses {'ner': 44905.51279853745}
Losses {'ner': 43838.40086841049}
Losses {'ner': 42750.57473929}
Losses {'ner': 41965.642965581705}

Entities in 'Gianni Infantino is the president of FIFA.'
B-per Gianni
I-per Infantino
B-org FIFA
Saved model to /content/drive/MyDrive/Stat Software/Project/final-data
Loading from /content/drive/MyDrive/Stat Software/Project/final-data
B-per Gianni
I-per Infantino
B-org FIFA


## Inference

In [15]:
TEST_TEXT = ['The lands that today comprise Croatia were part of the Austro-Hungarian Empire until the close of World War I .',
             "Much of New Orleans sits below sea level , and the levees ' failure during Hurricane Katrina put 80 percent of the city underwater .",
             "A separate report says the number of people who lost jobs because of Hurricanes Katrina, Rita and Wilma now exceeds $ 6,00,000 .",
             "Google is in New York, London, Paris and Tokyo .",
             "Our homework is due on Wednesday, February 1st .",
             "Donald Trump is the president of United States .",
             "I love Indian food ."]

In [16]:
nlp = spacy.load(FINAL_DATA_DIR)

In [22]:
for t in TEST_TEXT:
  doc = nlp(t)
  displacy.render(doc, style='ent', jupyter=True)
  print("\n")





























In [34]:
1+1

2