In [None]:

from __future__ import unicode_literals, print_function
import numpy as np
import pandas as pd

import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

from sklearn.utils import shuffle
import random
from pathlib import Path
from collections import Counter

from matplotlib import pyplot as plt
%matplotlib inline

import pprint
pp = pprint.PrettyPrinter(indent=4)

import en_core_web_sm as en


In [None]:
# Mount Google Drive so that files below /content/drive/ (the CSV that is read
# and the model directory that is written later on) become accessible.
from google.colab import drive

drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
# Location of the address export on the mounted Drive.
ADDRESSES_CSV = '/content/drive/MyDrive/NLP_NER_Project/data/adresser.csv'

# Fixed seed so that the row order -- and therefore the train/test slices taken
# from it further down -- is the same on every "Restart & Run All".
SHUFFLE_SEED = 42

# Read the CSV, normalise empty strings to NaN and shuffle the rows.
# Built as one expression (no inplace mutation) so re-running the cell is idempotent.
addresses_df = shuffle(
    pd.read_csv(ADDRESSES_CSV).replace('', np.nan),
    random_state=SHUFFLE_SEED,
)

addresses_df.head(n=10)


Unnamed: 0,road code,Street Name,addressing road name,house number,Additional city name,postnr,Postal Code Name,storm recipient zip code,stormodtagerpostnrnavn,municipal code,municipality name,owner name,region code,region name,jordstykke_ejerlavnavn (City Name),Municipality
23047,1250,Nyvangsvej,Nyvangsvej,20,,4400,Kalundborg,,,326,Kalundborg,Kalundborg Markjorder,1085,Region Sjælland,Kalundborg Markjorder,Kalundborg
31254,1694,Svalevej,Svalevej,6,,4270,Høng,,,326,Kalundborg,"Høng By, Finderup",1085,Region Sjælland,"Høng By, Finderup",Kalundborg
33350,1845,Tranevej,Tranevej,40,,4270,Høng,,,326,Kalundborg,"Høng By, Finderup",1085,Region Sjælland,"Høng By, Finderup",Kalundborg
18463,980,Landervejen,Landervejen,76,Reersø Str,4281,Gørlev,,,326,Kalundborg,"Reersø By, Kirke Helsinge",1085,Region Sjælland,"Reersø By, Kirke Helsinge",Kalundborg
13125,722,Højstedvej,Højstedvej,20,,4591,Føllenslev,,,326,Kalundborg,"Højsted By, Bregninge",1085,Region Sjælland,"Højsted By, Bregninge",Kalundborg
7992,427,Frederiksberg,Frederiksberg,1A,,4470,Svebølle,,,326,Kalundborg,"Avnsøgård Hgd., Avnsø",1085,Region Sjælland,"Avnsøgård Hgd., Avnsø",Kalundborg
6724,372,Falkensøvej,Falkensøvej,12A,Kirke Helsinge,4281,Gørlev,,,326,Kalundborg,"Kirke Helsinge By, Kirke Helsinge",1085,Region Sjælland,"Kirke Helsinge By, Kirke Helsinge",Kalundborg
14097,787,Josefinevej,Josefinevej,6,Kaldred,4593,Eskebjerg,,,326,Kalundborg,"Torpe By, Bregninge",1085,Region Sjælland,"Torpe By, Bregninge",Kalundborg
20142,1064,Lyngstræde,Lyngstræde,4,Havnsø,4591,Føllenslev,,,326,Kalundborg,"Havnsø By, Føllenslev",1085,Region Sjælland,"Havnsø By, Føllenslev",Kalundborg
26679,1462,Sandlodsvej,Sandlodsvej,43,,4400,Kalundborg,,,326,Kalundborg,"Nostrup By, Raklev",1085,Region Sjælland,"Nostrup By, Raklev",Kalundborg


In [None]:
# (rows, columns) of the loaded address table; shown via the cell's rich output.
addresses_df.shape

(37288, 16)


In [None]:
# Module-level list that prepare_training_data() appends its
# (sentence, annotations) tuples to by default.
training_data = []

# Label of the new entity type used for street addresses.
label = 'STREET_NAME'

# Sentence templates. "{0}" marks the position where an address is inserted;
# the two templates without a placeholder produce sentences with no entity.
sentences_templates = [
    "I live in {0}",
    "Friend of mine lives in {0}",
    "I know the address of that street, it is {0}",
    "Mr. Absalon Adam lived before in {0}",
    "I like the resturants in {0}",
    "Check the map to find the directions to {0}",
    "My friend Aksel will meet me in {0}",
    "Me and Adrian has a meeting in {0}",
    "Taxi driver can take you to {0}",
    "Stay away of {0}",
    "His address is {0}",
    "My cousine lives in {0}",
    "I like shops in {0}",
    "Let us drive to {0}",
    "Do you like this street?",
    "I walk everyday in that place",
    "{0}, this is my current address",
    "{0} is awesome place",
]

# Build the NER training set from the street addresses collected for
# Kalundborg (Denmark).
def prepare_training_data(addresses=None, templates=None, entity_label=None, out=None):
    """Turn address rows into spaCy-style ``(text, {'entities': [...]})`` tuples.

    For every row one sentence template is picked at random
    (``np.random.randint``). If the template contains the ``{0}`` placeholder,
    the address ``"<Street Name> <road code>[, <Additional city name>][, <Municipality>]"``
    is substituted and annotated with a single ``(start, end, entity_label)``
    character span; otherwise the template is emitted unchanged with no entities.

    Parameters
    ----------
    addresses : pandas.DataFrame, optional
        Must provide the columns 'Street Name', 'road code',
        'Additional city name' and 'Municipality'.
        Defaults to the notebook-level ``addresses_df``.
    templates : sequence of str, optional
        Defaults to the notebook-level ``sentences_templates``.
    entity_label : str, optional
        Defaults to the notebook-level ``label``.
    out : list, optional
        List the generated tuples are appended to. Defaults to the
        notebook-level ``training_data`` list, which keeps a bare
        ``prepare_training_data()`` call behaving as before.

    Returns
    -------
    list
        ``out`` (the same list object that was appended to).

    Raises
    ------
    ValueError
        If ``templates`` is empty.
    """
    # Notebook-level defaults are resolved at call time (not at definition
    # time) so re-running the cells that define them is always picked up.
    addresses = addresses_df if addresses is None else addresses
    templates = sentences_templates if templates is None else templates
    entity_label = label if entity_label is None else entity_label
    out = training_data if out is None else out

    if len(templates) == 0:
        raise ValueError("templates must contain at least one sentence")

    def _has_value(value):
        # NaN never compares equal to anything (``x != np.nan`` is always
        # True), so missing values have to be detected with pd.isna().
        return not pd.isna(value) and str(value) != ''

    placeholder = '{0}'
    columns = ['Street Name', 'road code', 'Additional city name', 'Municipality']

    # itertuples over just the needed columns avoids building a Series per row.
    for street, road_code, extra_city, municipality in addresses[columns].itertuples(index=False, name=None):
        if not _has_value(street):
            # Without a street name no address can be built; skip the row
            # instead of failing on float + str concatenation.
            continue

        template = templates[np.random.randint(0, len(templates))]

        if placeholder not in template:
            out.append((template, {'entities': []}))
            continue

        # NOTE(review): the number placed after the street name comes from the
        # 'road code' column, not from 'house number' -- confirm this is intended.
        street_address = str(street)
        if _has_value(road_code):
            street_address += " " + str(road_code)
        if _has_value(extra_city):
            street_address += ", " + str(extra_city)
        if _has_value(municipality):
            street_address += ", " + str(municipality)

        # The entity span covers exactly the substituted address text.
        start_idx = template.find(placeholder)
        end_idx = start_idx + len(street_address)
        new_sentence = template.replace(placeholder, street_address)

        out.append((new_sentence, {
            'entities': [(start_idx, end_idx, entity_label)]
        }))

    return out

dataset = prepare_training_data()

# Optional: a few examples for the existing entity types could be mixed into
# the dataset. Kept disabled (as real comments rather than a bare string).
# dataset.insert(10, ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]}))
# dataset.insert(300, ('Who is Bill Gates?', {'entities': [(7, 17, 'PERSON')]}))
# dataset.insert(600, ('Steve Jobs was a genius but work holic.', {'entities': [(0, 10, 'PERSON')]}))
# dataset.insert(1200, ('I like London and Berlin.', {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}))
# dataset.insert(1800, ('I like Cairo and Turkey.', {'entities': [(7, 12, 'LOC'), (17, 23, 'LOC')]}))

# Only TRAIN_SIZE instances are used for training (just to expedite the
# training process in this demo scenario).
TRAIN_SIZE = 4000

# TEST_SIZE instances, taken from a region that does not overlap the training slice.
TEST_START = 10000
TEST_SIZE = 500

train_dataset = dataset[:TRAIN_SIZE]
test_dataset = dataset[TEST_START:TEST_START + TEST_SIZE]

# Slicing past the end of a short list silently yields an empty list, so check.
assert train_dataset and test_dataset, "dataset is too small for the configured train/test slices"

# View 10 sentences from the new training dataset
train_dataset[1000:1010]

[('I walk everyday in that place', {'entities': []}),
 ('Bredgade 175, Kalundborg, this is my current address',
  {'entities': [(0, 24, 'STREET_NAME')]}),
 ('Stay away of Klostermosen 2106, Kalundborg',
  {'entities': [(13, 42, 'STREET_NAME')]}),
 ('His address is Skovvænget 1531, Kalundborg',
  {'entities': [(15, 42, 'STREET_NAME')]}),
 ('My cousine lives in Rynkehaven 2122, Kalundborg',
  {'entities': [(20, 47, 'STREET_NAME')]}),
 ('His address is Strandgårdsvej 1660, Havnsø Str, Kalundborg',
  {'entities': [(15, 58, 'STREET_NAME')]}),
 ('Møllehøjvej 1185, Kærby, Kalundborg is awesome place',
  {'entities': [(0, 35, 'STREET_NAME')]}),
 ('Mr. Absalon Adam lived before in Nyvangsvej 1250, Kalundborg',
  {'entities': [(33, 60, 'STREET_NAME')]}),
 ('Let us drive to Skovager 1910, Rørby, Kalundborg',
  {'entities': [(16, 48, 'STREET_NAME')]}),
 ('Fredskovens Engvej 433, Bjerge Sydstrand, Kalundborg is awesome place',
  {'entities': [(0, 52, 'STREET_NAME')]})]

In [None]:
def train_NER_model(train_data, new_model_name, output_dir, n_iter=20):
    """Train a blank English spaCy pipeline with a single NER component and save it.

    Parameters
    ----------
    train_data : iterable of (str, dict)
        ``(text, annotations)`` pairs where ``annotations['entities']`` is a
        list of ``(start_char, end_char, label)`` tuples.
    new_model_name : str
        Stored as the pipeline's ``meta['name']``.
    output_dir : str or pathlib.Path
        Directory the trained pipeline is written to (created if missing).
    n_iter : int, default 20
        Number of passes over the training examples.

    Returns
    -------
    spacy.language.Language
        The trained pipeline (also saved to ``output_dir``).

    Raises
    ------
    ValueError
        If ``train_data`` is empty or contains no entity annotation at all.
    """
    train_data = list(train_data)
    if not train_data:
        raise ValueError("train_data is empty; nothing to train on")

    # Collect the entity labels from the annotations themselves instead of
    # relying on a notebook-level `labels` variable.
    entity_labels = sorted({
        span[2]
        for _, annotations in train_data
        for span in annotations.get("entities", [])
    })
    if not entity_labels:
        raise ValueError("train_data contains no entity annotations; no label to learn")

    # Imported here so the function does not depend on an earlier cell
    # having imported Example.
    from spacy.training import Example

    # Start from a blank English pipeline and give it a name.
    nlp = spacy.blank("en")
    nlp.meta["name"] = new_model_name

    # Add the NER component and register every label it has to predict.
    ner = nlp.add_pipe("ner")
    for entity_label in entity_labels:
        ner.add_label(entity_label)

    # Convert the (text, annotations) pairs into spaCy Example objects.
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in train_data
    ]

    # Train only the NER component.
    with nlp.select_pipes(enable=["ner"]):
        optimizer = nlp.initialize(lambda: examples)
        for itn in range(n_iter):
            losses = {}
            # Shuffle examples before each pass.
            random.shuffle(examples)
            for batch in spacy.util.minibatch(examples, size=32):
                nlp.update(batch, drop=0.5, losses=losses, sgd=optimizer)
            print("Iteration: {} Loss: {}".format(itn, losses))

    # Save the trained model; make sure missing parent directories exist first.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Model saved to:", output_dir)

    return nlp

# Train on the generated street-address sentences (train_dataset). A single
# toy sentence is not enough for the model to learn the STREET_NAME entity.
train_data = train_dataset
output_dir = '/content/drive/MyDrive/NLP_NER_Project/output'

train_NER_model(train_data, new_model_name='street_names_model', output_dir=output_dir, n_iter=20)

def load_trained_NER_model(output_dir):
    """Load the spaCy pipeline stored under ``output_dir`` and return it."""
    return spacy.load(output_dir)

# load the trained model
nlp_trained_model = load_trained_NER_model(output_dir=output_dir)


# test the trained model
def test_NER_model(nlp_model, document_test, show_entities=True, style_sentence=True):
    """Run ``nlp_model`` on one text and report the entities it finds.

    This is the single definition of the helper: a shorter duplicate with the
    same name used to precede it and was always overwritten by this one.

    Parameters
    ----------
    nlp_model : callable
        Loaded spaCy pipeline; called as ``nlp_model(document_test)``.
    document_test : str
        Text to analyse.
    show_entities : bool, default True
        Print one ``<label> <text>`` line per detected entity.
    style_sentence : bool, default True
        Render the sentence with displaCy's entity visualiser, highlighting
        STREET_NAME spans with a gradient colour.
    """
    document = nlp_model(document_test)

    if show_entities:
        for entity in document.ents:
            print(entity.label_, entity.text)

    if style_sentence:
        colors = {'STREET_NAME': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
        displacy.render(document, jupyter=True, style='ent', options={'colors': colors})

# choose random instances batch from the testing dataset to evaluate the trained NER model against unseen instances.
def test_samples_batch(model, test_set=test_dataset, samples_no=10):

    for i in range(0, samples_no):
      test_sample_idx = np.random.randint(0, len(test_set), size=1)[0]
      test_doc = test_set[test_sample_idx][0]

      test_NER_model(model, test_doc, show_entities=False)



Iteration: 0 Loss: {'ner': 2.6999998092651367}
Iteration: 1 Loss: {'ner': 2.6539031267166138}
Iteration: 2 Loss: {'ner': 2.6053884029388428}
Iteration: 3 Loss: {'ner': 2.5473920106887817}
Iteration: 4 Loss: {'ner': 2.4876153469085693}
Iteration: 5 Loss: {'ner': 2.4646793007850647}
Iteration: 6 Loss: {'ner': 2.285447359085083}
Iteration: 7 Loss: {'ner': 2.315528690814972}
Iteration: 8 Loss: {'ner': 2.019570827484131}
Iteration: 9 Loss: {'ner': 2.106557548046112}
Iteration: 10 Loss: {'ner': 2.0149401426315308}
Iteration: 11 Loss: {'ner': 1.723852127790451}
Iteration: 12 Loss: {'ner': 1.81505686044693}
Iteration: 13 Loss: {'ner': 1.7802753448486328}
Iteration: 14 Loss: {'ner': 1.5521973371505737}
Iteration: 15 Loss: {'ner': 1.6055408120155334}
Iteration: 16 Loss: {'ner': 1.3540944755077362}
Iteration: 17 Loss: {'ner': 1.4553938210010529}
Iteration: 18 Loss: {'ner': 1.1950233280658722}
Iteration: 19 Loss: {'ner': 1.3749442100524902}
Model saved to: /content/drive/MyDrive/NLP_NER_Project/ou

In [None]:
# Reload the pipeline that the training step wrote to output_dir.
nlp_trained_model = load_trained_NER_model(output_dir=output_dir)

# Hand-written sentences used to try out the trained model.
sample_docs = [
    "Mr. Absalon Adam lived before in Sine Olsensvej 1479, Svallerup, Kalundborg",
    "I live currently in Absalonsvej 4, Bjerge Str, Copenhagen City, Denmark",
    "His address is Elledevej 317, Kalundborg",
    "Mr. Charles B. Evans is a consular officer, Department of State, Nouakchott Place 33, Washington DC",
    "I live in Union Square 23, San Francisco.",
]

# Print and visualise the detected entities for each sentence.
for test_doc in sample_docs:
    test_NER_model(nlp_trained_model, test_doc)

STREET_NAME Sine


STREET_NAME ,


STREET_NAME is
STREET_NAME ,


STREET_NAME ,


In [None]:
# Visualise predictions for a random batch of sentences from the test split.
test_samples_batch(model=nlp_trained_model, test_set=test_dataset)