In [1]:
import spacy
from spacy.tokens import DocBin
import pandas as pd
import re
pd.set_option('display.max_colwidth', 200)

# funcs

In [3]:
def massage_data(address):
    '''Pre process address string to remove new line characters, add comma punctuations etc.'''
    cleansed_address1=re.sub(r'(,)(?!\s)',', ',address)
    cleansed_address2=re.sub(r'(\\n)',', ',cleansed_address1)
    cleansed_address3=re.sub(r'(?!\s)(-)(?!\s)',' - ',cleansed_address2)
    cleansed_address=re.sub(r'\.','',cleansed_address3)
    return cleansed_address

def get_address_span(address=None,address_component=None,label=None):
    '''Search for specified address component and get the span.
    Eg: get_address_span(address="221 B, Baker Street, London",address_component="221",label="BUILDING_NO") would return (0,2,"BUILDING_NO")'''

    if pd.isna(address_component) or str(address_component)=='nan':
        pass
    else:
        address_component1=re.sub('\.','',address_component)
        address_component2=re.sub(r'(?!\s)(-)(?!\s)',' - ',address_component1)
        span=re.search('\\b(?:'+address_component2+')\\b',address)
        #print(span)
        return (span.start(),span.end(),label)

def extend_list(entity_list,entity):
    if pd.isna(entity):
        return entity_list
    else:
        entity_list.append(entity)
        return entity_list

def create_entity_spans(df,tag_list):
    '''Create entity spans for training/test datasets'''
    df['Address']=df['Address'].apply(lambda x: massage_data(x))
    df["BuildingTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['Building_Name'],label='BUILDING_NAME'),axis=1)
    df["BuildingNoTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['Building_Number'],label='BUILDING_NO'),axis=1)
    df["RecipientTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['Recipient'],label='RECIPIENT'),axis=1)
    df["StreetNameTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['Street_Name'],label='STREET_NAME'),axis=1)
    df["ZipCodeTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['Zip_Code'],label='ZIP_CODE'),axis=1)
    df["CityTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['City'],label='CITY'),axis=1)
    df["CountryTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['Country'],label='COUNTRY'),axis=1)
    df["StateTag"]=df.apply(lambda row:get_address_span(address=row['Address'],address_component=row['State'],label='STATE'),axis=1)
    df['EmptySpan']=df.apply(lambda x: [], axis=1)

    for i in tag_list:
        df['EntitySpans']=df.apply(lambda row: extend_list(row['EmptySpan'],row[i]),axis=1)
        df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
    return df['EntitySpans']

def get_doc_bin(training_data,nlp):
    '''Create DocBin object for building training/test corpus'''
    # the DocBin will store the example documents
    db = DocBin()
    for text, annotations in training_data:
        doc = nlp(text) #Construct a Doc object
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            ents.append(span)
        doc.ents = ents
        db.add(doc)
    return db

# bod

In [6]:
import pandas as pd

df_train=pd.read_csv(filepath_or_buffer="/content/us-train-dataset.csv",sep=",",dtype=str)

In [7]:
df_train.head()

Unnamed: 0,Address,Building_Name,Building_Number,City,Recipient,Street_Name,Zip_Code,State,Country
0,"19 ST ANDREW ST, BULRINGTON, VT, 05401,, United States",,19.0,BULRINGTON,,ST ANDREW ST,5401,VT,United States
1,"2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, United States",,2574.0,CHATTANOOGA,,EAST 23RD STREE,37404,TN,United States
2,"5931 W ANGELA RD, MEMPHIS, TN 38120, United States",,5931.0,MEMPHIS,,W ANGELA RD,38120,TN,United States
3,"3812 MYERS STREET, GREENEVILLE, TN 37743, United States",,3812.0,GREENEVILLE,,MYERS STREET,37743,TN,United States
4,"HWY 33 BY-PASS BOX, DYERSBURG, TN 38024, United States",,,DYERSBURG,,HWY 33 BY-PASS,38024,TN,United States


In [8]:
len(df_train)

120

In [9]:
tag_list=["BuildingTag","BuildingNoTag","RecipientTag","StreetNameTag","ZipCodeTag","CityTag","StateTag","CountryTag"]

In [34]:
nlp = spacy.blank("ru")

In [1]:
import spacy
from spacy.cli import download

download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [12]:
print(type(nlp('qeqwqw')))
print(nlp('qeqwqw'))

<class 'spacy.tokens.doc.Doc'>
qeqwqw


In [13]:
df_entity_spans= create_entity_spans(df_train.astype(str),tag_list)

  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)


In [14]:
df_entity_spans.head()

Unnamed: 0,EntitySpans
0,"(19 ST ANDREW ST, BULRINGTON, VT, 05401, , United States, [(0, 2, BUILDING_NO), (3, 15, STREET_NAME), (33, 38, ZIP_CODE), (17, 27, CITY), (29, 31, STATE), (42, 55, COUNTRY)])"
1,"(2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, United States, [(0, 4, BUILDING_NO), (5, 20, STREET_NAME), (38, 43, ZIP_CODE), (22, 33, CITY), (35, 37, STATE), (45, 58, COUNTRY)])"
2,"(5931 W ANGELA RD, MEMPHIS, TN 38120, United States, [(0, 4, BUILDING_NO), (5, 16, STREET_NAME), (30, 35, ZIP_CODE), (18, 25, CITY), (27, 29, STATE), (37, 50, COUNTRY)])"
3,"(3812 MYERS STREET, GREENEVILLE, TN 37743, United States, [(0, 4, BUILDING_NO), (5, 17, STREET_NAME), (35, 40, ZIP_CODE), (19, 30, CITY), (32, 34, STATE), (42, 55, COUNTRY)])"
4,"(HWY 33 BY - PASS BOX, DYERSBURG, TN 38024, United States, [(0, 16, STREET_NAME), (36, 41, ZIP_CODE), (22, 31, CITY), (33, 35, STATE), (43, 56, COUNTRY)])"


In [15]:
training_data= df_entity_spans.values.tolist()


In [16]:
training_data

[('19 ST ANDREW ST, BULRINGTON, VT, 05401, , United States',
  [(0, 2, 'BUILDING_NO'),
   (3, 15, 'STREET_NAME'),
   (33, 38, 'ZIP_CODE'),
   (17, 27, 'CITY'),
   (29, 31, 'STATE'),
   (42, 55, 'COUNTRY')]),
 ('2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, United States',
  [(0, 4, 'BUILDING_NO'),
   (5, 20, 'STREET_NAME'),
   (38, 43, 'ZIP_CODE'),
   (22, 33, 'CITY'),
   (35, 37, 'STATE'),
   (45, 58, 'COUNTRY')]),
 ('5931 W ANGELA RD, MEMPHIS, TN 38120, United States',
  [(0, 4, 'BUILDING_NO'),
   (5, 16, 'STREET_NAME'),
   (30, 35, 'ZIP_CODE'),
   (18, 25, 'CITY'),
   (27, 29, 'STATE'),
   (37, 50, 'COUNTRY')]),
 ('3812 MYERS STREET, GREENEVILLE, TN 37743, United States',
  [(0, 4, 'BUILDING_NO'),
   (5, 17, 'STREET_NAME'),
   (35, 40, 'ZIP_CODE'),
   (19, 30, 'CITY'),
   (32, 34, 'STATE'),
   (42, 55, 'COUNTRY')]),
 ('HWY 33 BY - PASS BOX, DYERSBURG, TN 38024, United States',
  [(0, 16, 'STREET_NAME'),
   (36, 41, 'ZIP_CODE'),
   (22, 31, 'CITY'),
   (33, 35, 'STATE'),
   (43, 56, '

In [17]:
doc_bin_train= get_doc_bin(training_data,nlp)
list(doc_bin_train.get_docs(nlp.vocab))

[19 ST ANDREW ST, BULRINGTON, VT, 05401, , United States,
 2574 EAST 23RD STREE, CHATTANOOGA, TN 37404, United States,
 5931 W ANGELA RD, MEMPHIS, TN 38120, United States,
 3812 MYERS STREET, GREENEVILLE, TN 37743, United States,
 HWY 33 BY - PASS BOX, DYERSBURG, TN 38024, United States,
 423 TRENT STATE STRE, DOVER, DE 19001, United States,
 269 ABBOTT MT ROAD, CHATTANOOGA, TN 37405, United States,
 HIGHWAY 11 - E, RUSSELLVILLE, TN 00000, United States,
 8725 HWY 62 N, MILLINGTON, TN 38053, United States,
 6081 SUMMER AVENUE, MEMPHIS, TN 38134, United States,
 STE 363 8 REGENCY SQ, KNOXVILLE, TN 37915, United States,
 NAT TRUST CENTER 1612 REDWOOD STREET, WILMINGTON, DE 19801, United States,
 PO BOX 631, GIDDINGS, TX 78942, United States,
 9281 S MAIN ST PO BOX 284, SPRINGFIELD, TN 37172, United States,
 PO BOX 3028 OMOHUN, NASHVILLE, TN 37210, United States,
 375 MCDONNELL BLVD - PO BOX 5840, ST LOUIS, MO 63134, United States,
 PO BOX 724, TRUSSVILLE, AL 35173, United States,
 PO BOX

In [18]:
doc_bin_train.to_disk("./train.spacy")

In [19]:
df_test=pd.read_csv(filepath_or_buffer="/content/us-test-dataset.csv",sep=",",dtype=str)

In [20]:
df_entity_spans= create_entity_spans(df_test.astype(str),tag_list)
validation_data= df_entity_spans.values.tolist()

  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)
  df['EntitySpans']=df[['EntitySpans','Address']].apply(lambda x: (x[1], x[0]),axis=1)


In [21]:
validation_data

[('223 NW STATE STRE, DOVER, DE 19001, United States',
  [(0, 3, 'BUILDING_NO'),
   (4, 17, 'STREET_NAME'),
   (29, 34, 'ZIP_CODE'),
   (19, 24, 'CITY'),
   (26, 28, 'STATE'),
   (36, 49, 'COUNTRY')]),
 ('216 LIBERTY MT ROAD, CHATTANOOGA, TN 37405, United States',
  [(0, 3, 'BUILDING_NO'),
   (4, 19, 'STREET_NAME'),
   (37, 42, 'ZIP_CODE'),
   (21, 32, 'CITY'),
   (34, 36, 'STATE'),
   (44, 57, 'COUNTRY')]),
 ('HIGHWAY 11 - E, RUSSELLVILLE, TN 00000, United States',
  [(0, 14, 'STREET_NAME'),
   (33, 38, 'ZIP_CODE'),
   (16, 28, 'CITY'),
   (30, 32, 'STATE'),
   (40, 53, 'COUNTRY')]),
 ('7913 HWY 51 N, MILLINGTON, TN 38053, United States',
  [(0, 4, 'BUILDING_NO'),
   (5, 13, 'STREET_NAME'),
   (30, 35, 'ZIP_CODE'),
   (15, 25, 'CITY'),
   (27, 29, 'STATE'),
   (37, 50, 'COUNTRY')]),
 ('PO BOX 481, BALDWIN, GA, 30511 - 0484, , United States',
  [(25, 37, 'ZIP_CODE'),
   (12, 19, 'CITY'),
   (21, 23, 'STATE'),
   (41, 54, 'COUNTRY')]),
 ('PO BOX 457, NORTONVILLE, KS 660600452, United St

In [22]:
doc_bin_test= get_doc_bin(validation_data,nlp)
doc_bin_test.to_disk("/content/test.spacy")

# train

In [None]:
!pip install spacy-transformers  # если хочешь использовать трансформеры

In [24]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy
!python -m spacy init config config.cfg --lang en --pipeline ner
#!python -m spacy init fill-config config\base_config.cfg config\config.cfg

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [25]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy


[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     66.65    6.46    4.40   12.21    0.06
 11     200        163.72   3067.98   98.11   97.01   99.24    0.98
 25     400         13.50     11.77   96.58   96.21   96.95    0.97
 42     600         10.37      6.80   98.48   97.74   99.24    0.98
 63     800          0.01      0.01   98.11   97.01   99.24    0.98
 89    1000          0.00      0.00   98.11   97.01   99.24    0.98
120    1200          0.00      0.00   98.48   97.74   99.24    0.98
158    1400        706.57    248.65   98.11   97.01   99.24    0.98
205    1600         52.76     22.48   97.73   96.99   98.47    0.9

# eval

In [32]:
import spacy

nlp = spacy.load("./output/model-best")

address_list=["130 W BOSE ST STE 100, PARK RIDGE, IL, 60068, USA",
              "8311 MCDONALD RD, HOUSTON, TX, 77053-4821, USA",
              "PO Box 317, 4100 Hwy 20 E Ste 403, NICEVILLE, FL, 32578-5037, USA",
              "C/O Elon Musk Innovations Inc, 1548 E Florida Avenue, Suite 209, TAMPA, FL, 33613, USA",
              "Seven Edgeway Plaza, C/O Mac Dermott Inc, OAKBROOK TERRACE, IL, 60181, USA"]

for address in address_list:
    doc=nlp(address)
    ent_list=[(ent.text, ent.label_) for ent in doc.ents]
    print("Address string -> "+address)
    print("Parsed address -> "+str(ent_list))
    print("******")

Address string -> 130 W BOSE ST STE 100, PARK RIDGE, IL, 60068, USA
Parsed address -> [('130', 'BUILDING_NO'), ('W BOSE ST', 'STREET_NAME'), ('PARK RIDGE', 'CITY'), ('IL', 'STATE'), ('60068', 'ZIP_CODE'), ('USA', 'COUNTRY')]
******
Address string -> 8311 MCDONALD RD, HOUSTON, TX, 77053-4821, USA
Parsed address -> [('8311', 'BUILDING_NO'), ('MCDONALD RD', 'STREET_NAME'), ('HOUSTON', 'CITY'), ('TX', 'STATE'), ('77053-4821', 'ZIP_CODE'), ('USA', 'COUNTRY')]
******
Address string -> PO Box 317, 4100 Hwy 20 E Ste 403, NICEVILLE, FL, 32578-5037, USA
Parsed address -> [('4100', 'BUILDING_NO'), ('Hwy 20 E', 'STREET_NAME'), ('NICEVILLE', 'CITY'), ('FL', 'STATE'), ('32578-5037', 'ZIP_CODE'), ('USA', 'COUNTRY')]
******
Address string -> C/O Elon Musk Innovations Inc, 1548 E Florida Avenue, Suite 209, TAMPA, FL, 33613, USA
Parsed address -> [('C/O Elon Musk Innovations Inc', 'RECIPIENT'), ('1548', 'BUILDING_NO'), ('E Florida Avenue', 'STREET_NAME'), ('TAMPA', 'CITY'), ('FL', 'STATE'), ('33613', 'Z