In [1]:
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
def data_formatter(sentences, tag_list):
    """Convert tokenised sentences and per-token tags into character-offset annotations.

    Each sentence is joined with single spaces, and every token whose tag is
    not "O" yields one ``(start, end, tag)`` span, where ``start``/``end`` are
    character offsets into the joined text (``end`` exclusive). Tokens are
    annotated individually; consecutive B-/I- tokens are not merged.

    Args:
        sentences: Iterable of sentences, each a sequence of word strings.
        tag_list: Iterable of tag sequences, parallel to ``sentences``.

    Returns:
        A list with one ``[text, {"entities": [(start, end, tag), ...]}]``
        entry per sentence. If the two inputs differ in length, the extra
        items are ignored (``zip`` semantics).
    """
    final_data = []

    # Iterate over the *argument* (the original read a global named
    # `tags_list`, silently ignoring whatever the caller passed in).
    for tags, sentence in zip(tag_list, sentences):
        spans = []
        start = 0
        for tag, word in zip(tags, sentence):
            end = start + len(word)
            if tag != "O":
                spans.append((start, end, tag))
            # +1 accounts for the single space inserted by " ".join below.
            start = end + 1
        final_data.append([" ".join(sentence), {"entities": spans}])
    return final_data

In [3]:
# Read the token-level dataset; the preview below shows one row per word
# with its POS and Tag columns.
df = pd.read_csv(filepath_or_buffer="ner_dataset.csv", encoding="latin-1")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
# List the distinct tag labels present in the dataset.
print(pd.unique(df["Tag"]))

['O' 'B-geo' 'B-gpe' 'B-per' 'I-geo' 'B-org' 'I-org' 'B-tim' 'B-art'
 'I-art' 'I-per' 'I-gpe' 'I-tim' 'B-nat' 'B-eve' 'I-eve' 'I-nat']


In [5]:
# Only the first token of each sentence carries a "Sentence #" label (the rest
# are empty in the preview above), so forward-fill it down to every token.
# `.ffill()` replaces the deprecated `fillna(method="ffill")`.
df.loc[:, "Sentence #"] = df["Sentence #"].ffill()

# Collect the words and tags of each sentence into parallel arrays of lists.
# Both come from the same grouping, so index i refers to the same sentence.
tokens_by_sentence = df.groupby("Sentence #")
sentences = tokens_by_sentence["Word"].apply(list).values
tags_list = tokens_by_sentence["Tag"].apply(list).values

In [6]:
# Sanity check: show the first sentence's words next to their tags.
print(sentences[0], tags_list[0], sep=" ")

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'] ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [7]:
# Required data format: one [text, annotations] pair per sentence, where each
# entity is a (start_char, end_char, tag) tuple, e.g.
# [
#     'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 
#     {'entities': [(48, 54, 'B-geo'), (77, 81, 'B-geo'), (111, 118, 'B-gpe')]}
# ]

formatted_data = data_formatter(sentences=sentences, tag_list=tags_list)
formatted_data[0]

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
 {'entities': [(48, 54, 'B-geo'), (77, 81, 'B-geo'), (111, 118, 'B-gpe')]}]

In [8]:
# Hold out 30% of the sentences. A fixed seed makes the split (and therefore
# the metrics reported further down) reproducible across kernel restarts.
train, test = train_test_split(formatted_data, test_size=0.3, random_state=42)

In [9]:
# Persist both splits. The context managers guarantee each file is flushed and
# closed before the next cell's external process reads it; the original left
# the handles from open() unclosed.
with open("assets/train.pickle", "wb") as train_file:
    pickle.dump(train, train_file)
with open("assets/val.pickle", "wb") as val_file:
    pickle.dump(test, val_file)

In [10]:
# Run the spaCy project's "all" workflow (defined outside this notebook).
# Per the log below, it preprocesses assets/train.pickle and assets/val.pickle
# into corpus/*.spacy, trains with configs/config.cfg, then evaluates
# training/model-best on the validation corpus.
!spacy project run all

[38;5;4mℹ Running workflow 'all'[0m
[1m
Running command: /usr/bin/python3 scripts/preprocess.py assets/train.pickle corpus/train.spacy
Running command: /usr/bin/python3 scripts/preprocess.py assets/val.pickle corpus/val.spacy
[1m
Running command: /usr/bin/python3 -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/val.spacy
[38;5;4mℹ Saving to output directory: training[0m
[38;5;4mℹ Using CPU[0m
[1m
[2021-11-14 20:29:50,078] [INFO] Set up nlp object from config
[2021-11-14 20:29:50,088] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-11-14 20:29:50,102] [INFO] Created vocabulary
[2021-11-14 20:29:50,105] [INFO] Finished initializing nlp object
[2021-11-14 20:30:19,671] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --

[38;5;2m✔ Saved pipeline to output directory[0m
training/model-last
[1m
Running command: /usr/bin/python3 -m spacy evaluate training/model-best corpus/val.spacy --output training/metrics.json
[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   84.17 
NER R   82.86 
NER F   83.51 
SPEED   32941 

[1m

             P       R       F
B-org    77.04   69.83   73.26
B-geo    83.25   91.19   87.04
B-tim    92.43   88.31   90.32
I-org    78.93   75.58   77.22
I-tim    84.86   72.01   77.91
B-gpe    95.26   93.05   94.14
B-per    82.00   79.27   80.61
I-per    82.81   89.14   85.86
I-geo    79.64   76.62   78.10
B-art     0.00    0.00    0.00
B-nat     0.00    0.00    0.00
I-gpe   100.00   38.71   55.81
B-eve    85.71   22.64   35.82
I-eve     0.00    0.00    0.00
I-art     0.00    0.00    0.00
I-nat     0.00    0.00    0.00

[38;5;2m✔ Saved results to training/metrics.json[0m
