In [2]:
from simpletransformers.ner import NERModel, NERArgs
import pandas as pd
import torch

# Define model arguments
# Start from a default NERArgs object and override only the fields assigned below.
model_args = NERArgs()
# The explicit label list is left commented out, so this notebook does not pass a label set itself.
#model_args.labels_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-EMAIL", "I-EMAIL", "B-QUANTITY", "I-QUANTITY", "B-PRICE", "I-PRICE"]
model_args.num_train_epochs = 10
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
# NOTE(review): presumably -1 turns off step-based checkpoint saving in simpletransformers — confirm.
model_args.save_steps = -1
# NOTE(review): presumably lets training write into an output directory that already has files — confirm.
model_args.overwrite_output_dir = True

# Initialize the NER model
# The second argument is the model name or path; "outputs_2" is a local folder name, so this cell
# presumably expects a previously saved model there (see the note that follows the call).
# CUDA is used only when torch reports an available GPU.
model = NERModel(
    "roberta",
    "outputs_2",
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

'''
If you have not trained a model yet, pass "roberta-base" as the model name above.
If you already have a trained model saved in an output folder (for example outputs_1) and want to
fine-tune it further, pass that folder's path as the model name instead; otherwise just use "roberta-base".
'''

"\nIf you haven't trained any model please use roberta-base. \nAfter that if you have a trained model stored as an output folder and wish to fine tune it replace the outputs_1 \nas the new outputs folder or model folder other wise just use roberta-base\n"

In [6]:
def get_unique_labels(file_path):
    """Return the set of NER tags that occur in a four-column, CoNLL-style file.

    Blank lines and lines beginning with ``-DOCSTART-`` are ignored. Every other
    line must split into exactly four whitespace-separated fields; the last
    field is taken as the NER tag.

    Args:
        file_path: Path of the UTF-8 encoded annotation file.

    Returns:
        A ``set`` of the distinct tag strings found in the file.

    Raises:
        ValueError: If a data line does not split into exactly four fields.
    """
    seen_tags = set()
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            content = raw_line.strip()
            # Sentence separators carry no tag.
            if not content:
                continue
            # Document markers are checked on the raw (unstripped) line.
            if raw_line.startswith('-DOCSTART-'):
                continue
            _word, _pos, _chunk, tag = content.split()
            seen_tags.add(tag)
    return seen_tags

# Collect the NER tag vocabulary of each split.
# The validation and test splits both read datasets/FIN3.txt, so their tag sets are identical.
train_labels = get_unique_labels('datasets/FIN5.txt')
valid_labels = get_unique_labels('datasets/FIN3.txt')
test_labels = get_unique_labels('datasets/FIN3.txt')

# Union over every split, so a tag that only occurs in the validation data is not missed
# if the validation file is ever pointed at a different dataset.
all_labels = train_labels | valid_labels | test_labels
print(all_labels)


{'I-ORG', 'I-LOC', 'I-PER', 'O', 'I-MISC'}


In [7]:
def read_data(file_path):
    """Parse a four-column, CoNLL-style file into token and NER-tag sequences.

    Each data line holds four whitespace-separated columns: word, POS tag,
    chunk tag and NER tag. Blank lines separate sentences. ``-DOCSTART-``
    marker lines are treated as sentence boundaries and are not emitted as
    tokens, which keeps this reader consistent with ``get_unique_labels``
    (that function ignores those lines as well).

    Args:
        file_path: Path of the UTF-8 encoded annotation file.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists, where
        ``sentences[i]`` is the list of words of sentence ``i`` and
        ``labels[i]`` is the list of NER tags aligned with those words.

    Raises:
        ValueError: If a data line does not have exactly four columns. The
            message names the file and the 1-based line number.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    with open(file_path, 'r', encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            # Blank lines and document markers both end the current sentence.
            if not stripped or stripped.startswith('-DOCSTART-'):
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            fields = stripped.split()
            if len(fields) != 4:
                # Same exception type as a failed tuple unpacking, but with enough
                # context to locate the offending line in the dataset.
                raise ValueError(
                    "{}:{}: expected 4 whitespace-separated columns, got {}: {!r}".format(
                        file_path, line_number, len(fields), stripped
                    )
                )
            word, _pos, _chunk, ner = fields
            sentence.append(word)
            label.append(ner)
    # The file may end without a trailing blank line; keep the last sentence.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Parse each split into parallel lists of word sequences and NER-tag sequences.
# This rebinds train_labels / valid_labels / test_labels, which the label-set cell above bound to sets.
# The validation and test splits are both read from datasets/FIN3.txt, so they hold the same data.
train_sentences, train_labels = read_data('datasets/FIN5.txt')
valid_sentences, valid_labels = read_data('datasets/FIN3.txt')
test_sentences, test_labels = read_data('datasets/FIN3.txt')

# Convert data to DataFrame
def convert_to_df(sentences, labels):
    """Flatten aligned sentences and tag sequences into a token-per-row DataFrame.

    Args:
        sentences: List of sentences, each a list of words.
        labels: List of tag lists; ``labels[i][j]`` is the tag of ``sentences[i][j]``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence_id`` (index of the
        sentence in ``sentences``), ``words`` and ``labels``, one row per word.

    Raises:
        IndexError: If ``labels`` has no entry for a word in ``sentences``.
    """
    rows = [
        [sentence_idx, word, labels[sentence_idx][word_idx]]
        for sentence_idx, words in enumerate(sentences)
        for word_idx, word in enumerate(words)
    ]
    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])

# Build one token-per-row DataFrame (columns: sentence_id, words, labels) per split.
# valid_df and test_df come from the same datasets/FIN3.txt parse, so their contents are identical.
train_df = convert_to_df(train_sentences, train_labels)
valid_df = convert_to_df(valid_sentences, valid_labels)
test_df = convert_to_df(test_sentences, test_labels)


In [8]:
# Fine-tune the model on the training frame; the returned tuple is echoed as the cell output.
# NOTE(review): eval_data is passed, but model_args never enables evaluate_during_training;
# simpletransformers presumably ignores eval_data unless that option is on — confirm.
model.train_model(train_df, eval_data=valid_df)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/74 [00:00<?, ?it/s]

(740, 0.0035629436349424665)

In [4]:
# Load the trained model
# This rebinds `model`, replacing the object that was created from "outputs_2" at the top.
# NOTE(review): no visible cell sets an output directory named "outputs_3", so this folder
# presumably comes from an earlier run — confirm it exists before running this cell.
model = NERModel(
    "roberta",
    "outputs_3",
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

In [10]:
# Sample text to test the model
# The literal keeps its surrounding double quotes and the leading/trailing newlines, so the quote
# characters are part of the text handed to the model (the first predicted token is '"Good').
text_to_extract = """
"Good morning, this is Lisa from JKL Automotive. We would like to order 75 units of the high-performance brake pads, item number BP654, at $35 each. Can you ensure they are delivered to our main office at 789 Auto Drive by next Tuesday? Also, send the total cost and shipping details to my email at lisa@jklauto.com. We need these urgently, so please expedite the order. You can reach me directly at (555) 321-0987 for any questions. Appreciate it!"
"""

# Perform NER prediction
# predict() is given a list with one text. As the printed output shows, predictions holds one
# list per input text, each a sequence of single-entry {token: label} dicts.
# raw_outputs is not used by the visible cells.
predictions, raw_outputs = model.predict([text_to_extract])

print(predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'"Good': 'O'}, {'morning,': 'O'}, {'this': 'O'}, {'is': 'O'}, {'Lisa': 'I-PER'}, {'from': 'O'}, {'JKL': 'I-ORG'}, {'Automotive.': 'I-ORG'}, {'We': 'O'}, {'would': 'O'}, {'like': 'O'}, {'to': 'O'}, {'order': 'O'}, {'75': 'O'}, {'units': 'O'}, {'of': 'O'}, {'the': 'O'}, {'high-performance': 'O'}, {'brake': 'O'}, {'pads,': 'O'}, {'item': 'O'}, {'number': 'O'}, {'BP654,': 'O'}, {'at': 'O'}, {'$35': 'O'}, {'each.': 'O'}, {'Can': 'O'}, {'you': 'O'}, {'ensure': 'O'}, {'they': 'O'}, {'are': 'O'}, {'delivered': 'O'}, {'to': 'O'}, {'our': 'O'}, {'main': 'O'}, {'office': 'O'}, {'at': 'O'}, {'789': 'I-LOC'}, {'Auto': 'I-LOC'}, {'Drive': 'I-LOC'}, {'by': 'O'}, {'next': 'O'}, {'Tuesday?': 'O'}, {'Also,': 'O'}, {'send': 'O'}, {'the': 'O'}, {'total': 'O'}, {'cost': 'O'}, {'and': 'O'}, {'shipping': 'O'}, {'details': 'O'}, {'to': 'O'}, {'my': 'O'}, {'email': 'O'}, {'at': 'O'}, {'lisa@jklauto.com.': 'I-PER'}, {'We': 'O'}, {'need': 'O'}, {'these': 'O'}, {'urgently,': 'O'}, {'so': 'O'}, {'please': 'O'},

In [11]:
# Group the predicted tokens of the first (only) input text by entity label.
# `d` maps each label to a space-separated string of the tokens tagged with it.
d = {'I-ORG': "", 'I-LOC': "", 'O': "", 'I-PER': "", 'I-MISC': ""}
for token_prediction in predictions[0]:
    # Each entry is a single-item {token: label} dict.
    for token, label in token_prediction.items():
        # Skip non-entity tokens. Both conditions must hold, i.e. the label is neither "0"
        # nor "O"; chaining them with `or` is always true and would collect every token.
        if label in ("0", "O"):
            continue
        # .get() tolerates labels that are not pre-seeded above (e.g. B-* tags) instead of
        # raising KeyError.
        d[label] = d.get(label, "") + token + " "
for label, tokens in d.items():
    if label != 'O':
        print(label, " : ", tokens)

I-ORG  :  JKL Automotive. 
I-LOC  :  789 Auto Drive 
I-PER  :  Lisa lisa@jklauto.com. 
I-MISC  :  


In [5]:
# Sample text to test the model
text_to_extract = """
"Hi, this is Jane from ABC Manufacturing. I'd like to place an order for 50 units of your high-grade steel rods, item number ST123. Each unit is priced at $45. Please ensure the total cost is applied to our corporate account. We need these delivered to our warehouse at 123 Industrial Park by next Wednesday. Can you also include the quality certificates with the shipment? Please confirm the order and delivery date via email at jane.doe@abcmfg.com. Thank you!"
"""
# Perform NER prediction
predictions, raw_outputs = model.predict([text_to_extract])

# Group the predicted tokens of the first (only) input text by entity label.
# `d` maps each label to a space-separated string of the tokens tagged with it.
d = {'I-ORG': "", 'I-LOC': "", 'O': "", 'I-PER': "", 'I-MISC': ""}
for token_prediction in predictions[0]:
    # Each entry is a single-item {token: label} dict.
    for token, label in token_prediction.items():
        # Skip non-entity tokens. Both conditions must hold, i.e. the label is neither "0"
        # nor "O"; chaining them with `or` is always true and would collect every token.
        if label in ("0", "O"):
            continue
        # .get() tolerates labels that are not pre-seeded above (e.g. B-* tags) instead of
        # raising KeyError.
        d[label] = d.get(label, "") + token + " "
for label, tokens in d.items():
    if label != 'O':
        print(label, " : ", tokens)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

I-ORG  :  ABC Manufacturing. 
I-LOC  :  123 Industrial Park 
I-PER  :  Jane jane.doe@abcmfg.com. 
I-MISC  :  


In [6]:
# Sample text to test the model
text_to_extract = """
"Good afternoon, this is Mike from XYZ Construction. We need to order 100 gallons of industrial paint, item code IP456, priced at $30 per gallon. Could you expedite the shipping to ensure delivery by Friday to our site at 789 Construction Blvd? We also need a safety data sheet included. Please confirm the total amount and send the invoice to our accounting department at accounting@xyzcon.com. You can reach me at (555) 123-4567 if there are any issues. Thanks!"
"""
# Perform NER prediction
predictions, raw_outputs = model.predict([text_to_extract])

# Group the predicted tokens of the first (only) input text by entity label.
# `d` maps each label to a space-separated string of the tokens tagged with it.
d = {'I-ORG': "", 'I-LOC': "", 'O': "", 'I-PER': "", 'I-MISC': ""}
for token_prediction in predictions[0]:
    # Each entry is a single-item {token: label} dict.
    for token, label in token_prediction.items():
        # Skip non-entity tokens. Both conditions must hold, i.e. the label is neither "0"
        # nor "O"; chaining them with `or` is always true and would collect every token.
        if label in ("0", "O"):
            continue
        # .get() tolerates labels that are not pre-seeded above (e.g. B-* tags) instead of
        # raising KeyError.
        d[label] = d.get(label, "") + token + " "
for label, tokens in d.items():
    if label != 'O':
        print(label, " : ", tokens)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

I-ORG  :  XYZ Construction. 
I-LOC  :  789 Construction Blvd? 
I-PER  :  Mike 
I-MISC  :  


In [7]:
# Sample text to test the model
text_to_extract = """
"Hello, this is Sarah from DEF Tech. I'm calling to order 20 units of the advanced microprocessors, item ID MP789, at $200 each. We need these shipped to our main facility at 456 Tech Avenue by the end of the week. Could you ensure they are packaged securely to prevent any damage? Please charge this to our existing credit account and email me the confirmation and tracking information at sarah@deftech.com. My contact number is (555) 987-6543 if you need to reach me. Thanks!"
"""
# Perform NER prediction
predictions, raw_outputs = model.predict([text_to_extract])

# Group the predicted tokens of the first (only) input text by entity label.
# `d` maps each label to a space-separated string of the tokens tagged with it.
d = {'I-ORG': "", 'I-LOC': "", 'O': "", 'I-PER': "", 'I-MISC': ""}
for token_prediction in predictions[0]:
    # Each entry is a single-item {token: label} dict.
    for token, label in token_prediction.items():
        # Skip non-entity tokens. Both conditions must hold, i.e. the label is neither "0"
        # nor "O"; chaining them with `or` is always true and would collect every token.
        if label in ("0", "O"):
            continue
        # .get() tolerates labels that are not pre-seeded above (e.g. B-* tags) instead of
        # raising KeyError.
        d[label] = d.get(label, "") + token + " "
for label, tokens in d.items():
    if label != 'O':
        print(label, " : ", tokens)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

I-ORG  :  DEF Tech. 
I-LOC  :  456 Tech Avenue 
I-PER  :  Sarah 
I-MISC  :  


In [8]:
# Sample text to test the model
text_to_extract = """
"Hi, this is Tom from GHI Logistics. I need to place an order for 150 units of your heavy-duty cargo straps, item code CS321, each priced at $12. Please process this order as soon as possible and confirm the delivery date. Ship them to our depot at 321 Freight Lane. We require a proof of delivery upon receipt. Please send the invoice to our finance team at finance@ghilogistics.com and confirm the order with me at tom@ghilogistics.com or (555) 654-3210. Thanks!"
"""
# Perform NER prediction
predictions, raw_outputs = model.predict([text_to_extract])

# Group the predicted tokens of the first (only) input text by entity label.
# `d` maps each label to a space-separated string of the tokens tagged with it.
d = {'I-ORG': "", 'I-LOC': "", 'O': "", 'I-PER': "", 'I-MISC': ""}
for token_prediction in predictions[0]:
    # Each entry is a single-item {token: label} dict.
    for token, label in token_prediction.items():
        # Skip non-entity tokens. Both conditions must hold, i.e. the label is neither "0"
        # nor "O"; chaining them with `or` is always true and would collect every token.
        if label in ("0", "O"):
            continue
        # .get() tolerates labels that are not pre-seeded above (e.g. B-* tags) instead of
        # raising KeyError.
        d[label] = d.get(label, "") + token + " "
for label, tokens in d.items():
    if label != 'O':
        print(label, " : ", tokens)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

I-ORG  :  GHI Logistics. 
I-LOC  :  321 Freight Lane. 
I-PER  :  Tom tom@ghilogistics.com 
I-MISC  :  
