In [53]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


In [54]:
file_path = 'RawData/CheqMay23-June24.csv'
df = pd.read_csv(file_path)
pd.set_option('display.max_colwidth', 79)
df.head(30)

Unnamed: 0,Date,Transaction,Debit,Credit,account
0,2024-06-28,Branch Transaction SERVICE CHARGE CAPPED MONTHLY FEE$16.95 RECORD-KEEPING N/A,16.95,0.0,chequing
1,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001980 WALMART STORE #,22.24,0.0,chequing
2,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001590 WALMART STORE #,16.83,0.0,chequing
3,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001358 WALMART STORE #,3.3,0.0,chequing
4,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001351 WALMART STORE #,4.34,0.0,chequing
5,2024-06-27,Point of Sale - Interac RETAIL PURCHASE 000001001037 WALMART STORE #,6.81,0.0,chequing
6,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 000001001808 WALMART STORE #,7.37,0.0,chequing
7,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 000001001009 WALMART STORE #,5.9,0.0,chequing
8,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 417808109688 MCDONALD'S #400,1.33,0.0,chequing
9,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 000001001772 WALMART STORE #,9.94,0.0,chequing


In [55]:
# Prepare the training data
def prepare_training_data(df):
    TRAIN_DATA = []
    for _, row in df.iterrows():
        text = row['Transaction']
        entities = []
        words = text.split()
        for i, word in enumerate(words):
            if word.isupper() and i > 0 and words[i-1].isdigit():
                start = text.index(words[i-1])
                end = start + len(words[i-1]) + 1 + len(word)  # Including space
                entities.append((start, end, "VENDOR"))
        if entities:
            TRAIN_DATA.append((text, {"entities": entities}))
    return TRAIN_DATA


In [56]:
TRAIN_DATA = prepare_training_data(df)
TRAIN_DATA

[('Point of Sale - Interac RETAIL PURCHASE 000001001980 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001590 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001358 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001351 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001037 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001808 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001009 WALMART STORE #',
  {'entities': [(40, 60, 'VENDOR')]}),
 ("Point of Sale - Interac RETAIL PURCHASE 417808109688 MCDONALD'S #400",
  {'entities': [(40, 63, 'VENDOR')]}),
 ('Point of Sale - Interac RETAIL PURCHASE 000001001772 WALMART STORE #',
  {'entities': [(40, 6

In [57]:
# Split data into training and evaluation sets
train_data, eval_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

In [58]:
# Load the spaCy model
nlp = spacy.blank("en")

In [59]:
# Create a new NER component
ner = nlp.add_pipe("ner")

In [60]:
# Add the labels to the NER component
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [61]:
# Disable other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in zip(texts, annotations)]
            nlp.update(examples, drop=0.5, losses=losses)
        print(f"Losses at iteration {itn}: {losses}")

Losses at iteration 0: {'ner': 638.2162051789711}
Losses at iteration 1: {'ner': 94.31793750502288}
Losses at iteration 2: {'ner': 44.920556184678006}
Losses at iteration 3: {'ner': 27.522480645738796}
Losses at iteration 4: {'ner': 11.79882140821707}
Losses at iteration 5: {'ner': 18.27317260512596}
Losses at iteration 6: {'ner': 10.937629911489877}
Losses at iteration 7: {'ner': 3.562948391863481}
Losses at iteration 8: {'ner': 8.355990328526751}
Losses at iteration 9: {'ner': 2.588236901588966}


In [62]:
# Save the trained model
nlp.to_disk("ner_model")

In [63]:
# Load the trained model
nlp = spacy.load("ner_model")

In [79]:
# Function to predict vendor names in a transaction text and extract only the vendor names
def predict_vendor(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "VENDOR":
            # Remove the digits and keep only the vendor name
            vendor_name = " ".join([word for word in ent.text.split() if not word.isdigit()])
            return vendor_name
    return None

In [65]:
# Evaluation function
def evaluate_model(nlp, eval_data):
    true_entities = []
    pred_entities = []
    for text, annotations in eval_data:
        doc = nlp(text)
        true_entities.extend([(ent[0], ent[1], ent[2]) for ent in annotations['entities']])
        pred_entities.extend([(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents if ent.label_ == "VENDOR"])

    y_true = [1 if ent in true_entities else 0 for ent in pred_entities]
    y_pred = [1] * len(pred_entities)
    
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return precision, recall, f1
# Evaluate the model
precision, recall, f1 = evaluate_model(nlp, eval_data)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")

Precision: 1.00, Recall: 1.00, F1-score: 1.00


In [80]:
# Example usage
transaction = "Point of Sale - Interac RETAIL PURCHASE 000001001590 WALMART STORE #	"
print(predict_vendor(transaction))

WALMART


In [81]:
# Apply the prediction function to each row in the dataframe and store the results in a new column
df['vendors'] = df['Transaction'].apply(lambda x: predict_vendor(x))

# Display the first few rows of the updated dataframe
df.head(50)


Unnamed: 0,Date,Transaction,Debit,Credit,account,vendors
0,2024-06-28,Branch Transaction SERVICE CHARGE CAPPED MONTHLY FEE$16.95 RECORD-KEEPING N/A,16.95,0.0,chequing,
1,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001980 WALMART STORE #,22.24,0.0,chequing,WALMART
2,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001590 WALMART STORE #,16.83,0.0,chequing,WALMART
3,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001358 WALMART STORE #,3.3,0.0,chequing,WALMART
4,2024-06-28,Point of Sale - Interac RETAIL PURCHASE 000001001351 WALMART STORE #,4.34,0.0,chequing,WALMART
5,2024-06-27,Point of Sale - Interac RETAIL PURCHASE 000001001037 WALMART STORE #,6.81,0.0,chequing,WALMART
6,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 000001001808 WALMART STORE #,7.37,0.0,chequing,WALMART
7,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 000001001009 WALMART STORE #,5.9,0.0,chequing,WALMART
8,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 417808109688 MCDONALD'S #400,1.33,0.0,chequing,MCDONALD'S
9,2024-06-26,Point of Sale - Interac RETAIL PURCHASE 000001001772 WALMART STORE #,9.94,0.0,chequing,WALMART
