In [1]:
# # Install Necessary Packages
# !pip install -U spacy
# !pip install spacy-lookups-data

In [2]:
import pandas as pd
import spacy
from spacy.training import Example
import random
import json
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
import warnings
# Suppress UserWarning from spaCy
warnings.filterwarnings("ignore", category=UserWarning)
# Load dataset
df = pd.read_csv('/kaggle/input/conll-transactions/annotated_transactions.csv')

In [3]:
# Function to remove overlapping entities
def remove_overlaps(entities):
    sorted_entities = sorted(entities, key=lambda e: e[0])
    non_overlapping_entities = []
    last_end = -1
    for start, end, label in sorted_entities:
        if start >= last_end:
            non_overlapping_entities.append((start, end, label))
            last_end = end
    return non_overlapping_entities

In [4]:
# Function to parse annotations and convert them to spaCy format
def parse_annotations(text, annotation_json):
    entities = []
    annotations = json.loads(annotation_json)
    for annotation in annotations:
        start = annotation['start']
        end = annotation['end']
        label = annotation['labels'][0]
        entities.append((start, end, label))
    return (text, {"entities": remove_overlaps(entities)})

In [5]:
# Split the dataset
train_indices = df.sample(frac=0.8, random_state=42).index
test_indices = df.index.difference(train_indices)

In [6]:
train_data = [parse_annotations(row['text'], row['label']) for _, row in df.loc[train_indices].iterrows()]
test_data = [parse_annotations(row['text'], row['label']) for _, row in df.loc[test_indices].iterrows()]

In [7]:
# Define the hyperparameter search space
hyperparameter_space = {
    'learning_rate': [0.001],
    'dropout': [0.5],
    'batch_size': [32],
    'epochs': [100]
}

best_model = None
best_score = 0.0

In [None]:
# Random search
for _ in range(4):  
    # Randomly sample hyperparameters from the search space
    hyperparameters = {
        'learning_rate': random.choice(hyperparameter_space['learning_rate']),
        'dropout': random.choice(hyperparameter_space['dropout']),
        'batch_size': random.choice(hyperparameter_space['batch_size']),
        'epochs': random.choice(hyperparameter_space['epochs'])
    }

    # Initialize blank English model
    nlp = spacy.blank("en")

    # Add the NER pipeline
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)  # Add NER to the pipeline

    # Add entity labels to NER
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])  # Add labels

    #  initialize the NER model before training
    nlp.initialize()

    # Training the NER model
    optimizer = nlp.begin_training()

    # Iterate over epochs
    for itn in range(hyperparameters['epochs']):
        random.shuffle(train_data)
        losses = {}

        # Batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., hyperparameters['batch_size'], 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            example = []
            # Update the model with each example
            for i in range(len(texts)):
                doc = nlp.make_doc(texts[i])
                example.append(Example.from_dict(doc, annotations[i]))
            nlp.update(example, drop=hyperparameters['dropout'], losses=losses)

        # Print the loss for each iteration
        print(f"Iteration {itn}, Losses: {losses}")


[2024-06-07 01:48:17,897] [INFO] Created vocabulary
[2024-06-07 01:48:17,899] [INFO] Finished initializing nlp object
[2024-06-07 01:48:17,985] [INFO] Created vocabulary
[2024-06-07 01:48:17,986] [INFO] Finished initializing nlp object


Iteration 0, Losses: {'ner': 3512.5801997977933}
Iteration 1, Losses: {'ner': 688.6714811632727}
Iteration 2, Losses: {'ner': 434.3008688483717}
Iteration 3, Losses: {'ner': 340.0586168058623}
Iteration 4, Losses: {'ner': 338.0330282848331}
Iteration 5, Losses: {'ner': 250.49893272804792}
Iteration 6, Losses: {'ner': 335.59012388698903}
Iteration 7, Losses: {'ner': 285.81241904914737}
Iteration 8, Losses: {'ner': 294.02123249997715}
Iteration 9, Losses: {'ner': 228.4792640380576}
Iteration 10, Losses: {'ner': 228.9052336738424}
Iteration 11, Losses: {'ner': 189.48829320832868}
Iteration 12, Losses: {'ner': 175.1593551433526}
Iteration 13, Losses: {'ner': 174.14461304779414}
Iteration 14, Losses: {'ner': 171.92541353640726}
Iteration 15, Losses: {'ner': 182.26094976526963}
Iteration 16, Losses: {'ner': 181.11814364237904}
Iteration 17, Losses: {'ner': 198.74080199235468}
Iteration 18, Losses: {'ner': 156.45967554244535}
Iteration 19, Losses: {'ner': 151.0060174244344}
Iteration 20, Loss

[2024-06-07 02:12:54,147] [INFO] Created vocabulary
[2024-06-07 02:12:54,148] [INFO] Finished initializing nlp object
[2024-06-07 02:12:54,219] [INFO] Created vocabulary
[2024-06-07 02:12:54,220] [INFO] Finished initializing nlp object


Iteration 0, Losses: {'ner': 3598.8656812410345}
Iteration 1, Losses: {'ner': 658.3558322167391}
Iteration 2, Losses: {'ner': 403.74719275085454}
Iteration 3, Losses: {'ner': 332.9022472837606}
Iteration 4, Losses: {'ner': 285.186128007819}
Iteration 5, Losses: {'ner': 262.38933404549107}
Iteration 6, Losses: {'ner': 264.5189173134098}
Iteration 7, Losses: {'ner': 271.79522954652396}
Iteration 8, Losses: {'ner': 227.93767994499328}
Iteration 9, Losses: {'ner': 277.4646287465672}
Iteration 10, Losses: {'ner': 263.3890384696252}
Iteration 11, Losses: {'ner': 182.45044952286887}
Iteration 12, Losses: {'ner': 191.1600735884892}
Iteration 13, Losses: {'ner': 190.33478325515804}
Iteration 14, Losses: {'ner': 206.7524435575789}
Iteration 15, Losses: {'ner': 163.6204605676376}
Iteration 16, Losses: {'ner': 185.77223867502838}
Iteration 17, Losses: {'ner': 175.2221985496436}
Iteration 18, Losses: {'ner': 182.70323980150317}
Iteration 19, Losses: {'ner': 185.21900744358638}
Iteration 20, Losses:

In [None]:
from spacy.scorer import Scorer

# Create a scorer object
scorer = Scorer()

# Iterate over the test data
for text, annotations in test_data:
    # Process the text with the trained model
    doc = nlp(text)
    
    # Create an Example object
    example = Example.from_dict(doc, annotations)
    
    # Update the scorer with the predictions from the model
    scores = scorer.score([example])  # Pass a list containing the single Example object

# 'scores' is now a dictionary with keys like 'ents_p', 'ents_r', 'ents_f' for entities' precision, recall, and F1-score
precision = scores['ents_p']
recall = scores['ents_r']
f1_score = scores['ents_f']
accuracy = scores['ents_acc']

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")


In [None]:
# Save the Model
model_path = '/kaggle/working/ner_model'
nlp.to_disk(model_path)


In [None]:
# Zip the Model Directory
import shutil

# Create a zip file from the model directory
shutil.make_archive(model_path, 'zip', model_path)


In [None]:
# Test the model
import spacy

# Load the saved model
nlp2 = spacy.load(model_path)

# Example new test data
test_texts = [
"Purchase, card *5744.  1689 RUB.  EAPTEKA.  Available 190.15 RUB",
"VISA0610 15:43 Purchase 1313 rubles YANDEX EDA Balance: 317 rubles",
"VISA0610 14:04 Purchase 593 RUR TATMAK Balance: 2466.14 RUR",
"Payment 211.00rub Card*1835 UBER.COM Balance 6903.73rub 13:48",
"Payment 211.00 rub Card*1835 UBER.COM Balance 6903.73 rub 13:48",
"Payment 211.00 RUB Card*1835 UBER.COM Balance 6903.73 RUB 13:48",
"Payment 123.00r Card*1835 https://taxi.ya Balance 7114.73r 21:47",
"Payment 592.39 rubles Card*7293 PYATEROCHKA 519 Balance 193.73 rubles 18:43",
"Transfer 2200.00r. Account*1898 Rahib f.  Balance 23.28r 13:22",
"Write-off 989.00rub Card*2465 Card2Card Balance 29.74rub 19:16",
"Write-off 989.00 rub Card*2465 Card2Card Balance 29.74 rub 19:16",
"Withdrawal RUR 30,000.00 Card*7293 D. 17, UL.  HUSA Balance 1681.64 USD 18:47",
"Deposit 300.00 USD Card*7293 D. 17, UL.  HUSA Balance 2100.05 USD 18:44",
"Receipt 1800.00 USD Account*1080 from YEMEN REPUBLIC EMBASSY Balance 1800.05 USD 17:27",
"Receipt 1000.00r. Account*1898 from Abdulmajid Balance 1005.58r. 08:18",
"Withdrawal RUR 400.00 Card*2465 D. 84, UL.  OSTR Balance 6988.27r 05:09",
"Receipt 190.00 USD Account*1080 from Abdulmalek E Balance 190.51 USD 06:44",
"Transfer from AKHMED KHASAN 4800.00rub Card*2465 Balance 12201.11rub 13:46",
"VISA0610 02:09 Transfer 12.15 rubles from Tinkoff Bank from RAHIB IMAD FADL N. Balance: 12.15 rubles",
"VISA0610 16:05 Deposit 17,600 rub. ATM 60000722 Balance: 17,600.47 rub.",
"VISA0610 10:08 Transfer 1000 rubles from Hezam A. Balance: 1000.56 rubles",
"VISA0610 17:37 crediting 1500rub VTB Balance: 1547.53rub",
"VISA0610 17:49 crediting 4860r VTB Balance: 5166.02r",
"VISA0610 13:32 Transfer 9162r from Yana L. Balance: 9612.70r",
"VISA0610 00:50 crediting 2917 rubles Tinkoff Bank Balance: 6209.64 rubles",
"VISA0610 15:53   Purchase 1337 RUR KULINAR GURU BURGERS Balance: 4525.84 RUR",
"VISA0610 11:54 transfer 1142r TINKOFF Balance: 9040.56r",
"VISA0610 15:27 transfer 1200 rubles Balance: 12,034.56 rubles",
"Replenishment, account RUB.  317 RUB.  Rahib Imad Fadl N. Available 507.15 RUB",
"Replenishment, account RUB.  1000 RUB.  Khezam A. Available 1507.15 RUB",
"Replenishment, account RUB.  500 RUB.  Hezam Abdulrahman Hezam A. Available 5395.15 RUB",
"Translation.  RUB account.  500 RUB.  Rahib Imad Fadl N. Balance 7923.68 RUB"
]

# Evaluate the model on each test text
for test_text in test_texts:
    doc = nlp2(test_text)
    print(f"Text: {test_text}")

    # Initialize empty dictionary to store entities
    entities = {"AMOUNT": "", "CURRENCY": "", "MERCHANT": "", "CARD": "", "BALANCE": ""}

    # Iterate over the entities and fill the dictionary
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_] = ent.text

    # Print the entities
    for label, text in entities.items():
        if text:  # Only print if the entity was found
            print(f"{label} = {text}")

    print()