In [1]:
import spacy
from spacy.training import Example
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.util import filter_spans
import random
from pathlib import Path
from itertools import chain

In [2]:
TRAIN_DATA = [
    ("I want to play Minecraft", {"entities": [(15, 24, "GAME")]}),
    ("Start playing Minecraft with my right hand", {"entities": [(14, 23, "GAME"), (32, 42, "HAND")]}),
    ("place a block down", {"entities": []}),
    ("I love Minecraft", {"entities": []}),
    ("hold my fist to place a block down", {"entities": [(8, 12, "GESTURE")]}),
    ("Let's play Rocket League", {"entities": [(11, 24, "GAME")]}),
    ("I use the joystick to drive in Horizon", {"entities": [(31, 38, "GAME"), (22, 27, "GESTURE")]}),
    ("FIFA is best played with a controller", {"entities": [(0, 4, "GAME"), (27, 37, "GESTURE")]}),
    ("Playing Tetris requires quick rotations", {"entities": [(8, 14, "GAME")]}),
    ("Cast a spell in Final Fantasy", {"entities": [(16, 29, "GAME")]}),
    ("Score a goal in FIFA using my feet", {"entities": [(16, 20, "GAME")]}),
    ("Build a tower in Minecraft with blocks", {"entities": [(17, 26, "GAME")]}),
    ("Launch the ball in Rocket League with a flip", {"entities": [(19, 32, "GAME")]}),
    ("Rotate pieces in Tetris quickly", {"entities": [(17, 23, "GAME"), (0, 6, "GESTURE")]}),
    ("Drive through the field in Horizon with speed", {"entities": [(27, 34, "GAME"), (0, 5, "GESTURE")]}),
    ("Perform a free kick in FIFA", {"entities": [(23, 27, "GAME"), (10, 19, "GESTURE")]}),
    ("Summon your ally in Final Fantasy", {"entities": [(20, 33, "GAME")]}),
]


In [20]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from pathlib import Path

def train_ner(model_dir="./ner_model", new_data=TRAIN_DATA, n_iter=10):
    # Load the pre-trained "en_core_web_sm" model
    if Path(model_dir).exists():
        print(f"Loading existing model from: {model_dir}")
        nlp = spacy.load(model_dir)  # Load the existing model
    else:
        print("Loading the pre-trained 'en_core_web_sm' model")
        nlp = spacy.load("en_core_web_sm")  # Load the pre-trained model if none exists
        if "ner" not in nlp.pipe_names:
            ner = nlp.add_pipe("ner", last=True)
        else:
            ner = nlp.get_pipe("ner")
        # Add new entity labels to the NER model
        for _, annotations in new_data:
            for ent in annotations.get("entities"):
                ner.add_label(ent[2])

    # Continue training the model
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    
    # Disable other pipeline components during training to focus training on the NER component
    with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != "ner"]):
        optimizer = nlp.resume_training()
        for itn in range(n_iter):
            losses = {}
            batches = minibatch(new_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example], drop=0.5, losses=losses, sgd=optimizer)
            print(f"Losses at iteration {itn}: {losses}")

    # Save the updated model
    nlp.to_disk(model_dir)
    print(f"Saved model to: {model_dir}")

if __name__ == "__main__":
    train_ner()


Loading the pre-trained 'en_core_web_sm' model
Losses at iteration 0: {'ner': 38.84356198094276}
Losses at iteration 1: {'ner': 37.36295401562617}
Losses at iteration 2: {'ner': 35.138318952281}
Losses at iteration 3: {'ner': 31.302118829939445}
Losses at iteration 4: {'ner': 28.114366880750822}
Losses at iteration 5: {'ner': 22.810956803546105}
Losses at iteration 6: {'ner': 19.101992157757024}
Losses at iteration 7: {'ner': 22.699482739957016}
Losses at iteration 8: {'ner': 19.316348836047005}
Losses at iteration 9: {'ner': 19.348964436276777}
Losses at iteration 10: {'ner': 15.071955375100451}
Losses at iteration 11: {'ner': 10.314000480588433}
Losses at iteration 12: {'ner': 12.355362414821819}
Losses at iteration 13: {'ner': 6.429380289445243}
Losses at iteration 14: {'ner': 6.787670726955305}
Losses at iteration 15: {'ner': 6.288120121424518}
Losses at iteration 16: {'ner': 5.009038514117714}
Losses at iteration 17: {'ner': 3.152630977118878}
Losses at iteration 18: {'ner': 5.320

Losses at iteration 152: {'ner': 1.5006812643122097e-06}
Losses at iteration 153: {'ner': 0.36617435432415935}
Losses at iteration 154: {'ner': 4.785157607960906e-10}
Losses at iteration 155: {'ner': 2.0322615682867977}
Losses at iteration 156: {'ner': 0.001551886632788568}
Losses at iteration 157: {'ner': 0.012182387042223944}
Losses at iteration 158: {'ner': 1.1616918644281524e-08}
Losses at iteration 159: {'ner': 0.22900598879804238}
Losses at iteration 160: {'ner': 9.498381067012785e-08}
Losses at iteration 161: {'ner': 4.801329190902113e-09}
Losses at iteration 162: {'ner': 0.00025933650643049914}
Losses at iteration 163: {'ner': 2.630231781675002e-05}
Losses at iteration 164: {'ner': 7.701292894871316e-05}
Losses at iteration 165: {'ner': 7.70642589298445e-09}
Losses at iteration 166: {'ner': 9.223506415247357e-07}
Losses at iteration 167: {'ner': 0.001180002961483534}
Losses at iteration 168: {'ner': 1.0185602398868738e-05}
Losses at iteration 169: {'ner': 7.4086630298678595e-09

In [34]:
import spacy
import json
def predict(text):
    nlp = spacy.load("./ner_model")  # Ensure this path is correct
    doc = nlp(text)
    for token in doc:
        print(token.text, token.ent_type_)
    for ent in doc.ents:
        print("Entity:", ent.text, ent.label_)

if __name__ == "__main__":
    predict_text = "I like Fornite"
    predict_to_json(predict_text, "prediction_output.json")
    predict(predict_text)


I 
like 
Fornite 


In [43]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Note: you may need to restart the kernel to use updated packages.Looking in indexes: https://download.pytorch.org/whl/cu118

Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.2.1%2Bcu118-cp39-cp39-win_amd64.whl (2704.2 MB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.17.1%2Bcu118-cp39-cp39-win_amd64.whl (4.9 MB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.2.1%2Bcu118-cp39-cp39-win_amd64.whl (4.0 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.2.1+cu118 torchaudio-2.2.1+cu118 torchvision-0.17.1+cu118


In [5]:
# Import necessary libraries
from transformers import AutoModel, AutoTokenizer
import torch
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
import pandas as pd

# Simulated dataset: Normally, you'd load a dataset from a file
data = [
    {"sentence1": "move the ball", "sentence2": "pass the ball", "similarity": 0.9},
    {"sentence1": "move the ball", "sentence2": "walk", "similarity": 0.1},
    {"sentence1": "move the ball", "sentence2": "punt the ball", "similarity": 0.8},
    {"sentence1": "move the ball", "sentence2": "run", "similarity": 0.2},
]

# Convert the simulated dataset to a pandas DataFrame
df = pd.DataFrame(data)

# Convert the DataFrame to a list of InputExample objects
examples = [InputExample(texts=[row['sentence1'], row['sentence2']], label=row['similarity']) for index, row in df.iterrows()]

# Initialize DataLoader
train_dataloader = DataLoader(examples, shuffle=True, batch_size=2)

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define the training method using cosine similarity loss
train_loss = losses.CosineSimilarityLoss(model=model)

# Assuming a very small dataset, let's skip validation for this example.
# In a real scenario, you should split your data and use a validation set.

# Fine-tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=4,  # Adjust epochs based on your dataset size and complexity
          warmup_steps=100,
          output_path="./fine-tuned-model"  # Change this to where you want to save your model
         )

# Load the fine-tuned model (optional if you continue using the same model object)
model = SentenceTransformer("./fine-tuned-model")

# Now you can use the model as before to generate embeddings and calculate similarities
# The target phrase and phrases to compare
target_phrase = "move the ball"
phrases = ["walk", "pass", "punt", "run","kick"]

# Generate embeddings for each phrase
target_embedding = model.encode(target_phrase)
phrase_embeddings = model.encode(phrases)

# Calculate and print the cosine similarity between the target and each phrase
similarities = {}
for phrase, embedding in zip(phrases, phrase_embeddings):
    # Compute cosine similarity (note: 1 - cosine distance to get similarity)
    similarity = 1 - cosine(target_embedding, embedding)
    similarities[phrase] = similarity

# Find the most similar phrase
most_similar_phrase = max(similarities, key=similarities.get)
print(f"The phrase most similar to '{target_phrase}' is: '{most_similar_phrase}' with a similarity score of {similarities[most_similar_phrase]:.4f}")

# Optional: print all similarities for comparison
for phrase, similarity in similarities.items():
    print(f"Similarity to '{phrase}': {similarity:.4f}")


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

The phrase most similar to 'move the ball' is: 'punt' with a similarity score of 0.4493
Similarity to 'walk': 0.3580
Similarity to 'pass': 0.3633
Similarity to 'punt': 0.4493
Similarity to 'run': 0.3016
Similarity to 'kick': 0.4475
