In [48]:
from transformers import pipeline

commands = [
    "zoom in to Delhi",
    "zoom out of Lucknow",
    "find directions to Bangalore",
    "navigate to New York",
    "look out for Dharwad"
]
ner = pipeline("ner", grouped_entities=True)


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:
import re
def remove_ner_entities(command):
    ner_results = ner(command)
    # Extracting the entities
    entities = [entity['word'] for entity in ner_results]
    # Removing entities from command
    for entity in entities:
        command = command.replace(entity, '').strip()
    return command
cleaned_commands = []    
# Process commands
for command in commands:
    cleaned_command = remove_ner_entities(command)
    cleaned_command = re.sub(r'\bzoom in\b', 'zoom_in', cleaned_command, flags=re.IGNORECASE)    # Store cleaned command
    cleaned_command = re.sub(r'\bzoom out\b', 'zoom_out', cleaned_command, flags=re.IGNORECASE)    # Store cleaned command

    cleaned_commands.append(cleaned_command)

    print(f"Original Command: '{command}' => Cleaned Command: '{cleaned_command}'")

Original Command: 'zoom in to Delhi' => Cleaned Command: 'zoom_in to'
Original Command: 'zoom out of Lucknow' => Cleaned Command: 'zoom_out of'
Original Command: 'find directions to Bangalore' => Cleaned Command: 'find directions to'
Original Command: 'navigate to New York' => Cleaned Command: 'navigate to'
Original Command: 'look out for Dharwad' => Cleaned Command: 'look out for wad'


In [50]:
import nltk
from nltk.corpus import stopwords

# Download stop words once
nltk.download('stopwords')

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Function to remove stop words from a command
def remove_stopwords(command):
    words = command.split()  # Split the command into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stop words
    return " ".join(filtered_words)

# Function to remove stop words from a list of commands
def process_commands(commands):
    filtered_commands = [remove_stopwords(command) for command in commands]
    return filtered_commands

filtered_commands = process_commands(cleaned_commands)

# Print original and filtered commands
for original, filtered in zip(commands, filtered_commands):
    print(f"Original Command: {original}")
    print(f"Filtered Command (without stop words): {filtered}")
    print()


Original Command: zoom in to Delhi
Filtered Command (without stop words): zoom_in

Original Command: zoom out of Lucknow
Filtered Command (without stop words): zoom_out

Original Command: find directions to Bangalore
Filtered Command (without stop words): find directions

Original Command: navigate to New York
Filtered Command (without stop words): navigate

Original Command: look out for Dharwad
Filtered Command (without stop words): look wad



[nltk_data] Downloading package stopwords to /home/sds/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_embedding(word):
    # Tokenize and get embeddings for a word
    inputs = tokenizer(word, return_tensors="pt")
    outputs = model(**inputs)
    # The BERT model returns multiple embeddings; we take the mean of the last hidden state
    return torch.mean(outputs.last_hidden_state, dim=1).detach().numpy()

def classify_command_with_best_match(command):
    # Define target keywords and their respective actions
    target_keywords = {
        "zoom_in": "Zoom_IN",
        "zoom_out": "Zoom_OUT",
        "directions": "Directions",
        "search": "Search"
    }
    
    # Get embeddings for all target keywords
    target_embeddings = {word: get_embedding(word) for word in target_keywords}
    
    # Split the command into words
    words_in_command = command.split()

    # Initialize an empty list to store the similarity matrix
    similarity_matrix = []

    # Store the best matching word for each target keyword
    best_matches = {key: {"word": None, "similarity": 0.0} for key in target_keywords}

    # Loop over each word in the command
    for word in words_in_command:
        word_embedding = get_embedding(word)
        
        # List to store similarities for the current word
        word_similarities = []
        
        # Compare the word's embedding with all target embeddings
        for target_word, target_embedding in target_embeddings.items():
            similarity = cosine_similarity(word_embedding, target_embedding)[0][0]  # Extract scalar value
            word_similarities.append(similarity)  # Add similarity score to the list for this word

            # Check if this word is the best match for the current target_word
            if similarity > best_matches[target_word]["similarity"]:
                best_matches[target_word]["similarity"] = similarity
                best_matches[target_word]["word"] = word
        
        # Add the similarity list for the current word to the matrix
        similarity_matrix.append(word_similarities)
    
    # Return the similarity matrix, target keywords, and the best match for each target keyword
    return similarity_matrix, list(target_keywords.keys()), best_matches

# Example usage with a list of commands
commands =filtered_commands

# Run the classification with best match detection


In [53]:
filtered_commands

['zoom_in', 'zoom_out', 'find directions', 'navigate', 'look wad']

In [71]:
for command in commands:
    similarity_matrix, target_words, best_matches = classify_command_with_best_match(command)
    print(similarity_matrix)
    print(target_words)
    print(best_matches)
    print("\n")

[[0.9999999, 0.9160377, 0.5723879, 0.6037229]]
['zoom_in', 'zoom_out', 'directions', 'search']
{'zoom_in': {'word': 'zoom_in', 'similarity': 0.9999999}, 'zoom_out': {'word': 'zoom_in', 'similarity': 0.9160377}, 'directions': {'word': 'zoom_in', 'similarity': 0.5723879}, 'search': {'word': 'zoom_in', 'similarity': 0.6037229}}


[[0.9160377, 1.0000001, 0.56481737, 0.6461116]]
['zoom_in', 'zoom_out', 'directions', 'search']
{'zoom_in': {'word': 'zoom_out', 'similarity': 0.9160377}, 'zoom_out': {'word': 'zoom_out', 'similarity': 1.0000001}, 'directions': {'word': 'zoom_out', 'similarity': 0.56481737}, 'search': {'word': 'zoom_out', 'similarity': 0.6461116}}


[[0.6297178, 0.64237964, 0.8288053, 0.88562953], [0.5723879, 0.56481737, 1.0000002, 0.7762666]]
['zoom_in', 'zoom_out', 'directions', 'search']
{'zoom_in': {'word': 'find', 'similarity': 0.6297178}, 'zoom_out': {'word': 'find', 'similarity': 0.64237964}, 'directions': {'word': 'directions', 'similarity': 1.0000002}, 'search': {'word':

In [72]:
    best_matches.items()

dict_items([('zoom_in', {'word': 'look', 'similarity': 0.6087385}), ('zoom_out', {'word': 'look', 'similarity': 0.6257511}), ('directions', {'word': 'look', 'similarity': 0.82745343}), ('search', {'word': 'look', 'similarity': 0.8460685})])

In [86]:
for command in commands:
    similarity_matrix, target_words, best_matches = classify_command_with_best_match(command)

    highest_similarity = 0
    best_match_target_word = None  # Store the target word instead of the actual command

    print(f"\nCommand: {command}")
    
    for target_word, match_info in best_matches.items():
        # Print out the match information for each target word
        
        # Check for the highest similarity
        if match_info['similarity'] > highest_similarity:
            highest_similarity = match_info['similarity']
            best_match_target_word = target_word  # Update to store the target word

    # Print the target word with the highest similarity for this command
    print(f"The command with the highest similarity for '{command}': '{best_match_target_word}' (Similarity: {highest_similarity:.4f})")



Command: zoom_in
The command with the highest similarity for 'zoom_in': 'zoom_in' (Similarity: 1.0000)

Command: zoom_out
The command with the highest similarity for 'zoom_out': 'zoom_out' (Similarity: 1.0000)

Command: find directions
The command with the highest similarity for 'find directions': 'directions' (Similarity: 1.0000)

Command: navigate
The command with the highest similarity for 'navigate': 'search' (Similarity: 0.8668)

Command: look wad
The command with the highest similarity for 'look wad': 'search' (Similarity: 0.8461)
