In [1]:
import warnings
warnings.filterwarnings("ignore")

## <center> BERT Based - NER </center>

In [2]:
import json
with open("Data.txt", "r") as f:
    # Read the entire file content and join lines into a single string
    cars_data = json.loads("".join(f.readlines()))

In [6]:
from tqdm.notebook import tqdm

def CreateMetadataandTextfromJSON(json_data):
    vehicle_data = {
        "vin": json_data['vin'],
        "vehicle_info": {
            "vehicle_serial_number": json_data['serialNbr'],
            "vehicle_body_style_description": json_data['bodyStyleDesc'],
            "vehicle_brand": json_data['brand'],
            "marketing_grade_code": json_data['marketingGrade']['code'],
            "marketing_grade_title": json_data['marketingGrade']['title']
        },
        "engine_info": {
            "engine_code": json_data['engine']['engineCd'],
            "engine_number": json_data['engine']['engineNbr'],
            "engine_name": json_data['engine']['name'],
            "engine_fuel_type": json_data['engine']['fuelType'],
            "engine_cylinders_count": json_data['engine']['noOfCylinders'],
            "engine_horsepower": json_data['engine']['horsepower']
        },
        "price_info": {
            "price_optional_total_msrp": json_data['price']['optTotalMsrp'],
            "price_total_msrp": json_data['price']['totalMsrp'],
            "price_base_msrp": json_data['price']['baseMsrp'],
            "price_ppo_holdback": json_data['price']['ppoHoldback']
        },
        "color_info": {
            "interior_color_code": json_data['intColor']['colorCd'],
            "interior_color_nvs_name": json_data['intColor']['nvsName'],
            "exterior_color_code": json_data['extColor']['colorCd'],
            "exterior_color_nvs_name": json_data['extColor']['nvsName'],
            "exterior_color_hex_code": json_data['extColor']['colorHexCd'],
            "exterior_color_common_name_display": json_data['extColor']['commonName']['display'],
            "exterior_color_common_name_generic": json_data['extColor']['commonName']['generic'],
            "exterior_color_common_name_specific": json_data['extColor']['commonName']['specific']
        },
        "drivetrain_info": {
            "drivetrain_code": json_data['drivetrain']['code'],
            "drivetrain_title": json_data['drivetrain']['title']
        }
    }

    return vehicle_data

In [13]:
def CategoryField(data):
    res = {}
    for key in data.keys():
        # Initialize the string to accumulate data
        combined_string = ''
        
        # Check if the field is a dictionary, meaning it contains subfields
        if isinstance(data[key], dict):
            for sub_key in data[key].keys():
                try:
                    # Add the subfield value to the combined string
                    combined_string += " " + str(data[key][sub_key])
                except:
                    print(f"Not Data Found for {sub_key}")
        
        # If combined_string has content, store it in the result dictionary
        if combined_string:
            res[key] = combined_string.strip()  # Strip to remove leading/trailing spaces
            
        # Optionally print the key and combined value for debugging
        # print(f"Key: {key}, Combined String: {combined_string}")
    
    return res  # Return the result dictionary after processing

In [14]:
from tqdm.notebook import tqdm

In [59]:
vehicle_info = ''
engine_info = ''
color_info = ''
drivetrain_info = ''


for idx in tqdm(range(len(cars_data['items']))):
    try:
        temp_data = CreateMetadataandTextfromJSON(cars_data['items'][idx])
        vehicle_info += ' ' + CategoryField(temp_data)['vehicle_info']
        engine_info += ' ' + CategoryField(temp_data)['engine_info']
        color_info += ' ' + CategoryField(temp_data)['color_info']
        drivetrain_info +=' ' + CategoryField(temp_data)['drivetrain_info']
    except:
        continue

  0%|          | 0/600 [00:00<?, ?it/s]

In [117]:
un = [set(vehicle_info.split(" "))]

In [29]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
    device="cpu", 
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False, "cache_folder":"./"}
)

<All keys matched successfully>


In [60]:
import spacy

# Load the spaCy English model
nlp = spacy.load("en_core_web_lg")

def CleanQuery(text):
    # Process the query with spaCy NLP pipeline
    doc = nlp(text)

    # Remove stop words
    filtered_tokens = [token.text for token in doc if not token.is_stop]

    # Join the filtered tokens into a cleaned query
    filtered_query = " ".join(filtered_tokens)
    
    return filtered_query

In [61]:
vehicle_embed = model.encode(vehicle_info).tolist()
engine_embed = model.encode(engine_info).tolist()
color_embed = model.encode(color_info).tolist()
drivetrain_embed = model.encode(drivetrain_info).tolist()

In [47]:
query = "Show the Camry XSE Engine 2024 FWD in white with price less than 40000 Engine G46 042035YFT4MCE9RP2 and 616073"

In [56]:
FilteredQuery = set(CleanQuery(query).split(" "))
print("Token Count: {}".format(len(FilteredQuery)))

Token Count: 11


In [54]:
QueryTokens = []

for token in FilteredQuery:
    QueryTokens.append({
        "token":token,
        "embedding":model.encode(token).tolist()
    })

In [97]:
def cosine_similarity(vec1, vec2):
    """
    Calculates the cosine similarity between two vectors.

    Args:
        vec1: A NumPy array representing the first vector.
        vec2: A NumPy array representing the second vector.

    Returns:
        The cosine similarity score between the two vectors.
    """

    dot_product = np.dot(vec1, vec2.T)  # Transpose vec2 for valid matrix multiplication
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2, axis=1)  # Calculate norms for each row of vec2
    return dot_product / (norm_vec1 * norm_vec2)

In [98]:
import numpy as np
import pandas as pd

In [109]:
vec1 = np.matrix([drivetrain_embed,color_embed, engine_embed, vehicle_embed])
vec1.shape

(4, 768)

In [110]:
idx

3

In [111]:
idx = 3
print("Token: {}".format(QueryTokens[idx]['token']))
vec2 = np.matrix(QueryTokens[idx]['embedding'])
vec2.shape

Token: XSE


(1, 768)

In [112]:
cosine_similarity(vec1, vec2)

matrix([[0.24409141],
        [0.13395017],
        [0.17470156],
        [0.15273087]])

In [128]:
from transformers import BertTokenizer, AutoModelForSequenceClassification
import torch

In [129]:
# Define the path to your checkpoint folder
checkpoint_path = "BERT/checkpoint-3500/"  # Adjust this to your folder's location

# Replace the tokenizer with a pretrained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load your fine-tuned model from the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)


# Set the model to evaluation mode
model.eval()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at BERT/checkpoint-3500/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [137]:
label_map = {"O": 0, "color_info": 1, "vehicle_info": 2, "engine_info": 3, "price_info": 4, "drivetrain_info": 5, "vin": 6}

In [149]:
# Example text input for prediction
input_text = "Hycross"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)

# The outputs include logits. Apply softmax to get probabilities if needed
logits = outputs.logits
predictions = torch.softmax(logits, dim=1)

# Print the predictions
print("Logits:", logits)
print("Predicted probabilities:", predictions)

# If it's a classification model, get the predicted label
predicted_label = torch.argmax(predictions, dim=1).item()
print("Predicted label:", predicted_label)

Logits: tensor([[ 0.1715, -0.0677, -0.4886,  0.1582, -0.0631, -0.5511, -0.0083]])
Predicted probabilities: tensor([[0.1851, 0.1457, 0.0957, 0.1827, 0.1464, 0.0899, 0.1546]])
Predicted label: 0


In [150]:
label_map

{'O': 0,
 'color_info': 1,
 'vehicle_info': 2,
 'engine_info': 3,
 'price_info': 4,
 'drivetrain_info': 5,
 'vin': 6}

### Key Val Pair RnD

* Handle Fuzzy
    - brand - TOYOTA
    - grade - xse - > XSE/VS SE
    - drivetrain - FWD/Front-Wheel Drive
    - marketingTitle - Prius Prime XSE 2.0L 4-Cyl. Plug-in Hybrid Engine Front-Wheel Drive
    - text

* Workflow
    - Query -> Remove Stop Words -> Search In MasterData for Key Val -> UnMatch Tokens get into -> Fuzzy - Fuzzy Results -> Final Query [Match  Tokens from Master Data + Fuzzy Resulst] -> Regex -> KeyVal Pair