In [2]:
import json
import nltk
import string
import re
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
def data_preprocess(description):
    description = description.lower()
    description = re.sub(r"<[^>]+>", "", description)
    tokens = nltk.word_tokenize(description)
    filtered_tokens = [token for token in tokens if token not in string.punctuation]
    stopwords = nltk.corpus.stopwords.words("english")
    filtered_tokens = [token for token in filtered_tokens if token.lower() not in stopwords]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    tagged_tokens = nltk.pos_tag(lemmatized_tokens)
    return tagged_tokens


In [5]:
def get_bert_embeddings(tokens):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = TFBertModel.from_pretrained('bert-base-uncased')
    inputs = tokenizer(tokens, return_tensors='tf', padding=True, truncation=True, is_split_into_words=True)
    outputs = model(inputs)
    pooled_output = outputs.last_hidden_state[:, 0, :]

    return pooled_output

In [6]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [7]:
proxy_dataset = {
    "Users are reporting issues with the application's login process. When attempting to log in with their credentials, the system fails to authenticate and displays an error message. This problem seems to occur intermittently across different devices and browsers. The login button appears unresponsive in some instances, preventing users from accessing their accounts. We need to investigate and resolve this issue promptly to ensure seamless user experience and maintain customer satisfaction.":
    "Check the authentication server logs and ensure the server is running. Restart the server if necessary. Verify that the login endpoint is properly configured and not experiencing downtime. Test the login process across various devices and browsers to identify any specific issues. Consider implementing a backup authentication server to handle login requests if the primary server fails.",

    "Users are experiencing delays when trying to upload files to the application. The upload process takes an unusually long time, and in some cases, it fails completely without any error message. This issue is affecting productivity as users rely on file uploads for their daily tasks.":
    "Inspect the server load and bandwidth usage to identify any bottlenecks. Ensure the file upload service is optimized for handling large files. Check for any network issues that might be causing delays. Implement error handling to provide feedback to users when uploads fail. Consider using a content delivery network (CDN) to improve upload speeds.",

    "The application's search functionality is returning incomplete or incorrect results. Users are unable to find the information they need, which is impacting their ability to perform their tasks effectively. This issue seems to be related to the indexing of the search data.":
    "Verify that the search indexing service is running correctly and is up-to-date. Rebuild the search index to ensure all data is accurately indexed. Check for any errors in the search algorithm that might be causing incorrect results. Enhance the search functionality by implementing advanced search filters and options.",

    "Customers are complaining about receiving duplicate email notifications from the system. This issue is causing confusion and frustration among users as they receive multiple emails for the same event or action.":
    "Review the email notification system to identify any configuration errors. Ensure that duplicate notifications are not being triggered by multiple events. Implement checks to prevent the same notification from being sent more than once. Monitor the email queue and logs to detect and resolve any issues promptly.",

    "The application's user interface is not displaying correctly on certain mobile devices. Some elements are misaligned, and the layout appears broken, making it difficult for users to navigate and use the application effectively.":
    "Test the application's user interface on various mobile devices to identify specific issues. Adjust the CSS and responsive design settings to ensure compatibility with different screen sizes. Fix any misaligned elements and broken layouts. Implement a mobile-first design approach to improve usability on mobile devices.",

    "Users are reporting that the application crashes unexpectedly during usage. This issue occurs sporadically and does not follow a predictable pattern, making it difficult to identify the root cause.":
    "Collect and analyze crash logs to identify any common patterns or errors. Ensure that all dependencies and libraries are up-to-date and compatible. Implement additional logging and monitoring to capture detailed information about the crashes. Perform thorough testing to replicate and resolve the issue.",

    "The application's performance has degraded significantly during peak usage times. Users experience slow response times and lag, which affects their ability to use the application efficiently.":
    "Evaluate the application's performance metrics to identify bottlenecks. Optimize the database queries and server configuration to handle higher loads. Implement load balancing to distribute traffic evenly across servers. Consider scaling up the infrastructure to accommodate peak usage times.",

    "Users are unable to reset their passwords using the password recovery feature. The system does not send the password reset email, leaving users unable to regain access to their accounts.":
    "Verify that the email service is configured correctly and is operational. Check the logs for any errors related to the password reset process. Ensure that the password reset tokens are being generated and sent correctly. Provide users with alternative methods for password recovery if needed.",

    "The application's API is returning errors when third-party services attempt to connect. This issue is preventing integrations with other systems and affecting functionality that relies on external data.":
    "Review the API documentation and ensure that all endpoints are correctly implemented. Check the API logs for any errors or issues. Verify that the API keys and authentication methods are working as expected. Test the API integration with third-party services to identify and resolve any compatibility issues.",

    "Users are experiencing issues with the application's payment processing. Transactions are failing, and users are unable to complete their purchases, leading to lost sales and revenue.":
    "Inspect the payment gateway configuration to ensure it is set up correctly. Check the transaction logs for any errors or failed attempts. Verify that the payment gateway is not experiencing downtime or technical issues. Implement fallback payment options to ensure users can complete their purchases."
}

In [8]:
embeddings_dict = {}
for query, solution in proxy_dataset.items():
    preprocessed_query = data_preprocess(query)
    query_embedding = get_bert_embeddings(preprocessed_query)
    embeddings_dict[query] = {
        "embedding": query_embedding,
        "solution": solution
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [10]:
print(embeddings_dict)

{"Users are reporting issues with the application's login process. When attempting to log in with their credentials, the system fails to authenticate and displays an error message. This problem seems to occur intermittently across different devices and browsers. The login button appears unresponsive in some instances, preventing users from accessing their accounts. We need to investigate and resolve this issue promptly to ensure seamless user experience and maintain customer satisfaction.": {'embedding': <tf.Tensor: shape=(45, 768), dtype=float32, numpy=
array([[-0.2904942 , -0.04644941,  0.09957866, ..., -0.30919862,
         0.23370537,  0.7566181 ],
       [-0.3806938 ,  0.14694709, -0.23550798, ...,  0.00903959,
        -0.00651567,  0.23851702],
       [-0.6959207 , -0.20002419, -0.06681649, ..., -0.0264198 ,
         0.16593844,  0.542137  ],
       ...,
       [ 0.01390023,  0.05583616, -0.14046824, ..., -0.28249547,
        -0.19555138,  0.92566043],
       [-0.3124373 ,  0.045

In [11]:
print(type(embeddings_dict))

<class 'dict'>


In [12]:
import faiss
import numpy as np
dimension = 768  # BERT base embedding dimension
index = faiss.IndexFlatL2(dimension)

In [13]:
index.reset()

In [14]:
embedding_vectors = []
solutions = []

In [15]:
for query, data in embeddings_dict.items():
    if query in proxy_dataset:
        embedding = data['embedding'].numpy()
        averaged_embedding = np.mean(embedding, axis=0, keepdims=True) #taking mean of all berts

        solution = proxy_dataset[query]

        embedding_vectors.append(averaged_embedding)
        solutions.append(solution)

In [16]:
print(f"Number of valid embeddings before stacking: {len(embedding_vectors)}")
print(f"Number of valid solutions: {len(solutions)}")

Number of valid embeddings before stacking: 10
Number of valid solutions: 10


In [17]:
if embedding_vectors:
    embedding_vectors = np.vstack(embedding_vectors)
    print(f"embedding shape: {embedding_vectors.shape}")
    index.add(embedding_vectors)
    print(f"Number of vectors in the FAISS: {index.ntotal}")
else:
    print("No valid embeddings found to add to the FAISS index.")

embedding shape: (10, 768)
Number of vectors in the FAISS: 10


In [18]:
def preprocess_and_embed(query):
    preprocessed_query = data_preprocess(query)
    tokens = [" ".join([word for word, tag in preprocessed_query])]
    embedding = get_bert_embeddings(tokens)
    return embedding

In [21]:
def find_solution(query):
    preprocessed_query = data_preprocess(query)
    query_embedding = get_bert_embeddings(preprocessed_query).numpy()
    if query_embedding.shape[1] != 768:
        raise ValueError(f"Query embedding has incorrect shape: {query_embedding.shape}")

    # Search in FAISS index
    distances, indices = index.search(query_embedding, 1)

    print(f"Distances found: {distances}")
    print(f"Indices found: {indices}")
    if len(solutions) <= indices[0][0]:
        raise IndexError(f"Index {indices[0][0]} is out of range for solutions list of length {len(solutions)}.")
    solution = solutions[indices[0][0]]
    return solution

In [22]:
new_query = "Users are unable to log in to the application, facing intermittent issues."
solution = find_solution(new_query)
ans=solution

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Distances found: [[12.798648 ]
 [12.367964 ]
 [26.413021 ]
 [ 7.6448293]
 [15.994676 ]
 [11.970928 ]
 [18.052631 ]]
Indices found: [[7]
 [4]
 [1]
 [8]
 [4]
 [4]
 [3]]


In [23]:
print(ans)

Verify that the email service is configured correctly and is operational. Check the logs for any errors related to the password reset process. Ensure that the password reset tokens are being generated and sent correctly. Provide users with alternative methods for password recovery if needed.
