In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from qdrant_client import QdrantClient
from qdrant_client.http import models
import requests
import traceback


# Step 1: Load the corpus table from CSV (its 'text' column is what gets embedded below)
df = pd.read_csv('output_embeddings.csv')

# Step 2: Load the pre-trained tokenizer and encoder from one shared checkpoint name
checkpoint_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

# Step 3: Define function to get embeddings
def get_embeddings(text):
    """Embed one text as the mean of the encoder's last-layer hidden states.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array containing the last hidden state averaged over dim 1,
        with size-1 dimensions squeezed out.
    """
    # Tokenize with truncation enabled: without it an input longer than the
    # tokenizer's configured maximum length is handed to the model unshortened.
    # NOTE(review): the effective limit comes from the checkpoint's tokenizer
    # config (model_max_length) - confirm it matches the encoder's limit.
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        add_special_tokens=True,
        truncation=True,
    )
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        outputs = model(input_ids)
    # Average the last layer over dim 1, squeeze, and convert to NumPy
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

# Step 4: Embed the input question
input_question = "can we trust chatgpt?"
input_question_embeddings = get_embeddings(input_question)

# Step 5: Embed every entry of the 'text' column and keep the vectors on the DataFrame
df['text_embeddings'] = df['text'].apply(get_embeddings)

# Step 6: Cosine similarity of the question against every text entry.
# The result has a single row (one query) with one score per text entry.
cos_sim_scores = cosine_similarity([input_question_embeddings], list(df['text_embeddings']))

# Step 7: Position of the best score within that single row
most_similar_index = int(np.argmax(cos_sim_scores[0]))

# Step 8: Retrieve the best match and its score.
# argmax yields a 0-based position, so the row is fetched positionally with
# .iloc; label-based .loc only agrees with it when the index is 0..n-1.
most_similar_text = df['text'].iloc[most_similar_index]
cosine_similarity_score = cos_sim_scores[0][most_similar_index]

# Step 9: Print the results
print(f"Input Question: {input_question}\n")
print(f"Most Similar Text Index: {most_similar_index}")
print(f"Most Similar Text: {most_similar_text}")
print(f"Cosine Similarity Score: {cosine_similarity_score}")


Input Question: can we trust chatgpt?

Most Similar Text Index: 79
Most Similar Text: Bug, Fault, Error, or Weakness: Demystifying Software Security Vulnerabilities -CONCLUSIONThe results from ChatGPT queries show we must rely on more than just AI to discern concepts. Underthe ChatGPT's hood lays a model that learns from all over the Internet, including misunderstandings. In parallel to the ancient Oracle of Delphi, the caller should be well prepared to provide the right questions and context; otherwise, the reasoning may be misleading and the result disastrous. Using our software security expertise to pose more and more tuned questions, thus providing more context, eventually we got ChatGPT to at least partially discern our own reasoning. The collective knowledge seems to approve the direction we are delving in via our BF research.
Cosine Similarity Score: 0.3531416058540344


In [7]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer
import uuid
import numpy as np

# Qdrant client for "localhost" on port 6333, plus the sentence encoder used for indexing
client = QdrantClient("localhost", port=6333)
MODEL = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Name of the CSV column that holds the text to index
text_column_name = 'text'

# Load the CSV into a DataFrame
df = pd.read_csv('output_embeddings.csv')

# (Re)create the target collection with 768-dimensional vectors and cosine distance.
# NOTE(review): the checkpoint name ends in "dot-v1" - confirm COSINE is the intended metric.
vector_params = models.VectorParams(size=768, distance=models.Distance.COSINE)
client.recreate_collection(
    collection_name="similar_text",
    vectors_config=vector_params,
)

# Upload the text column in batches: each batch costs one encode call and one
# blocking upsert, instead of one of each per DataFrame row.
UPLOAD_BATCH_SIZE = 64

all_texts = df[text_column_name].tolist()

for batch_start in range(0, len(all_texts), UPLOAD_BATCH_SIZE):
    batch_texts = all_texts[batch_start:batch_start + UPLOAD_BATCH_SIZE]

    # Encoding a list returns one vector per input text, in input order
    batch_vectors = MODEL.encode(batch_texts).tolist()

    # One point per text: a fresh UUID as the point id, and a payload with the
    # same keys as before ("text" and "text_embeddings")
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload={"text": text, "text_embeddings": vector},
        )
        for text, vector in zip(batch_texts, batch_vectors)
    ]

    # Upsert the whole batch and wait for the server to apply it
    client.upsert(
        collection_name="similar_text",
        wait=True,
        points=points,
    )

print("Data upload completed.")

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Data upload completed.


In [7]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer
import uuid
import numpy as np

# Qdrant client for "localhost" on port 6333; used by search_similar_text below
client = QdrantClient("localhost", port=6333)
# Sentence encoder whose .encode() output is used as the query vector
MODEL = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Function to get embeddings for a question
def get_question_embeddings(question):
    """Encode a question with the module-level MODEL.

    Args:
        question: The query text to embed.

    Returns:
        The result of `MODEL.encode(question)` converted with `.tolist()`.
    """
    return MODEL.encode(question).tolist()

# Function to perform similarity search
def search_similar_text(question, collection_name="similar_text", limit=3):
    """Embed `question` and search a Qdrant collection with the resulting vector.

    Args:
        question: The query text; embedded via get_question_embeddings.
        collection_name: Collection to search.
        limit: Number of results to request.

    Returns:
        Whatever `client.search` returns for the query.
    """
    # Build the request first (this embeds the question), then send it
    search_kwargs = {
        "collection_name": collection_name,
        "query_vector": get_question_embeddings(question),
        "limit": limit,
    }
    return client.search(**search_kwargs)

# Query to look up in the collection
question = "what is a vulnerability?"
similar_texts = search_similar_text(question)

# Report header, followed by one numbered entry per search result
print(f"Input Question: {question}\n")
print("Similar Texts and Their Similarity Scores:\n")

for rank, hit in enumerate(similar_texts, start=1):
    # Read both fields before printing anything for this result
    hit_text, hit_score = hit.payload["text"], hit.score
    print(f"Result {rank}:")
    print(f"Text: {hit_text}")
    print(f"Similarity Score: {hit_score}\n")

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Input Question: what is a vulnerability?

Similar Texts and Their Similarity Scores:

Result 1:
Text: A DECADE OF REOCCURRING SOFTWARE WEAKNESSES- Cybersecurity vulnerabilities Weaknesses: Classifying vulnerabilitiesWhile we define a vulnerability in terms of a weakness, it is hard to define a weakness itself. As different vulnerabilities may be associated with the same weakness type, we could look at a weakness type as a class and a vulnerability as an instance of that class. Although it is uncommon, a single vulnerability could be associated with two or more weaknesses exploited sequentially or in parallel. In that sense, a vulnerability is a set with one or more instances of weaknesses.
Similarity Score: 0.7220073

Result 2:
Text: A DECADE OF REOCCURRING SOFTWARE WEAKNESSES- Cybersecurity vulnerabilitiesWe can define a vulnerability as a weakness, in the security of a system, that can be exploited. The Common Vulnerabilities and Exposures (CVE) is a large set of publicly disclosed v

In [None]:
# Close the client connection.
# `client` is the QdrantClient created in an earlier cell; run this cell once
# no further searches are needed.
client.close()