In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('cleaned_db_text.csv')

In [5]:
def preprocess_sample(text):
    text = text.replace("\n", "")
    text = text.replace("\x91", "'")
    text = text.replace("\x92", "'")
    text = text.replace("\x93", "\"")
    text = text.replace("\x94", "\"")
    text = text.replace("\x96", "-")
    text = text.replace("\x97", "-")
    return text

# Preprocess the input question
input_question = "What is a software vulnerability?"
input_question = preprocess_sample(input_question)

# Preprocess all text in the DataFrame
df['text'] = df['text'].apply(preprocess_sample)

# Create tuples of (index, text) for each row in the DataFrame
text_tuples = list(zip(df.index, df['text']))

# Tokenize and vectorize the text data
vectorizer = CountVectorizer()
text_matrix = vectorizer.fit_transform(df['text'])

# Tokenize and vectorize the input question
input_question_vector = vectorizer.transform([input_question])

# Calculate cosine similarity
similarity_scores = cosine_similarity(input_question_vector, text_matrix)

# Find the most similar index based on cosine similarity using tuples
most_similar_tuple_index = similarity_scores.argmax()
most_similar_text_tuple = text_tuples[most_similar_tuple_index]
most_similar_score = similarity_scores[0, most_similar_tuple_index]

print(f"Input Question: {input_question}")
print(f"Most Similar Text Index: {most_similar_text_tuple[0]}")
print(f"Most Similar Text: {most_similar_text_tuple[1]}")
print(f"Similarity Score: {most_similar_score:.4f}")

Input Question: What is a software vulnerability?
Most Similar Text Index: 48
Most Similar Text: Bug, Fault, Error, or Weakness: Demystifying Software Security Vulnerabilities -DEFINITIONSWe can observe that a security vulnerability leads to a security failure. However, what are the buildingblocks of a vulnerability? What is the defect in software that triggers a vulnerability? How does it propagate through errors until a final, exploitable error is reached? How do the underlying weaknesses of a vulnerability relate to these propagating errors?
Similarity Score: 0.4975
