In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
import time

In [6]:
# Initialize BERT models
model1 = SentenceTransformer('paraphrase-MiniLM-L3-v2')
model2 = SentenceTransformer('all-distilroberta-v1')
model3 = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [9]:
def model(user_ques_1,user_ques_2):
    # Stop word removal
    nltk.download('stopwords')
    stop = stopwords.words('english')
    # Handle missing values by replacing NaN with an empty string
    ques_1_nstop = ' '.join([word for word in user_ques_1.split() if word not in stop])
    ques_2_nstop = ' '.join([word for word in user_ques_2.split() if word not in stop])

    # Tokenize the questions
    nltk.download('punkt')
    tok_ques_1 = nltk.word_tokenize(ques_1_nstop)
    tok_ques_2 = nltk.word_tokenize(ques_2_nstop)
    # Convert obtained lists result from tokenizer to string data points
    tok_ques_1_str = ' '.join(map(str, tok_ques_1))
    tok_ques_2_str = ' '.join(map(str, tok_ques_2))
    # Lemmatize data
    nltk.download('wordnet')
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    def lemmatize_text(text):
        return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

    ques_1_lemm_str = lemmatize_text(tok_ques_1_str)
    ques_2_lemm_str = lemmatize_text(tok_ques_2_str)
    # Convert lemmatized data to lower case
    ques_1_lemm_str = ques_1_lemm_str.lower()
    ques_2_lemm_str = ques_2_lemm_str.lower()
    # Text similarity scores obtained using 'paraphrase-MiniLM-L3-v2' BERT model
    st = time.time()
    embd1 = model1.encode(ques_1_lemm_str, convert_to_tensor=True)
    embd2 = model1.encode(ques_2_lemm_str, convert_to_tensor=True)
    cosine_scores1 = util.pytorch_cos_sim(embd1, embd2)
    et = time.time()
    elapsed_time = et - st
#    print("Similarity Score using BERT model 'paraphrase-MiniLM-L3-v2':", cosine_scores1.item())
#   print('Execution time:', elapsed_time, 'seconds')
    # Text similarity scores obtained using BERT model 'all-distilroberta-v1'
    st = time.time()
    embd1 = model2.encode(ques_1_lemm_str, convert_to_tensor=True)
    embd2 = model2.encode(ques_2_lemm_str, convert_to_tensor=True)
    cosine_scores2 = util.pytorch_cos_sim(embd1, embd2)
    et = time.time()
    elapsed_time = et - st
#     print("Similarity Score using BERT model 'all-distilroberta-v1':", cosine_scores2.item())
#     print('Execution time:', elapsed_time, 'seconds')
    # Text similarity scores obtained using BERT model 'multi-qa-distilbert-cos-v1'
    st = time.time()
    embd1 = model3.encode(ques_1_lemm_str, convert_to_tensor=True)
    embd2 = model3.encode(ques_2_lemm_str, convert_to_tensor=True)
    cosine_scores3 = util.pytorch_cos_sim(embd1, embd2)
    et = time.time()
    elapsed_time = et - st
#     print("Similarity Score using BERT model 'multi-qa-distilbert-cos-v1':", cosine_scores3.item())
#     print('Execution time:', elapsed_time, 'seconds')
    # Calculate average similarity score
    avg_similarity_score = (cosine_scores1.item() + cosine_scores2.item() + cosine_scores3.item())/3
    return avg_similarity_score


In [10]:
# Step 1: Get user input for two questions
user_ques_1 = input("Enter Question 1: ")
user_ques_2 = input("Enter Question 2: ")
print(model(user_ques_1,user_ques_2))

Enter Question 1: test
Enter Question 2: test
1.0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
# Stop word removal
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Handle missing values by replacing NaN with an empty string
ques_1_nstop = ' '.join([word for word in user_ques_1.split() if word not in stop])
ques_2_nstop = ' '.join([word for word in user_ques_2.split() if word not in stop])

# Tokenize the questions
nltk.download('punkt')
tok_ques_1 = nltk.word_tokenize(ques_1_nstop)
tok_ques_2 = nltk.word_tokenize(ques_2_nstop)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# Convert obtained lists result from tokenizer to string data points
tok_ques_1_str = ' '.join(map(str, tok_ques_1))
tok_ques_2_str = ' '.join(map(str, tok_ques_2))



In [22]:
# Lemmatize data
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

ques_1_lemm_str = lemmatize_text(tok_ques_1_str)
ques_2_lemm_str = lemmatize_text(tok_ques_2_str)



In [24]:
# Convert lemmatized data to lower case
ques_1_lemm_str = ques_1_lemm_str.lower()
ques_2_lemm_str = ques_2_lemm_str.lower()



In [25]:
# Text similarity scores obtained using 'paraphrase-MiniLM-L3-v2' BERT model
st = time.time()
embd1 = model1.encode(ques_1_lemm_str, convert_to_tensor=True)
embd2 = model1.encode(ques_2_lemm_str, convert_to_tensor=True)
cosine_scores1 = util.pytorch_cos_sim(embd1, embd2)
et = time.time()
elapsed_time = et - st
print("Similarity Score using BERT model 'paraphrase-MiniLM-L3-v2':", cosine_scores1.item())
print('Execution time:', elapsed_time, 'seconds')



Similarity Score using BERT model 'paraphrase-MiniLM-L3-v2': 0.935845673084259
Execution time: 0.07124948501586914 seconds


In [26]:
# Text similarity scores obtained using BERT model 'all-distilroberta-v1'
st = time.time()
embd1 = model2.encode(ques_1_lemm_str, convert_to_tensor=True)
embd2 = model2.encode(ques_2_lemm_str, convert_to_tensor=True)
cosine_scores2 = util.pytorch_cos_sim(embd1, embd2)
et = time.time()
elapsed_time = et - st
print("Similarity Score using BERT model 'all-distilroberta-v1':", cosine_scores2.item())
print('Execution time:', elapsed_time, 'seconds')



Similarity Score using BERT model 'all-distilroberta-v1': 0.8731243014335632
Execution time: 0.08671975135803223 seconds


In [31]:
# Text similarity scores obtained using BERT model 'multi-qa-distilbert-cos-v1'
st = time.time()
embd1 = model3.encode(ques_1_lemm_str, convert_to_tensor=True)
embd2 = model3.encode(ques_2_lemm_str, convert_to_tensor=True)
cosine_scores3 = util.pytorch_cos_sim(embd1, embd2)
et = time.time()
elapsed_time = et - st
print("Similarity Score using BERT model 'multi-qa-distilbert-cos-v1':", cosine_scores3.item())
print('Execution time:', elapsed_time, 'seconds')

Similarity Score using BERT model 'multi-qa-distilbert-cos-v1': 0.8531126976013184
Execution time: 0.0839846134185791 seconds


In [33]:
# Calculate average similarity score
avg_similarity_score = (cosine_scores1.item() + cosine_scores2.item() + cosine_scores3.item())/3
print("Average Similarity Score:", avg_similarity_score)

Average Similarity Score: 0.8873608907063802


In [12]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.0.6-py3-none-any.whl (1.9 MB)
     ---------------------------------------- 1.9/1.9 MB 10.3 MB/s eta 0:00:00
Installing collected packages: reportlab
Successfully installed reportlab-4.0.6


In [13]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Create a PDF document
pdf_file = "output.pdf"
c = canvas.Canvas(pdf_file, pagesize=letter)

# Set font and other attributes
c.setFont("Helvetica", 12)
line_height = 14  # Adjust line height as needed

# Iterate through the list and add the content to the PDF
y_position = letter[1] - 72  # Start position (72 points from the top)
for content in content_list:
    c.drawString(72, y_position, content)
    y_position -= line_height

# Save the PDF document
c.save()
