In [51]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import re
import itertools
import nltk
# Tokenizer data needed by sent_tokenize. Older NLTK releases read the
# 'punkt' models; NLTK 3.8.2+ reads the 'punkt_tab' resource instead, so
# fetch both to keep the notebook runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\theja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Load dataset (can be downloaded from Kaggle) and keep only the two
# question columns plus the duplicate label.
df = pd.read_csv("questions.csv")
df = df[['question1', 'question2', 'is_duplicate']]

# Drop rows with missing values
df = df.dropna(subset=['question1', 'question2', 'is_duplicate'])

# Use only a small sample for testing (optional).
# The sample size is capped at the number of remaining rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement).
sample_size = min(1000, len(df))
df = df.sample(sample_size, random_state=42).reset_index(drop=True)

df.head()


Unnamed: 0,question1,question2,is_duplicate
0,Do people realize that you can send marijuana ...,How do you send weed through the mail?,0
1,How can rock music be brought back?,What would it take for rock music to make a co...,1
2,Why does one feel relaxed after smoking a join...,How do I sober up quickly after smoking weed/m...,0
3,How to gain weight ?,How do I gain weight fast but still be healthy?,1
4,Is porn bad for men?,Can I become a porn fan without getting addicted?,0


In [13]:
# Sanity check: show which columns remain in the working frame.
print(df.columns)



Index(['question1', 'question2', 'is_duplicate'], dtype='object')


In [15]:
# Pretrained SBERT checkpoint used to embed every question
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed both sides of the question pairs, question1 first and then
# question2, keeping the results as tensors.
embeddings1, embeddings2 = (
    model.encode(df[column].tolist(), convert_to_tensor=True)
    for column in ('question1', 'question2')
)


In [45]:
# Calculate cosine similarity between every question1 embedding (rows)
# and every question2 embedding (columns).
cosine_scores = cosine_similarity(embeddings1.cpu().numpy(), embeddings2.cpu().numpy())

# Entry [i, i] compares question1[i] with question2[i], so the diagonal
# holds one score per labelled pair. `similarity_scores` has to be defined
# here: relying on a value left over in the kernel raises NameError after
# a restart.
similarity_scores = np.diag(cosine_scores)
similarity_diag = similarity_scores  # same per-pair scores under the cell's other name


# Predict: If similarity > threshold → duplicate
threshold = 0.8
predictions = (similarity_scores > threshold).astype(int)

# Actual labels
labels = df['is_duplicate'].values


In [47]:
# Report each evaluation metric in turn: caption first, then the value
# computed from the true labels and the thresholded predictions.
for caption, metric in (
    ("Accuracy:", accuracy_score),
    ("F1 Score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric(labels, predictions))


Accuracy: 0.79
F1 Score: 0.725130890052356
Confusion Matrix:
 [[513 121]
 [ 89 277]]


In [53]:
# Read a paragraph from the user and report sentence pairs whose embedding
# cosine similarity exceeds `threshold` (set in the evaluation cell).
paragraph = input()

sentences = sent_tokenize(paragraph)

# Only embed when there is at least one sentence: with empty input there is
# nothing to encode and no valid 2-D matrix to hand to cosine_similarity.
# In that case the pair loop below never runs, so sim_matrix is never read.
if sentences:
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    sim_matrix = cosine_similarity(sentence_embeddings.cpu().numpy())

print("\n\U00002705 Duplicate Sentence Pairs in Paragraph:\n")
# combinations() yields each unordered pair (i < j) exactly once, so no
# extra bookkeeping is needed to avoid reporting a pair twice.
for i, j in itertools.combinations(range(len(sentences)), 2):
    if sim_matrix[i][j] > threshold:
        print(f"- Sentence 1: {sentences[i]}")
        print(f"  Sentence 2: {sentences[j]}")
        print(f"  Similarity Score: {sim_matrix[i][j]:.2f}\n")

 I love going to the beach during summer. The ocean breeze is very refreshing.  I enjoy spending my holidays at the seaside. The fresh air from the ocean feels amazing.  I like watching movies on weekends. Weekends are great for enjoying films.



✅ Duplicate Sentence Pairs in Paragraph:

- Sentence 1: The ocean breeze is very refreshing.
  Sentence 2: The fresh air from the ocean feels amazing.
  Similarity Score: 0.82

- Sentence 1: I like watching movies on weekends.
  Sentence 2: Weekends are great for enjoying films.
  Similarity Score: 0.85

