In [13]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.metrics import confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import LoggingHandler
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import logging
import torch

# Set up logging BEFORE loading the model so that the INFO messages emitted
# while the model loads (device selection, checkpoint name) are shown when the
# notebook is run top-to-bottom on a fresh kernel.
# Note: logging.basicConfig is a no-op if the root logger already has handlers.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, handlers=[LoggingHandler()])

# Load the pretrained multilingual paraphrase sentence-transformer model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


2025-02-22 14:18:55,422 - Use pytorch device_name: mps
2025-02-22 14:18:55,422 - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


In [8]:
# Download the latest version of the MSR Paraphrase Corpus from Kaggle
path = kagglehub.dataset_download("doctri/microsoft-research-paraphrase-corpus")

train_dataset_path = path + "/msr_paraphrase_train.txt"
test_dataset_path = path + "/msr_paraphrase_test.txt"

# Load the training split.
# The corpus is plain tab-separated text, not quoted CSV: a double quote inside
# a sentence is literal text. With pandas' default quote handling, an unbalanced
# quote swallows following tabs/newlines, and on_bad_lines='skip' then silently
# drops the damaged rows. quoting=3 (csv.QUOTE_NONE) disables quote handling.
train_df = pd.read_csv(train_dataset_path, sep='\t', header=0, on_bad_lines='skip', quoting=3)

# Load the evaluation set from a local synthetic CSV.
# NOTE(review): test_dataset_path above is never used; evaluation runs on
# "complete_synthetic.csv" instead of the MSR test split -- confirm this is intended.
test_df = pd.read_csv("complete_synthetic.csv", sep=',', header=0, on_bad_lines='skip', encoding='latin1')

In [9]:
def _prepare_pairs(df):
    """Return a cleaned copy of ``df`` holding only the label and sentence-pair columns.

    Steps, in order:
      1. keep the "Quality", "#1 String" and "#2 String" columns;
      2. drop every row that has a missing value in any of them;
      3. cast "Quality" to int (1 = paraphrase, 0 = not paraphrase).

    The input frame is not modified. Building the result through a method chain
    (instead of ``dropna(inplace=True)`` plus column assignment on a column
    subset) avoids pandas' SettingWithCopyWarning and keeps the cell safe to
    re-run.
    """
    return (
        df[["Quality", "#1 String", "#2 String"]]
        .dropna()
        .astype({"Quality": int})
    )


# Apply the same cleaning to both splits
train_df = _prepare_pairs(train_df)
test_df = _prepare_pairs(test_df)

In [8]:
# Create InputExamples for training.
# CosineSimilarityLoss regresses the cosine similarity of the two sentence
# embeddings onto the label, so the label must be a float target
# (1.0 = paraphrase, 0.0 = not paraphrase) rather than an integer.
# Zipping the columns avoids the per-row Series construction of iterrows().
train_examples = [
    InputExample(texts=[sentence_a, sentence_b], label=float(quality))
    for sentence_a, sentence_b, quality in zip(
        train_df["#1 String"], train_df["#2 String"], train_df["Quality"]
    )
]

# Create a DataLoader that serves the examples in shuffled batches of 16
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Define the loss function
train_loss = losses.CosineSimilarityLoss(model)


In [9]:
# Run fine-tuning on the paraphrase pairs: a single (dataloader, loss)
# objective, three epochs, 100 warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    warmup_steps=100,
    epochs=3,
)

                                                                     

Step,Training Loss


In [10]:
# Wrap every pair of the test set in an InputExample
test_examples = [
    InputExample(texts=[first, second], label=quality)
    for first, second, quality in zip(
        test_df["#1 String"], test_df["#2 String"], test_df["Quality"]
    )
]

# Split the examples back into the two sentence lists and the gold labels
sentences1 = [pair.texts[0] for pair in test_examples]
sentences2 = [pair.texts[1] for pair in test_examples]
labels = [pair.label for pair in test_examples]

# Embed the first sentences, then the second sentences, as tensors
embeddings1, embeddings2 = (
    model.encode(sentence_list, convert_to_tensor=True)
    for sentence_list in (sentences1, sentences2)
)

# Row-wise cosine similarity between the paired embeddings, moved to the CPU
# and converted to a numpy array
similarity_tensor = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)
cosine_similarities = similarity_tensor.cpu().numpy()

# A pair is predicted as a paraphrase (1) when its similarity reaches the
# 0.65 threshold (adjust as needed), otherwise 0
predictions = np.greater_equal(cosine_similarities, 0.65).astype(int)

# Arrays used by the metric cells
y_true = np.asarray(labels)
y_pred = predictions  # already a numpy array

Batches: 100%|██████████| 5/5 [00:00<00:00, 13.41it/s]
Batches: 100%|██████████| 5/5 [00:00<00:00, 21.32it/s]


In [12]:
# Confusion matrix of gold labels vs. thresholded predictions
# (rows = true class, columns = predicted class)
cm = confusion_matrix(y_true, y_pred)

# Draw on an explicit figure/axes pair. pyplot's matshow() opens a new figure
# by default, so calling plt.figure() before it would leave an empty figure
# behind in the cell output.
fig, ax = plt.subplots()
im = ax.matshow(cm, cmap='Blues')
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")

# Write the count into each cell of the matrix
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=str(cm[i, j]), ha='center', va='center', color='black')

# Attach the colour scale to the image that was just drawn
fig.colorbar(im, ax=ax)
plt.show()

NameError: name 'confusion_matrix' is not defined

In [None]:
# Build the text classification report for the thresholded predictions
# and print it
report = classification_report(y_true=y_true, y_pred=y_pred)
print(report)