In [1]:
!pip install sentence-transformers scikit-learn datasets jupyter



In [2]:
!pip install streamlit



In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import numpy as np

In [5]:
!pip install hf_xet

Collecting hf_xet
  Downloading hf_xet-1.1.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (494 bytes)
Downloading hf_xet-1.1.0-cp37-abi3-macosx_11_0_arm64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m520.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: hf_xet
Successfully installed hf_xet-1.1.0


In [6]:
# Load the "mrpc" configuration of the "glue" dataset and keep handles to the
# train and test splits used by the following cells.
dataset = load_dataset("glue", "mrpc")
train_data, test_data = dataset['train'], dataset['test']

In [7]:
def get_data(data):
    """Return the 'sentence1', 'sentence2' and 'label' columns of ``data``.

    ``data`` is anything indexable by those three keys. The columns are
    returned as a 3-tuple in that order, exactly as the lookup yields them.
    """
    return tuple(data[column] for column in ('sentence1', 'sentence2', 'label'))

# Unpack each split into three parallel columns: first sentences, second sentences, labels.
(train_s1, train_s2, train_labels), (test_s1, test_s2, test_labels) = (
    get_data(train_data),
    get_data(test_data),
)


In [8]:
# Baseline experiment: logistic regression on the concatenation of both sentence embeddings.
# NOTE: `model` and `clf` are rebound by later cells, so this baseline is not the
# classifier that is evaluated or saved further down.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed sentence pairs. Each column is encoded in one batched call instead of one
# encode() call per sentence, then the two embedding matrices are joined side by
# side so that row i holds [embedding(sentence1_i), embedding(sentence2_i)].
first_embeddings = model.encode(list(train_data['sentence1']))
second_embeddings = model.encode(list(train_data['sentence2']))
X_train = np.concatenate((first_embeddings, second_embeddings), axis=1)

y_train = list(train_data['label'])

# Train classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)


In [9]:
# Load the 'paraphrase-MiniLM-L6-v2' checkpoint. This rebinds `model`, so cells run
# after this one no longer use the 'all-MiniLM-L6-v2' encoder loaded earlier.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Encode all four sentence columns with the current `model`, in the order
# train/first, train/second, test/first, test/second. convert_to_tensor=False
# requests non-tensor output from encode().
train_embeddings_1, train_embeddings_2, test_embeddings_1, test_embeddings_2 = (
    model.encode(column, convert_to_tensor=False)
    for column in (train_s1, train_s2, test_s1, test_s2)
)

In [11]:
def _rowwise_cosine(first, second):
    """Return the cosine similarity of each row of ``first`` with the same row of ``second``.

    Both arguments are converted with ``np.asarray`` and are expected to be
    2-D floating-point arrays of identical shape (one embedding per row).
    A pair in which either vector has zero norm gets similarity 0 rather
    than NaN.
    """
    first = np.asarray(first)
    second = np.asarray(second)
    dots = np.sum(first * second, axis=1)
    norms = np.linalg.norm(first, axis=1) * np.linalg.norm(second, axis=1)
    # Divide only where the norm product is non-zero; the remaining entries keep
    # the 0 that `out` was initialised with.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)


# One similarity score per sentence pair, computed for the whole split in a single
# vectorised pass instead of one cosine_similarity() call per pair. The results are
# kept as lists of scalars, as before; values may differ from the per-pair sklearn
# call only by floating-point rounding.
train_features = list(_rowwise_cosine(train_embeddings_1, train_embeddings_2))
test_features = list(_rowwise_cosine(test_embeddings_1, test_embeddings_2))

In [12]:
# Fit the similarity classifier: a logistic regression whose only input feature is
# the cosine similarity of a sentence pair. The features are shaped into a single
# column because fit() is given one row per sample. This rebinds `clf`.
clf = LogisticRegression()
clf.fit(np.reshape(train_features, (-1, 1)), train_labels)

In [13]:
# Predict a label for every test pair from its similarity score.
predictions = clf.predict(np.array(test_features).reshape(-1, 1))
# Convert the reference labels to an array explicitly so the comparison is
# element-wise whatever container `test_labels` happens to be, instead of relying
# on NumPy's implicit coercion of the right-hand operand.
accuracy = np.mean(predictions == np.asarray(test_labels))
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 73.80%


In [14]:
def is_paraphrase(sen1, sen2):
    """Classify a sentence pair with the notebook-level ``model`` and ``clf``.

    Each sentence is encoded separately with ``model``, the cosine similarity
    of the two embeddings is computed, and that single value is passed to
    ``clf.predict``.

    Returns:
        A ``(prediction, similarity)`` tuple: the label predicted by ``clf``
        and the cosine similarity it was predicted from.
    """
    first_vec, second_vec = (model.encode([text])[0] for text in (sen1, sen2))
    similarity = cosine_similarity([first_vec], [second_vec])[0][0]
    label = clf.predict([[similarity]])[0]
    return label, similarity

In [16]:
# Try the classifier on one hand-written pair and report the verdict with its score.
s1, s2 = "He is driving a car.", "He is operating a vehicle."
result, score = is_paraphrase(s1, s2)
if result:
    print("Prediction:", "Paraphrase")
else:
    print("Prediction:", "Not Paraphrase")
print("Similarity Score:", score)

Prediction: Not Paraphrase
Similarity Score: 0.6203252


In [19]:
import joblib

# Save the classifier
# Only `clf` is written to disk; the sentence encoder is not part of this file, so
# code that loads it presumably has to build its similarity feature with the same
# encoder — verify against the consumer.
# NOTE(review): a later cell dumps a different classifier to this same filename.
joblib.dump(clf, "paraphrase_classifier.pkl")


['paraphrase_classifier.pkl']

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import joblib

# Hand-written sample pairs: (first sentence, second sentence, label).
sentences = [
    ("How are you?", "How do you do?", 1),
    ("What is your name?", "Who are you?", 1),
    ("What time is it?", "Where do you live?", 0),
    ("He is running", "He is jogging", 1),
    ("I love apples", "I dislike oranges", 0)
]

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")


def _pair_feature(first, second):
    """Return the one-element feature row [cosine similarity] for a sentence pair."""
    vec_a = model.encode([first])[0]
    vec_b = model.encode([second])[0]
    return [cosine_similarity([vec_a], [vec_b])[0][0]]


# Feature matrix (one similarity value per pair) and the matching label vector.
X = [_pair_feature(first, second) for first, second, _label in sentences]
y = [label for _first, _second, label in sentences]

# Fit the one-feature logistic regression on the sample pairs.
clf = LogisticRegression()
clf.fit(X, y)

# Persist the fitted classifier.
# NOTE(review): an earlier cell writes a classifier trained on the MRPC split to this
# same filename; running this cell replaces that file with the model fitted on the
# five sample pairs above — confirm this is intended.
joblib.dump(clf, "paraphrase_classifier.pkl")
print("✅ Model saved as paraphrase_classifier.pkl")


✅ Model saved as paraphrase_classifier.pkl
