In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from scipy.spatial.distance import cosine

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is the tokenizer data that word_tokenize loads in recent NLTK
# releases (3.9+); 'punkt' is kept so older NLTK installs keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)






[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Sentence-BERT checkpoint used to embed both sentences of every pair
SBERT_MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(SBERT_MODEL_NAME)



In [3]:
# Shared NLP helpers for preprocess_text: the English stop-word list held as a
# set for fast membership checks, and a single reusable WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text Preprocessing Function
def preprocess_text(text):
    """Normalize a sentence before feature extraction.

    Lowercases the text, removes every character that is not an ASCII letter
    or whitespace, tokenizes it, drops English stop words, lemmatizes the
    remaining tokens and joins them back with single spaces.

    Args:
        text: Raw sentence. Any non-string value (for example ``None`` or a
            float ``NaN`` coming from a missing cell) is treated as empty text.

    Returns:
        The cleaned, space-separated sentence; ``''`` when ``text`` is not a
        string.
    """
    # A missing value has no .lower(); return an empty sentence instead of
    # raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove digits, punctuation, non-ASCII
    words = word_tokenize(text)  # Tokenize
    # Stop words are filtered before lemmatizing, so only kept tokens are lemmatized.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)



In [4]:
# Read the labeled train/test splits (adjust the filenames if needed)
train, test = (pd.read_csv(csv_path) for csv_path in ('labeled_final_train.csv', 'labeled_final_test.csv'))

# Clean both sentence columns of each split, overwriting the raw text
for frame in (train, test):
    for column in ('sentence1', 'sentence2'):
        frame[column] = frame[column].apply(preprocess_text)

In [None]:


# Encode text into SBERT embeddings.
# Each sentence column is passed to model.encode as one list so the model can
# batch the work, instead of being invoked once per row through .apply.
for frame in (train, test):
    for column in ('sentence1', 'sentence2'):
        vectors = model.encode(
            frame[column].tolist(),
            convert_to_numpy=True,
            show_progress_bar=True,
        )
        # Keep one 1-D vector per row (aligned on the frame's index) so the
        # '<column>_embedding' columns can still be stacked with np.vstack.
        frame[f'{column}_embedding'] = pd.Series(list(vectors), index=frame.index)


In [None]:

# Stack each column of per-row embedding vectors into a single NumPy matrix
embedding_columns = ('sentence1_embedding', 'sentence2_embedding')
train_embeddings1, train_embeddings2 = (np.vstack(train[col].values) for col in embedding_columns)
test_embeddings1, test_embeddings2 = (np.vstack(test[col].values) for col in embedding_columns)

In [None]:
# Compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``1 - cosine_distance(vec1, vec2)``. When either vector is all zeros
        the angle is undefined, so ``0.0`` is returned rather than NaN.
    """
    # scipy's cosine distance divides by the product of the norms, which is
    # zero for an all-zero vector and would put NaN into the feature column.
    if not np.any(vec1) or not np.any(vec2):
        return 0.0
    return 1 - cosine(vec1, vec2)

# Compute extra features
def sentence_length_diff(sent1, sent2):
    """Return the absolute difference between the lengths (``len``) of the two inputs."""
    longer, shorter = len(sent1), len(sent2)
    if longer < shorter:
        longer, shorter = shorter, longer
    return longer - shorter

def jaccard_similarity(sent1, sent2):
    """Return the Jaccard index of the whitespace-separated token sets of two strings.

    The result is ``|A & B| / |A | B|``; when neither string contains any
    token the union is empty and ``0`` is returned.
    """
    tokens_a = set(sent1.split())
    tokens_b = set(sent2.split())
    union = tokens_a.union(tokens_b)
    if not union:
        return 0
    shared = tokens_a.intersection(tokens_b)
    return len(shared) / len(union)


In [None]:

# Derive the three pairwise features (embedding cosine similarity, length
# difference, token Jaccard overlap) for the train split, then the test split
for frame, emb_a, emb_b in (
    (train, train_embeddings1, train_embeddings2),
    (test, test_embeddings1, test_embeddings2),
):
    frame['similarity'] = [cosine_similarity(u, v) for u, v in zip(emb_a, emb_b)]
    frame['length_diff'] = frame.apply(
        lambda row: sentence_length_diff(row['sentence1'], row['sentence2']), axis=1
    )
    frame['jaccard'] = frame.apply(
        lambda row: jaccard_similarity(row['sentence1'], row['sentence2']), axis=1
    )


In [None]:
# Plot the distribution of cosine similarities over the training pairs
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(train['similarity'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Cosine Similarity Distribution')
ax.set_xlabel('Cosine Similarity')
# histplot's default stat is 'count', so the bars are row counts, not densities.
ax.set_ylabel('Count')
plt.show()

0

In [None]:
# Assemble the feature matrices and label vectors for both splits
feature_columns = ['similarity', 'length_diff', 'jaccard']
X_train, X_test = (split[feature_columns].to_numpy() for split in (train, test))
y_train, y_test = (split['label'].to_numpy() for split in (train, test))

# Fit a 200-tree random forest with a fixed random_state
classifier = RandomForestClassifier(n_estimators=200, random_state=42)
classifier.fit(X_train, y_train)

# Horizontal bar chart of the fitted forest's per-feature importances;
# the display names follow the column order used to build X_train
feature_names = ['Cosine Similarity', 'Length Difference', 'Jaccard Similarity']
feature_importances = classifier.feature_importances_
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=feature_importances, y=feature_names, palette="Blues_r", ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance Score")
plt.show()




In [None]:

# Score the held-out split with the fitted classifier
y_pred = classifier.predict(X_test)

# Headline metrics first, then the per-class breakdown
accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", report)



In [None]:
# Confusion matrix heatmap (y-axis: true label, x-axis: predicted label)
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['Not Paraphrase', 'Paraphrase']
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()