In [4]:
# Paraphrase Identification using Scikit-Learn
# This script demonstrates a basic implementation of the paraphrase identification task.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import numpy as np

# --- Download NLTK data (only needs to be done once) ---
# The NLTK downloader will check if the packages are already present and up-to-date.
try:
    print("Checking for NLTK packages...")
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('tokenizers/punkt_tab') # Check for punkt_tab as well
    print("NLTK packages are already up-to-date.")
except LookupError:
    print("Downloading necessary NLTK packages...")
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('punkt_tab') # Download punkt_tab
    print("NLTK packages downloaded successfully.")


# --- 1. Data Loading and Preprocessing ---

def preprocess_text(text):
    """
    Preprocesses a single sentence:
    - Tokenizes
    - Converts to lowercase
    - Removes punctuation
    - Removes stopwords
    - Lemmatizes
    """
    # Ensure text is a string
    if not isinstance(text, str):
        text = str(text)

    # Tokenize
    tokens = word_tokenize(text.lower())

    # Remove punctuation
    tokens = [word for word in tokens if word.isalpha()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)

def load_and_preprocess_data(filepath):
    """Loads the MRPC dataset and applies preprocessing."""
    # The 'on_bad_lines' parameter is used to skip rows that have parsing errors.
    df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip', quoting=3) # quoting=3 ignores quotes

    # Ensure the correct columns are being selected after loading
    # The MRPC dataset format is: Quality #1 ID #2 ID #1 String #2 String
    df = df.iloc[:, [0, 3, 4]] # Select columns by index
    df.columns = ['label', 'sentence1', 'sentence2']

    print("Preprocessing sentence 1...")
    df['sentence1_processed'] = df['sentence1'].astype(str).apply(preprocess_text)
    print("Preprocessing sentence 2...")
    df['sentence2_processed'] = df['sentence2'].astype(str).apply(preprocess_text)

    return df

# A common place to find it is: https://www.microsoft.com/en-us/download/details.aspx?id=52398

try:
    # Try loading the real dataset
    data_path = '/content/msr_paraphrase_train.txt'
    df = load_and_preprocess_data(data_path)
    print("MRPC dataset loaded successfully.")
except FileNotFoundError:
    print("MRPC dataset not found. Creating a dummy dataset for demonstration.")
    dummy_data = {
        'label': [1, 0, 1, 0, 1],
        'sentence1': [
            "The cat sat on the mat.",
            "The dog played in the park.",
            "What is the weather like today?",
            "I love to eat pizza.",
            "The company is located in New York."
        ],
        'sentence2': [
            "On the mat, the cat sat.",
            "The sun is shining brightly.",
            "How is the weather today?",
            "I enjoy eating pasta.",
            "The firm's headquarters are in New York City."
        ]
    }
    df = pd.DataFrame(dummy_data)
    df['sentence1_processed'] = df['sentence1'].apply(preprocess_text)
    df['sentence2_processed'] = df['sentence2'].apply(preprocess_text)


# --- 2. Feature Engineering ---

# Initialize TF-IDF Vectorizer
# We will fit it on both sentence columns to build a comprehensive vocabulary.
corpus = pd.concat([df['sentence1_processed'], df['sentence2_processed']]).unique()
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

# Transform the sentences into TF-IDF vectors
X1 = vectorizer.transform(df['sentence1_processed'])
X2 = vectorizer.transform(df['sentence2_processed'])

# Combine the features for each sentence pair
# We'll simply concatenate the vectors
X = np.hstack((X1.toarray(), X2.toarray()))
y = df['label']


# --- 3. Model Training ---

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if len(y.unique()) > 1 else None)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Initialize and train the Logistic Regression model
model = LogisticRegression(solver='liblinear')
print("\nTraining the model...")
model.fit(X_train, y_train)
print("Model training complete.")


# --- 4. Evaluation ---

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Display the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# --- 5. Example Prediction ---

def predict_paraphrase(sentence1, sentence2):
    """
    Takes two sentences and predicts if they are a paraphrase.
    """
    # Preprocess the input sentences
    s1_processed = preprocess_text(sentence1)
    s2_processed = preprocess_text(sentence2)

    # Vectorize the processed sentences
    v1 = vectorizer.transform([s1_processed])
    v2 = vectorizer.transform([s2_processed])

    # Combine the vectors
    combined_vector = np.hstack((v1.toarray(), v2.toarray()))

    # Make a prediction
    prediction = model.predict(combined_vector)
    probability = model.predict_proba(combined_vector)

    if prediction[0] == 1:
        return f"Result: Paraphrase (Confidence: {probability[0][1]:.2f})"
    else:
        return f"Result: Not a Paraphrase (Confidence: {probability[0][0]:.2f})"

# Example Usage
print("\n--- Testing with new sentences ---")
test_sentence_1 = "The government has announced new tax policies."
test_sentence_2 = "New taxation policies were revealed by the government."
print(f"Sentence 1: '{test_sentence_1}'")
print(f"Sentence 2: '{test_sentence_2}'")
print(predict_paraphrase(test_sentence_1, test_sentence_2))

print("\n")

test_sentence_3 = "The birds are singing in the trees."
test_sentence_4 = "I need to buy groceries from the store."
print(f"Sentence 1: '{test_sentence_3}'")
print(f"Sentence 2: '{test_sentence_4}'")
print(predict_paraphrase(test_sentence_3, test_sentence_4))

Checking for NLTK packages...
Downloading necessary NLTK packages...
NLTK packages downloaded successfully.
Preprocessing sentence 1...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Preprocessing sentence 2...
MRPC dataset loaded successfully.

Training data shape: (3260, 21272)
Testing data shape: (816, 21272)

Training the model...
Model training complete.

Model Accuracy: 0.7022

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.19      0.29       265
           1       0.71      0.95      0.81       551

    accuracy                           0.70       816
   macro avg       0.67      0.57      0.55       816
weighted avg       0.69      0.70      0.64       816


Confusion Matrix:
[[ 50 215]
 [ 28 523]]

--- Testing with new sentences ---
Sentence 1: 'The government has announced new tax policies.'
Sentence 2: 'New taxation policies were revealed by the government.'
Result: Paraphrase (Confidence: 0.81)


Sentence 1: 'The birds are singing in the trees.'
Sentence 2: 'I need to buy groceries from the store.'
Result: Paraphrase (Confidence: 0.60)


In [7]:
# Paraphrase Identification using a Siamese LSTM Network with GloVe Embeddings

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import requests
import zipfile
import os

# --- TensorFlow and Keras Imports ---
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras import backend as K

# --- Download NLTK data ---
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading necessary NLTK packages...")
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    print("NLTK packages are ready.")

# --- 1. Download and Load GloVe Word Embeddings ---
def download_and_load_glove(glove_file="glove.6B.100d.txt", zip_file="glove.6B.zip", url="http://nlp.stanford.edu/data/glove.6B.zip"):
    """
    Downloads, unzips, and loads GloVe embeddings if they don't exist.
    """
    # Check if the GloVe text file already exists
    if not os.path.exists(glove_file):
        print(f"'{glove_file}' not found.")
        # Check if the zip file exists
        if not os.path.exists(zip_file):
            print(f"Downloading GloVe embeddings from {url}...")
            try:
                response = requests.get(url, stream=True)
                response.raise_for_status() # Raise an exception for bad status codes
                with open(zip_file, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print("Download complete.")
            except requests.exceptions.RequestException as e:
                print(f"Error downloading file: {e}")
                return None

        # Unzip the file
        print(f"Unzipping '{zip_file}'...")
        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall()
            print(f"Successfully unzipped. Extracted '{glove_file}'.")
        except zipfile.BadZipFile:
            print(f"Error: '{zip_file}' is not a valid zip file. Please delete it and try again.")
            return None

    # Load the embeddings from the text file
    print("Loading GloVe Embeddings...")
    embeddings_dict = {}
    try:
        with open(glove_file, 'r', encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
        print(f"Loaded {len(embeddings_dict)} word vectors.")
        return embeddings_dict
    except FileNotFoundError:
        print(f"Critical Error: GloVe file '{glove_file}' could not be found after download/unzip attempt.")
        return None

glove_embeddings = download_and_load_glove()

# --- 2. Data Loading and Preprocessing ---
def load_data(filepath):
    """Loads and cleans the MRPC dataset."""
    df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip', quoting=3)
    df = df.iloc[:, [0, 3, 4]]
    df.columns = ['label', 'sentence1', 'sentence2']
    # Drop rows with missing values
    df.dropna(inplace=True)
    return df

try:
    data_path = 'msr-paraphrase-train.txt'
    df = load_data(data_path)
    print("MRPC dataset loaded successfully.")
except FileNotFoundError:
    print("MRPC dataset not found. Using a dummy dataset for demonstration.")
    dummy_data = {
        'label': [1, 0, 1, 0, 1],
        'sentence1': ["The cat sat on the mat.", "The dog played in the park.", "What is the weather like today?", "I love to eat pizza.", "The company is located in New York."],
        'sentence2': ["On the mat, the cat sat.", "The sun is shining brightly.", "How is the weather today?", "I enjoy eating pasta.", "The firm's headquarters are in New York City."]
    }
    df = pd.DataFrame(dummy_data)

# --- 3. Data Preparation for Keras ---
if glove_embeddings:
    # Combine all sentences for tokenizer vocabulary
    all_sentences = pd.concat([df['sentence1'], df['sentence2']]).astype(str)

    # Initialize and fit tokenizer
    tokenizer = Tokenizer(num_words=10000, oov_token='<unk>')
    tokenizer.fit_on_texts(all_sentences)
    word_index = tokenizer.word_index
    print(f"Found {len(word_index)} unique tokens.")

    # Convert sentences to sequences of integers
    seq1 = tokenizer.texts_to_sequences(df['sentence1'].astype(str))
    seq2 = tokenizer.texts_to_sequences(df['sentence2'].astype(str))

    # Pad sequences to a max length
    MAX_SEQUENCE_LENGTH = 30
    data1 = pad_sequences(seq1, maxlen=MAX_SEQUENCE_LENGTH)
    data2 = pad_sequences(seq2, maxlen=MAX_SEQUENCE_LENGTH)
    labels = df['label'].values

    # --- 4. Create GloVe Embedding Matrix ---
    EMBEDDING_DIM = 100 # Must match the GloVe file dimension
    num_words = min(10000, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # --- 5. Build the Siamese LSTM Model ---

    # Define the shared layers
    embedding_layer = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False # We don't want to train the pre-trained GloVe embeddings
    )

    lstm_layer = LSTM(64) # 64 is the number of LSTM units

    # Define the model inputs
    input1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    input2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # Define the two towers of the Siamese network
    tower1 = embedding_layer(input1)
    tower1 = lstm_layer(tower1)

    tower2 = embedding_layer(input2)
    tower2 = lstm_layer(tower2)

    # Define the distance function (Manhattan distance)
    def manhattan_distance(vectors):
        vec1, vec2 = vectors
        return K.exp(-K.sum(K.abs(vec1 - vec2), axis=1, keepdims=True))

    distance = Lambda(manhattan_distance)([tower1, tower2])

    # The final model
    model = Model(inputs=[input1, input2], outputs=distance)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # --- 6. Train the Model ---

    # Split data
    indices = np.arange(data1.shape[0])
    np.random.shuffle(indices)
    data1 = data1[indices]
    data2 = data2[indices]
    labels = labels[indices]

    num_validation_samples = int(0.2 * data1.shape[0])

    x1_train = data1[:-num_validation_samples]
    x2_train = data2[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]

    x1_val = data1[-num_validation_samples:]
    x2_val = data2[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    print("\nTraining the Siamese LSTM model...")
    history = model.fit(
        [x1_train, x2_train], y_train,
        validation_data=([x1_val, x2_val], y_val),
        epochs=10,
        batch_size=64
    )
    print("Model training complete.")

    # --- 7. Evaluation ---

    print("\nEvaluating the model on the validation set...")
    # Predict probabilities
    y_pred_probs = model.predict([x1_val, x2_val])
    # Convert probabilities to binary predictions (0 or 1)
    y_pred = (y_pred_probs > 0.5).astype(int)

    accuracy = accuracy_score(y_val, y_pred)
    print(f"\nModel Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, y_pred))



'glove.6B.100d.txt' not found.
Downloading GloVe embeddings from http://nlp.stanford.edu/data/glove.6B.zip...
Download complete.
Unzipping 'glove.6B.zip'...
Successfully unzipped. Extracted 'glove.6B.100d.txt'.
Loading GloVe Embeddings...
Loaded 400000 word vectors.
MRPC dataset not found. Using a dummy dataset for demonstration.
Found 35 unique tokens.





Training the Siamese LSTM model...
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.2500 - loss: 2.6187 - val_accuracy: 1.0000 - val_loss: 0.0023
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.2500 - loss: 2.1874 - val_accuracy: 1.0000 - val_loss: 0.0024
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 0.2500 - loss: 1.8065 - val_accuracy: 1.0000 - val_loss: 0.0026
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.2500 - loss: 1.4967 - val_accuracy: 1.0000 - val_loss: 0.0027
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.2500 - loss: 1.2607 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.2500 - loss: 1.0753 - val_accuracy: 1.0000 - val_loss: 0.0029
Epoch 7/



In [11]:
# Paraphrase Identification using a Siamese LSTM Network with GloVe Embeddings
# This script builds, trains, and evaluates a deep learning model on the full MRPC dataset.

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import requests
import zipfile
import os

# --- TensorFlow and Keras Imports ---
# You will need to run: pip install tensorflow requests
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras import backend as K

# --- Download NLTK data ---
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading necessary NLTK packages...")
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    print("NLTK packages are ready.")

# --- 1. Download and Load GloVe Word Embeddings ---
def download_and_load_glove(glove_file="glove.6B.100d.txt", zip_file="glove.6B.zip", url="http://nlp.stanford.edu/data/glove.6B.zip"):
    """
    Downloads, unzips, and loads GloVe embeddings if they don't exist.
    """
    if not os.path.exists(glove_file):
        print(f"'{glove_file}' not found.")
        if not os.path.exists(zip_file):
            print(f"Downloading GloVe embeddings from {url}...")
            try:
                response = requests.get(url, stream=True)
                response.raise_for_status()
                with open(zip_file, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print("Download complete.")
            except requests.exceptions.RequestException as e:
                print(f"Error downloading file: {e}")
                return None

        print(f"Unzipping '{zip_file}'...")
        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall()
            print(f"Successfully unzipped. Extracted '{glove_file}'.")
        except zipfile.BadZipFile:
            print(f"Error: '{zip_file}' is not a valid zip file. Please delete it and try again.")
            return None

    print("Loading GloVe Embeddings...")
    embeddings_dict = {}
    try:
        with open(glove_file, 'r', encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
        print(f"Loaded {len(embeddings_dict)} word vectors.")
        return embeddings_dict
    except FileNotFoundError:
        print(f"Critical Error: GloVe file '{glove_file}' could not be found.")
        return None

glove_embeddings = download_and_load_glove()

# --- 2. Data Loading and Preprocessing ---
def load_data(filepath, file_type):
    """Loads and cleans the MRPC dataset."""
    try:
        df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip', quoting=3)
        # The test file has a different header, so we skip the first row
        if file_type == 'test':
            df = df.iloc[1:]
        df = df.iloc[:, [0, 3, 4]]
        df.columns = ['label', 'sentence1', 'sentence2']
        df['label'] = pd.to_numeric(df['label'], errors='coerce')
        df.dropna(inplace=True)
        df['label'] = df['label'].astype(int)
        print(f"Successfully loaded {filepath}")
        return df
    except FileNotFoundError:
        print(f"Error: {filepath} not found. Please ensure it's in the correct directory.")
        return None

# Load both training and testing datasets
train_df = load_data('/content/msr_paraphrase_test.txt', 'train')
test_df = load_data('/content/msr_paraphrase_test.txt', 'test')

# --- 3. Data Preparation for Keras ---
if glove_embeddings is not None and train_df is not None and test_df is not None:
    # Fit tokenizer ONLY on the training data to prevent data leakage
    train_sentences = pd.concat([train_df['sentence1'], train_df['sentence2']]).astype(str)

    tokenizer = Tokenizer(num_words=15000, oov_token='<unk>')
    tokenizer.fit_on_texts(train_sentences)
    word_index = tokenizer.word_index
    print(f"Found {len(word_index)} unique tokens in training data.")

    # Prepare function to transform text to padded sequences
    MAX_SEQUENCE_LENGTH = 35
    def prepare_sequences(df, tokenizer):
        seq1 = tokenizer.texts_to_sequences(df['sentence1'].astype(str))
        seq2 = tokenizer.texts_to_sequences(df['sentence2'].astype(str))
        data1 = pad_sequences(seq1, maxlen=MAX_SEQUENCE_LENGTH)
        data2 = pad_sequences(seq2, maxlen=MAX_SEQUENCE_LENGTH)
        labels = df['label'].values
        return data1, data2, labels

    # Prepare train, validation, and test sets
    x1_train, x2_train, y_train = prepare_sequences(train_df, tokenizer)
    x1_test, x2_test, y_test = prepare_sequences(test_df, tokenizer)

    # --- 4. Create GloVe Embedding Matrix ---
    EMBEDDING_DIM = 100
    num_words = min(15000, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # --- 5. Build the Siamese LSTM Model ---
    embedding_layer = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH, trainable=False)
    lstm_layer = LSTM(64)
    input1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    input2 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    tower1 = lstm_layer(embedding_layer(input1))
    tower2 = lstm_layer(embedding_layer(input2))

    def manhattan_distance(vectors):
        vec1, vec2 = vectors
        return K.exp(-K.sum(K.abs(vec1 - vec2), axis=1, keepdims=True))

    distance = Lambda(manhattan_distance)([tower1, tower2])
    model = Model(inputs=[input1, input2], outputs=distance)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # --- 6. Train the Model ---
    print("\nTraining the Siamese LSTM model...")
    history = model.fit(
        [x1_train, x2_train], y_train,
        validation_split=0.1, # Use 10% of training data for validation
        epochs=75, # Increased epochs for a larger dataset
        batch_size=64
    )
    print("Model training complete.")

    # --- 7. Final Evaluation on the Unseen Test Set ---
    print("\nEvaluating the model on the unseen test set...")
    y_pred_probs = model.predict([x1_test, x2_test])
    y_pred = (y_pred_probs > 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nFinal Test Accuracy: {accuracy:.4f}")
    print("\nFinal Test Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("\nFinal Test Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

else:
    print("\nSkipping model training due to missing data or embeddings.")



Loading GloVe Embeddings...
Loaded 400000 word vectors.
Successfully loaded /content/msr_paraphrase_test.txt
Successfully loaded /content/msr_paraphrase_test.txt
Found 8842 unique tokens in training data.





Training the Siamese LSTM model...
Epoch 1/75
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.4229 - loss: 2.3829 - val_accuracy: 0.4913 - val_loss: 1.4808
Epoch 2/75
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5243 - loss: 1.2510 - val_accuracy: 0.5145 - val_loss: 1.1600
Epoch 3/75
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.5398 - loss: 0.9734 - val_accuracy: 0.5260 - val_loss: 1.0087
Epoch 4/75
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5451 - loss: 0.8366 - val_accuracy: 0.5318 - val_loss: 0.9058
Epoch 5/75
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5717 - loss: 0.7224 - val_accuracy: 0.5491 - val_loss: 0.8296
Epoch 6/75
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6204 - loss: 0.6309 - val_accuracy: 0.5491 - val_loss: 0.774