In [2]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from datasets import load_dataset

# Define the input texts for the task.
# Each entry is a (first_text, second_text) tuple; the functions below read
# element 0 and element 1 of every pair and tokenize them together.
texts = [
    ("We process your personal data strictly in accordance with the applicable laws and regulations. Your data will only be used for the following lawful purposes: With Your Consent: We will process your personal data when you have provided your explicit consent for specific purposes. Your consent will be obtained in a clear and transparent manner, and you will have the option to withdraw it at any time.", 
     "We will ensure that you are fully informed about the specific personal data we intend to collect and the purpose for which it will be used. We will inform you of your rights, including how you can exercise these rights under the applicable sections of the law. This includes your right to access, correct, or delete your data, as well as any other rights you may have under the law."),
    
]

# Load the tokenizer for the 'bert-base-uncased' checkpoint.
# It is kept at module level and shared by the functions defined below.
# NOTE: no dataset is loaded or tokenized by this statement.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_function(examples):
    """Tokenize the 'sentence1'/'sentence2' columns of a batch as sequence pairs.

    Args:
        examples: Mapping that provides 'sentence1' and 'sentence2' entries.

    Returns:
        The result of calling the module-level tokenizer on the two columns,
        truncated and padded to a fixed length of 128.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(first_sentences, second_sentences, **encode_options)

# Create a custom dataset from text pairs
def create_custom_dataset(text_pairs, labels=None, max_length=128):
    """Build a tf.data.Dataset of tokenized text pairs and their target values.

    Args:
        text_pairs: Iterable of pairs; element 0 and element 1 of each pair
            are tokenized together as one sequence-pair input.
        labels: Optional iterable with one target per pair. When omitted,
            every pair gets the placeholder target 1.0 (the original
            behavior); pass real targets for actual training.
        max_length: Fixed length every example is truncated/padded to.

    Returns:
        A tf.data.Dataset whose elements are (features_dict, label) tuples.

    Raises:
        ValueError: If `labels` is given and its length differs from the
            number of text pairs.
    """
    # Materialize once so one-shot iterables (e.g. generators) can be read
    # for both the first and the second element of each pair.
    pairs = list(text_pairs)

    if labels is None:
        targets = [1.0] * len(pairs)  # Placeholder targets; not real supervision
    else:
        targets = list(labels)
        # Validate before tokenizing so a mismatch fails fast with a clear message.
        if len(targets) != len(pairs):
            raise ValueError(
                f"Expected {len(pairs)} labels (one per text pair), got {len(targets)}"
            )

    inputs = tokenizer(
        [pair[0] for pair in pairs],
        [pair[1] for pair in pairs],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )
    # dict(...) turns the tokenizer output into a plain mapping of tensors.
    return tf.data.Dataset.from_tensor_slices((dict(inputs), targets))

# Prepare the custom dataset
# NOTE(review): custom_dataset is built here, but nothing in the visible code
# consumes it (there is no training call), so the model below is never
# fine-tuned on it.
custom_dataset = create_custom_dataset(texts)

# Load the pre-trained BERT model
# NOTE(review): the captured output of this cell reports that
# 'classifier.weight' and 'classifier.bias' are newly initialized and that the
# model should be trained on a down-stream task before being used for
# predictions. Scores produced without such training should not be
# interpreted as a meaningful similarity measure.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # Use num_labels=1 for similarity scoring

# Compile the model
# Mean squared error is used both as the loss and as the reported metric,
# i.e. the single output is treated as a regression target.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mean_squared_error'])

# Function to make predictions
def predict_similarity(text_pairs):
    """Score each text pair with the module-level model.

    Args:
        text_pairs: Iterable of pairs; element 0 and element 1 of each pair
            are tokenized together as one sequence-pair input.

    Returns:
        A flat NumPy array holding the model's logits for the batch (one
        value per pair when the model has a single output). An empty array
        is returned for empty input.
    """
    # Local import: the file's import block does not bring NumPy into scope.
    import numpy as np

    # Materialize once so one-shot iterables (e.g. generators) can be read
    # for both the first and the second element of each pair.
    pairs = list(text_pairs)
    if not pairs:
        # Nothing to score; skip the tokenizer and model, which are not
        # meant to be called with an empty batch.
        # float32 is assumed to match the dtype of the model's logits.
        return np.empty(0, dtype=np.float32)

    inputs = tokenizer(
        [pair[0] for pair in pairs],
        [pair[1] for pair in pairs],
        return_tensors='tf',
        padding=True,
        truncation=True,
        max_length=128,
    )
    outputs = model(inputs)
    return outputs.logits.numpy().flatten()

# Example usage: score every pair in `texts`, then print each pair with its score
predicted_scores = predict_similarity(texts)
for i, ((text1, text2), score) in enumerate(zip(texts, predicted_scores)):
    # One print call per pair; sep='\n' puts each field on its own line.
    print(
        f'Pair {i+1}:',
        f'Sentence 1: {text1}',
        f'Sentence 2: {text2}',
        f'Predicted Similarity Score: {score}',
        sep='\n',
    )


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Pair 1:
Sentence 1: We process your personal data strictly in accordance with the applicable laws and regulations. Your data will only be used for the following lawful purposes: With Your Consent: We will process your personal data when you have provided your explicit consent for specific purposes. Your consent will be obtained in a clear and transparent manner, and you will have the option to withdraw it at any time.
Sentence 2: We will ensure that you are fully informed about the specific personal data we intend to collect and the purpose for which it will be used. We will inform you of your rights, including how you can exercise these rights under the applicable sections of the law. This includes your right to access, correct, or delete your data, as well as any other rights you may have under the law.
Predicted Similarity Score: 0.371888667345047
