In [None]:
!unzip ./raw_data.zip -d ./raw_data


In [None]:
# Category names in class-index order: the position of a name in this tuple
# is the integer label used for training, so the order must not change.
_CATEGORY_NAMES = (
    'First Party Collection/Use',
    'Third Party Sharing/Collection',
    'User Access, Edit and Deletion',
    'Data Retention',
    'Data Security',
    'International and Specific Audiences',
    'Do Not Track',
    'Policy Change',
    'User Choice/Control',
    'Introductory/Generic',
    'Practice not covered',
    'Privacy contact information',
    'Other',
)

# Mapping from category name to class index.
LABEL_TO_NUMBER = {name: index for index, name in enumerate(_CATEGORY_NAMES)}


In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset

class PrivacyPolicyDataset(Dataset):
    """Text-classification dataset built from every ``.csv`` file in a folder.

    Each CSV is read with pandas; the second column (index 1) is used as the
    text and the third column (index 2) as the category name, which is
    converted to an integer class index through ``label_map``.

    Args:
        data_folder: Directory that is scanned (non-recursively) for ``.csv`` files.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every example is padded/truncated to.
        label_map: Optional mapping from category name to class index.
            Defaults to the notebook-level ``LABEL_TO_NUMBER``.

    Raises:
        KeyError: If a CSV contains a category name missing from ``label_map``.
    """

    def __init__(self, data_folder, tokenizer, max_len, label_map=None):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = []
        self.labels = []

        if label_map is None:
            label_map = LABEL_TO_NUMBER

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample index -> example mapping identical across runs and machines.
        for filename in sorted(os.listdir(data_folder)):
            filepath = os.path.join(data_folder, filename)
            if not (os.path.isfile(filepath) and filename.endswith('.csv')):
                continue

            # Assume the CSV format is correct (update indices as necessary)
            df = pd.read_csv(filepath, delimiter=',')
            texts = df.iloc[:, 1]   # Adjust based on your data
            labels = df.iloc[:, 2]  # Adjust based on your data

            # Rows without a text or a label cannot be tokenized / mapped and
            # would otherwise fail later, in the middle of an epoch.
            usable = texts.notna() & labels.notna()
            texts, labels = texts[usable], labels[usable]

            unknown = sorted({str(label) for label in labels if label not in label_map})
            if unknown:
                raise KeyError(f"Unknown label(s) {unknown} in {filepath}")

            self.texts.extend(texts.tolist())
            # Convert labels to integers
            self.labels.extend(label_map[label] for label in labels)

    def __len__(self):
        """Return the number of (text, label) examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            batch dimension added by ``return_tensors='pt'`` is flattened
            away) and a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        labels = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].flatten()
        attention_mask = inputs['attention_mask'].flatten()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Set up the tokenizer and dataset
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
# Relative path so it matches the `-d ./raw_data` target of the unzip cell
# regardless of the working directory (the previous absolute '/content/raw_data'
# only resolved to the same place when the notebook ran from /content).
data_folder = './raw_data'  # Update path as needed
dataset = PrivacyPolicyDataset(data_folder, tokenizer, max_len=128)

# # Create a DataLoader
# loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
import random
from torch.utils.data import Subset, DataLoader

def split_dataset(dataset, train_size=0.65, seed=None):
    """Randomly split ``dataset`` into a training and a testing ``Subset``.

    Args:
        dataset: Any object supporting ``len()`` and integer indexing.
        train_size: Fraction of the examples (between 0 and 1) assigned to
            the training subset; the count is rounded down.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` so the split is reproducible and the
            global random state is untouched. When ``None`` the global
            ``random`` module is used, as before.

    Returns:
        ``(train_subset, test_subset)``; together they cover every index
        exactly once.

    Raises:
        ValueError: If ``train_size`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_size <= 1.0:
        raise ValueError(f"train_size must be within [0, 1], got {train_size!r}")

    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(train_size * dataset_size)

    if seed is None:
        random.shuffle(indices)
    else:
        random.Random(seed).shuffle(indices)

    train_indices, test_indices = indices[:split], indices[split:]

    # Splitting the dataset into training and testing subsets
    return Subset(dataset, train_indices), Subset(dataset, test_indices)

# Partition the full dataset into a training portion and a held-out portion
train_subset, test_subset = split_dataset(dataset)

# One DataLoader per portion; only the training loader shuffles its examples
train_loader = DataLoader(dataset=train_subset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_subset, batch_size=32, shuffle=False)


In [None]:
# Fetch only the first batch from the DataLoader (no loop variable is left behind)
example_item = next(iter(train_loader))

# Now, print details about one item from the batch
print("Input IDs:", example_item['input_ids'][0])  # Print the first input_ids in the batch
print("Attention Mask:", example_item['attention_mask'][0])  # Print the first attention mask in the batch
print("Labels:", example_item['labels'][0])  # Print the first labels in the batch


In [None]:
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader

class PrivacyPolicyEmbedding:
    """Produces fixed-size sentence embeddings from a pretrained BERT encoder.

    Args:
        model_name: Hugging Face model identifier to load.
        device: Device to run the encoder on. ``None`` (default) selects
            ``'cuda'`` when available and ``'cpu'`` otherwise, so the class
            also works on machines without a GPU.
    """

    def __init__(self, model_name='nlpaueb/legal-bert-base-uncased', device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.device = device
        self.model.to(self.device)
        self.model.eval()  # Set the model to evaluation mode

    def get_embeddings(self, dataloader):
        """Embed every batch of ``dataloader``.

        Each batch must be a dict of tensors containing a ``'labels'`` entry;
        all other entries are forwarded to the model as keyword arguments.

        Returns:
            ``(embeddings, labels)`` as NumPy arrays concatenated over all
            batches along axis 0.

        Raises:
            ValueError: If ``dataloader`` yields no batches.
        """
        all_embeddings = []
        all_labels = []
        with torch.no_grad():
            for batch in dataloader:
                inputs = {key: val.to(self.device) for key, val in batch.items() if key != 'labels'}
                outputs = self.model(**inputs)
                hidden = outputs.last_hidden_state

                mask = inputs.get('attention_mask')
                if mask is None:
                    embeddings = hidden.mean(1)  # Mean pooling over all positions
                else:
                    # Masked mean pooling: padding positions (mask == 0) must not
                    # contribute, otherwise short texts padded to max_length are
                    # dominated by the padding vectors.
                    weights = mask.unsqueeze(-1).to(hidden.dtype)
                    token_counts = weights.sum(1).clamp(min=1)  # avoid division by zero
                    embeddings = (hidden * weights).sum(1) / token_counts

                all_embeddings.append(embeddings.cpu().numpy())  # Transfer embeddings back to CPU and convert to numpy
                all_labels.append(batch['labels'].cpu().numpy())  # Convert labels to numpy

        if not all_embeddings:
            raise ValueError("dataloader produced no batches; nothing to embed")

        all_embeddings = np.concatenate(all_embeddings, axis=0)  # Combine batches
        all_labels = np.concatenate(all_labels, axis=0)  # Combine labels

        return all_embeddings, all_labels

# Build the embedder once (this loads the pretrained encoder)
embedder = PrivacyPolicyEmbedding()

# Embed the training split, then the testing split, with the same encoder
train_embeddings, train_labels = embedder.get_embeddings(dataloader=train_loader)
test_embeddings, test_labels = embedder.get_embeddings(dataloader=test_loader)


In [None]:
# Sanity check: one row per embedded training example; the column count is the
# encoder's hidden size (presumably 768 for a BERT-base model — confirm).
train_embeddings.shape

In [None]:
# import random
# from torch.utils.data import Subset, DataLoader

# def split_dataset_embeddings(embeddings, labels, train_size=0.65):
#     dataset_size = len(embeddings)
#     indices = list(range(dataset_size))
#     split = int(train_size * dataset_size)
#     random.shuffle(indices)
#     train_indices, test_indices = indices[:split], indices[split:]

#     # Creating PT data samplers and loaders for embeddings:
#     train_embeddings, train_labels = embeddings[train_indices], labels[train_indices]
#     test_embeddings, test_labels = embeddings[test_indices], labels[test_indices]

#     return (train_embeddings, train_labels), (test_embeddings, test_labels)

# # Assuming 'embeddings' and 'labels' are obtained after embedding
# (train_embeddings, train_labels), (test_embeddings, test_labels) = split_dataset_embeddings(embeddings, labels)


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def calculate_metrics(preds, labels, average='weighted'):
    """Compute precision, recall, F1 and accuracy for a set of predictions.

    Args:
        preds: Predicted class indices.
        labels: Ground-truth class indices, aligned with ``preds``.
        average: Averaging mode passed to
            ``precision_recall_fscore_support`` (default ``'weighted'``).

    Returns:
        ``(precision, recall, f1, accuracy)``.
    """
    # zero_division=0 yields the same value scikit-learn falls back to for a
    # class that is never predicted / never present, without emitting an
    # UndefinedMetricWarning on every call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average=average, zero_division=0
    )
    accuracy = accuracy_score(labels, preds)
    return precision, recall, f1, accuracy


In [None]:
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

# Define hyperparameters
num_labels = len(LABEL_TO_NUMBER)
num_epochs = 15  # Adjust based on your training configuration
num_batches_per_epoch = len(train_loader)
# The scheduler is stepped once per batch, so the schedule has to span every
# batch of every epoch. (A later hard-coded override to 1000 steps made the
# warmup length and the decay horizon disagree and has been removed.)
num_training_steps = num_epochs * num_batches_per_epoch
num_warmup_steps = int(0.1 * num_training_steps)  # Set to 10% of total training steps
# Fall back to CPU when no GPU is present instead of failing on .to(device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=num_labels)
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
# NOTE: the training loop uses the loss returned by the model itself; this
# object is kept only so the name stays available to other cells.
loss_func = CrossEntropyLoss()

# Training loop with metrics
for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    train_predictions = []
    train_true_labels = []
    loss_sum = 0.0  # sum of per-example losses over the epoch
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The optimizer holds every model parameter, so one zero_grad() suffices
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Weight the batch loss by batch size so a smaller final batch does
        # not skew the epoch average.
        loss_sum += loss.item() * labels.size(0)

        predicted = outputs.logits.argmax(dim=1)
        train_predictions.extend(predicted.detach().cpu().numpy())
        train_true_labels.extend(labels.detach().cpu().numpy())

    # Average loss over the whole epoch (previously only the last batch's loss was reported)
    train_loss = loss_sum / max(len(train_true_labels), 1)
    train_precision, train_recall, train_f1, train_accuracy = calculate_metrics(train_predictions, train_true_labels)
    print(f'Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}, Accuracy: {train_accuracy:.4f}')

    # ---- Evaluation phase ----
    model.eval()
    eval_predictions = []
    eval_true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted = outputs.logits.argmax(dim=1)
            eval_predictions.extend(predicted.detach().cpu().numpy())
            eval_true_labels.extend(labels.detach().cpu().numpy())

    eval_precision, eval_recall, eval_f1, eval_accuracy = calculate_metrics(eval_predictions, eval_true_labels)
    print(f'Epoch {epoch + 1}/{num_epochs} - Eval Precision: {eval_precision:.4f}, Eval Recall: {eval_recall:.4f}, Eval F1: {eval_f1:.4f}, Eval Accuracy: {eval_accuracy:.4f}')


In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg


In [None]:
import spacy
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Load the English NLP model
nlp = spacy.load('en_core_web_lg')

# OPP-115 Categories
opp_categories = [
    "First Party Collection/Use",
    "Third Party Sharing/Collection",
    "User Choice/Control",
    "User Access, Edit, and Deletion",
    "Data Retention",
    "Data Security",
    "Policy Change",
    "Do Not Track",
    "International and Specific Audiences",
    "Other"
]

# GDPR Article 5 Principles
gdpr_principles = [
    "processed lawfully, fairly and in a transparent manner (‘lawfulness, fairness and transparency’)",
    "collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes (‘purpose limitation’)",
    "adequate, relevant and limited to what is necessary (‘data minimisation’)",
    "accurate and, where necessary, kept up to date (‘accuracy’)",
    "kept in a form which permits identification of data subjects for no longer than necessary (‘storage limitation’)",
    "processed in a manner that ensures security of the data (‘integrity and confidentiality’)",
    "the controller shall be responsible for and be able to demonstrate compliance (‘accountability’)"
]

# Create embeddings for OPP-115 categories and GDPR principles
# (one document vector per text, as produced by the spaCy pipeline)
opp_embeddings = np.array([nlp(text).vector for text in opp_categories])
gdpr_embeddings = np.array([nlp(text).vector for text in gdpr_principles])

# Combine embeddings: OPP-115 rows first, GDPR rows after them
combined_embeddings = np.vstack((opp_embeddings, gdpr_embeddings))

# Perform clustering. Newer scikit-learn releases name the distance argument
# `metric`; older ones only accept `affinity`. Try the current name first and
# fall back so the cell runs on both.
_cluster_kwargs = dict(n_clusters=None, distance_threshold=0.5, linkage='complete')
try:
    clustering = AgglomerativeClustering(metric='cosine', **_cluster_kwargs)
except TypeError:
    clustering = AgglomerativeClustering(affinity='cosine', **_cluster_kwargs)
clustering.fit(combined_embeddings)

# Assign clusters
clusters = clustering.labels_

# Print mappings (same row order as combined_embeddings)
for i, category in enumerate(opp_categories + gdpr_principles):
    print(f"{category} is mapped to cluster {clusters[i]}")


In [None]:
import spacy
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import networkx as nx
import matplotlib.pyplot as plt

# Load the English NLP model
nlp = spacy.load('en_core_web_lg')

# OPP-115 Categories
opp_categories = [
    "First Party Collection/Use",
    "Third Party Sharing/Collection",
    "User Choice/Control",
    "User Access, Edit, and Deletion",
    "Data Retention",
    "Data Security",
    "Policy Change",
    "Do Not Track",
    "International and Specific Audiences",
    "Other"
]

# GDPR Article 5 Principles
gdpr_principles = [
    "lawfulness, fairness and transparency",
    "purpose limitation",
    "data minimisation",
    "accuracy",
    "storage limitation",
    "integrity and confidentiality",
    "accountability"
]

# Embeddings
opp_embeddings = np.array([nlp(text).vector for text in opp_categories])
gdpr_embeddings = np.array([nlp(text).vector for text in gdpr_principles])

# Combine and cluster. Newer scikit-learn releases name the distance argument
# `metric`; older ones only accept `affinity`. Try the current name first and
# fall back so the cell runs on both.
combined_embeddings = np.vstack((opp_embeddings, gdpr_embeddings))
_cluster_kwargs = dict(n_clusters=None, distance_threshold=0.6, linkage='complete')
try:
    clustering = AgglomerativeClustering(metric='cosine', **_cluster_kwargs)
except TypeError:
    clustering = AgglomerativeClustering(affinity='cosine', **_cluster_kwargs)
clustering.fit(combined_embeddings)

# Create a graph
G = nx.Graph()

# Add nodes with labels
for text in opp_categories + gdpr_principles:
    G.add_node(text, label=text)

# Add edges based on clusters: connect an OPP-115 category to a GDPR principle
# when both fell into the same cluster. GDPR rows follow the OPP-115 rows in
# combined_embeddings, hence the len(opp_categories) offset.
for i in range(len(opp_categories)):
    for j in range(len(gdpr_principles)):
        if clustering.labels_[i] == clustering.labels_[len(opp_categories) + j]:
            G.add_edge(opp_categories[i], gdpr_principles[j])

# Draw the graph
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, seed=42)  # positions for all nodes
nx.draw_networkx_nodes(G, pos, node_size=7000, node_color='skyblue')
nx.draw_networkx_edges(G, pos, width=2)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.title("Mapping OPP-115 Categories to GDPR Principles")
plt.show()

# Print mappings in a readable format
cluster_map = {}
all_labels = opp_categories + gdpr_principles  # Create a single list of all labels

for i, label in enumerate(clustering.labels_):
    cluster_map.setdefault(label, []).append(all_labels[i])  # Append correct item

# Display readable mappings (only clusters that contain both kinds of text)
for cluster, texts in cluster_map.items():
    opps = [t for t in texts if t in opp_categories]
    gdprs = [t for t in texts if t in gdpr_principles]
    if opps and gdprs:
        print(f"{' and '.join(opps)} map to {', '.join(gdprs)}")


In [None]:
!pip install transformers


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load pre-trained model tokenizer (vocabulary)
# NOTE(review): this rebinds the notebook-level name `tokenizer`, which an
# earlier cell assigned to the legal-BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model
# NOTE(review): this rebinds the notebook-level name `model`, which an earlier
# cell assigned to the fine-tuned sequence classifier; that object is no
# longer reachable through this name afterwards.
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Return the last-layer [CLS] embedding(s) of ``text`` as a NumPy array.

    Uses the notebook-level ``tokenizer`` and ``model``. The input is
    truncated to at most 128 tokens and encoded without gradient tracking.
    """
    batch = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        hidden_states = model(**batch).last_hidden_state
    # Position 0 of every sequence holds the [CLS] token
    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors.numpy()

# OPP-115 Categories and GDPR Principles as lists of strings.
# Every entry has the form "<name>: <description>"; the name part is what the
# report below prints.
opp_categories = [
    "First Party Collection/Use: how and why a service provider collects user information.",
    "Third Party Sharing/Collection: how user information may be shared with or collected by third parties.",
    "User Choice/Control: choices and control options available to users.",
    "User Access, Edit, & Deletion: if and how users may access, edit, or delete their information.",
    "Data Retention: how long user information is stored.",
    "Data Security: how user information is protected.",
    "Policy Change: if and how users will be informed about changes to the privacy policy.",
    "Do Not Track: if and how Do Not Track signals for online tracking and advertising are honored.",
    "International & Specific Audiences: practices that pertain only to a specific group of users (e.g., children, Europeans, or California residents).",
    "Other: additional sub-labels for introductory or general text, contact information, and practices not covered by the other categories."
]

gdpr_principles = [
    "Lawfulness, Fairness and Transparency: processed lawfully, fairly and in a transparent manner",
    "Purpose Limitation: collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes",
    "Data Minimisation: adequate, relevant and limited to what is necessary",
    "Accuracy: accurate and, where necessary, kept up to date",
    "Storage Limitation: kept in a form which permits identification of data subjects for no longer than necessary",
    "Integrity and Confidentiality: processed in a manner that ensures security of the data",
    "Accountability: the controller shall be responsible for and be able to demonstrate compliance"
]

# Get embeddings (one row per text)
opp_embeddings = np.vstack([get_bert_embeddings(cat) for cat in opp_categories])
gdpr_embeddings = np.vstack([get_bert_embeddings(prin) for prin in gdpr_principles])

# Compute cosine similarity between each category and principle
from sklearn.metrics.pairwise import cosine_similarity

# similarity_matrix[i][j]: OPP-115 category i vs. GDPR principle j
similarity_matrix = cosine_similarity(opp_embeddings, gdpr_embeddings)

# Display the top matches for each OPP category based on a similarity threshold
similarity_threshold = 0.85  # Define your own threshold here
for i, category in enumerate(opp_categories):
    # Print the full name before the colon instead of a fixed-width slice,
    # which cut names at an arbitrary character.
    category_name = category.split(':', 1)[0]
    print(f"\n{category_name} is similar to the following GDPR principles with a similarity above {similarity_threshold}:")
    for j, principle in enumerate(gdpr_principles):
        if similarity_matrix[i][j] > similarity_threshold:
            principle_name = principle.split(':', 1)[0]
            print(f"  {principle_name} (Similarity: {similarity_matrix[i][j]:.2f})")


In [None]:
!unzip OPP-115_v1_0.zip -d OPP-115_v1_0


In [None]:
!pip install spacy beautifulsoup4
# !python -m spacy download en_core_web_sm


In [None]:
import spacy
import os
from bs4 import BeautifulSoup

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Strip HTML from ``text`` and return a normalized, space-joined string.

    The markup is removed with BeautifulSoup, the remaining text is run
    through the notebook-level spaCy pipeline ``nlp``, and every token that
    is neither punctuation nor pure whitespace is replaced by its lowercased
    lemma.

    Args:
        text: Raw HTML (or plain text) of a document.

    Returns:
        The lemmas joined by single spaces.
    """
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text(separator=" ")

    doc = nlp(clean_text)

    # Use lemmatization and lowercasing. Whitespace-only tokens (runs of
    # spaces/newlines left behind by the HTML extraction) are skipped as well;
    # otherwise they end up in the output as stray blanks and line breaks.
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_space)
    ]

    return " ".join(tokens)


# Preprocessed policy text keyed by the policy's base name
preprocessed_policies_dict = {}
directory = './OPP-115_v1_0/OPP-115/sanitized_policies'

for filename in os.listdir(directory):
    if not filename.endswith('.html'):
        continue
    # Base name = file name without the part up to the first underscore and
    # without the final extension.
    base_name = filename.partition('_')[2].rsplit('.', 1)[0]
    path = os.path.join(directory, filename)
    with open(path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        preprocessed_text = preprocess_text(html_content)
        preprocessed_policies_dict[base_name] = preprocessed_text


In [None]:
import pandas as pd
import os

annotations_dir = './OPP-115_v1_0/OPP-115/annotations/'

# Per-policy record keyed by base file name; each record holds the category
# column of the annotation CSV and the matching preprocessed policy text.
dataframes = {}

for annotation_filename in os.listdir(annotations_dir):
    if not annotation_filename.endswith('.csv'):
        continue

    # Same base-name rule as for the policy files: drop everything up to the
    # first underscore and the final extension.
    base_name = annotation_filename.partition('_')[2].rsplit('.', 1)[0]
    annotation_path = os.path.join(annotations_dir, annotation_filename)

    # The annotation files are read without a header row
    annotation_df = pd.read_csv(annotation_path, header=None)

    dataframes[base_name] = {
        'categories': annotation_df[5],
        # None when no policy with this base name was preprocessed
        'preprocessed_text': preprocessed_policies_dict.get(base_name),
    }


yahoo_df = dataframes['yahoo.com']
yahoo_df['categories']


In [None]:
from collections import defaultdict
import pandas as pd

# Build one record per policy: its source name, its preprocessed text and the
# distinct annotation categories found for it.
final_dataset = []

for base_name, data in dataframes.items():
    # Sort the distinct categories so the list order is the same on every run
    # (plain set iteration order for strings changes between interpreter
    # sessions). key=str keeps the sort safe if a non-string value such as
    # NaN is present.
    distinct_categories = sorted(set(data['categories']), key=str)

    # A policy without any annotation row contributes no record
    if not distinct_categories:
        continue

    final_dataset.append({
        'source': base_name,
        'preprocessed_text': data['preprocessed_text'],
        'categories': distinct_categories
    })

final_df = pd.DataFrame(final_dataset)

# Display the shape of the DataFrame and the first few rows to verify the structure
print(final_df.shape)
final_df.head()


In [None]:
# Show the preprocessed text stored in the first row of final_df
final_df.iloc[0]['preprocessed_text']

In [None]:
# First Party Collection/Use	- maps to -> Lawfulness, Fairness, Transparency | Purpose Limitation | Data Minimization

# Third Party Sharing/Collection - maps to ->	Lawfulness, Fairness, Transparency | Purpose Limitation | Data Minimization

# User Choice/Control	 - maps to -> Lawfulness, Fairness, Transparency

# User Access, Edit, and Deletion  - maps to ->	Lawfulness, Fairness, Transparency | Accuracy

# Data Retention  - maps to ->	Storage Limitation

# Data Security  - maps to -> Integrity and Confidentiality

# Policy Change	  - maps to ->	 Lawfulness, Fairness, Transparency

# Do Not Track

# International and Specific Audiences	  - maps to ->	 Lawfulness, Fairness, Transparency

# Other	(No Direct Mapping)

# Maps each OPP-115 category name to the GDPR principle numbers it is taken
# as evidence for, where:
#
#   1 is lawfulness, fairness and transparency
#   2 is purpose limitation
#   3 is data minimization
#   4 is accuracy
#   5 is storage limitation
#   6 is integrity and confidentiality
#   7 is accountability
label_mapping = {
    "First Party Collection/Use": [1, 2, 3],
    "Third Party Sharing/Collection": [1, 2, 3],
    "User Choice/Control": [1],
    # Both spellings are accepted: LABEL_TO_NUMBER uses the name without the
    # serial comma, so with only the second spelling a label written the
    # first way would silently map to nothing.
    "User Access, Edit and Deletion": [1, 4],
    "User Access, Edit, and Deletion": [1, 4],
    "Data Retention": [5],
    "Data Security": [6],
    "Policy Change": [1],
    "Do Not Track": [],
    "International and Specific Audiences": [1],
    "Other": []
}


def transform_document_labels(document_labels):
    """Translate OPP-115 category names into GDPR principle numbers.

    Args:
        document_labels: Iterable of category names for one document.

    Returns:
        Sorted list of the distinct principle numbers that ``label_mapping``
        associates with the given names. Names absent from the mapping are
        ignored.
    """
    principles = set()
    for label in document_labels:
        principles.update(label_mapping.get(label, ()))
    return sorted(principles)

# GDPR principle numbers derived from each document's category list
final_df['gdpr_principles'] = final_df['categories'].map(transform_document_labels)

total_principles = 7

# Share of the principles that were matched for the document, in percent
final_df['compliance_percentage'] = final_df['gdpr_principles'].map(
    lambda principles: (len(principles) / total_principles) * 100
)

final_df.head()



In [None]:
# preprocessed_policies = final_df['preprocessed_text'].tolist()

import pandas as pd
import ast

def _parse_label_cell(cell):
    """Evaluate a string cell as a Python literal; return any other value unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


df = pd.read_csv('./sentences_gdpr_labels.csv')
df['sentence_labels'] = df['sentence_labels'].apply(_parse_label_cell)

# Texts that are sent to the embedding model
preprocessed_policies = df['sentence_text'].tolist()


In [None]:
!pip install openai

In [None]:
%env OPENAI_API_KEY=...

In [None]:
import os
from openai import OpenAI

# The API key is read from the environment; it is never written into the notebook
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_embeddings(texts):
    """Embed each text with the notebook-level OpenAI ``client``.

    One request is issued per text, in order.

    Returns:
        A list with one embedding (as returned by the API) per input text.
    """
    def _embed_one(text):
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=text,
            encoding_format="float"
        )
        return response.data[0].embedding

    return [_embed_one(text) for text in texts]

# Example usage with your preprocessed privacy policies
ada_embeddings = get_embeddings(preprocessed_policies)
# Show only how many embeddings came back; echoing the list itself would
# print every vector into the cell output.
len(ada_embeddings)


In [None]:
import torch
import ast

class PrivacyPolicyDataset(torch.utils.data.Dataset):
    """Dataset of precomputed embedding vectors and their integer labels.

    Args:
        embeddings: Sequence of equal-length numeric vectors (nested lists,
            NumPy array or tensor).
        labels: Integer class labels, one per embedding.

    Raises:
        ValueError: If the number of embeddings and labels differ.
    """

    def __init__(self, embeddings, labels):
        # Always store float32: a float64 input (e.g. a NumPy array) would
        # otherwise yield float64 tensors, which do not match the default
        # float32 parameters of torch.nn layers.
        self.embeddings = torch.as_tensor(embeddings, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.long)
        if len(self.embeddings) != len(self.labels):
            raise ValueError(
                f"Got {len(self.embeddings)} embeddings but {len(self.labels)} labels"
            )

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        return self.embeddings[idx], self.labels[idx]

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

def _as_parsed_labels(cell):
    """Evaluate a string cell as a Python literal; return any other value unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


df['sentence_labels'] = df['sentence_labels'].apply(_as_parsed_labels)
labels = df['sentence_labels'].tolist()

# Assuming `labels` is a list of integer labels corresponding to categories
dataset = PrivacyPolicyDataset(ada_embeddings, labels)


In [None]:
# `get_embeddings` returns a plain Python list, which has no `.shape`;
# report (number of vectors, vector length) explicitly instead.
(len(ada_embeddings), len(ada_embeddings[0]) if len(ada_embeddings) > 0 else 0)

In [None]:
import torch.nn as nn

class Classifier(nn.Module):
    """Two-layer feed-forward classifier over fixed-size embeddings.

    Args:
        embedding_dim: Size of each input vector.
        num_labels: Number of output classes.
    """

    def __init__(self, embedding_dim, num_labels):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_labels)
        # Kept for callers that want probabilities: `model.softmax(model(x))`.
        # It is deliberately not applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw, unnormalized class scores (logits) for a batch ``x``.

        nn.CrossEntropyLoss applies log-softmax itself, so feeding it
        already-softmaxed outputs would normalize twice and flatten the
        gradients. The arg-max over the logits is the same as over the
        probabilities, so prediction code is unaffected.
        """
        x = self.fc1(x)
        x = self.relu(x)
        return self.fc2(x)

# NOTE(review): valid targets for nn.CrossEntropyLoss are 0 .. num_classes - 1;
# confirm the label values in the dataset fall in that range.
num_classes = 7

# Take the input width from the embeddings themselves rather than hard-coding
# 768 (the BERT-base hidden size): the vectors fed to this model come from the
# OpenAI embedding endpoint, whose dimensionality is different.
embedding_dim = len(ada_embeddings[0])

# Initialize the model
model = Classifier(embedding_dim=embedding_dim, num_labels=num_classes)  # Adjust `num_classes` as needed


In [None]:
from torch.optim import Adam
from torch.utils.data import DataLoader

optimizer = Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

for epoch in range(10):  # number of epochs
    # Make sure the model is in training mode even if an evaluation cell
    # (which calls model.eval()) was executed before re-running this one.
    model.train()
    loss_sum = 0.0    # sum of per-example losses in this epoch
    examples_seen = 0
    for embeddings, labels in data_loader:
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        examples_seen += batch_size

    # Report the epoch average rather than the loss of whichever batch came last
    print(f"Epoch {epoch+1}, Loss: {loss_sum / max(examples_seen, 1)}")


In [None]:
# Assume `val_dataset` prepared similarly to `train_dataset`
# NOTE(review): no cell visible here defines `val_dataset`; it has to be
# created before this cell can run.
val_loader = DataLoader(val_dataset, batch_size=16)
model.eval()  # Set the model to evaluation mode

correct = 0
with torch.no_grad():
    for embeddings, labels in val_loader:
        outputs = model(embeddings)
        # Predicted class = index of the largest score in each row
        predicted = outputs.max(dim=1).indices
        correct += (predicted == labels).sum().item()

# Fraction of validation examples predicted correctly
accuracy = correct / len(val_dataset)
print(f"Validation Accuracy: {accuracy}")
