In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('./sentences_gdpr_labels.csv')

# Display the first few rows of the DataFrame to check its contents
print(df.head(5))


       policy_name  sentence_id   
0  1221_gwdocs.com            0  \
1  1221_gwdocs.com            1   
2  1221_gwdocs.com            2   
3  1221_gwdocs.com            3   
4  1221_gwdocs.com            4   

                                       sentence_text        sentence_labels   
0  Privacy Policy The GW Medical Faculty Associat...  [0, 0, 0, 0, 0, 0, 0]  \
1   In general you can visit The MFA on the Web w...  [1, 1, 1, 0, 0, 0, 0]   
2   In some areas however you may choose services...  [1, 1, 1, 0, 0, 0, 0]   
3   Once any personally identifiable information ...  [1, 1, 1, 0, 0, 0, 0]   
4   The MFA uses the information collected from y...  [1, 1, 1, 0, 0, 0, 0]   

   sentence_length  is_included  
0               14         True  
1               19         True  
2               23         True  
3               25         True  
4               21         True  


In [2]:
# Calculate the percentage of 'is_included' that are True
percentage_included = df['is_included'].mean() * 100

# Print the result
print(f"Percentage of 'is_included' that is True: {percentage_included:.2f}%")


Percentage of 'is_included' that is True: 86.91%


In [3]:
# Filter the DataFrame to only include rows where 'is_included' is True
df_filtered = df[df['is_included'] == True]

# Check the first few rows of the new DataFrame to confirm the rows are dropped
print(df_filtered.head())

# Optionally, you can view how many rows remain in the filtered DataFrame
print("Number of rows in the filtered DataFrame:", len(df_filtered))


       policy_name  sentence_id   
0  1221_gwdocs.com            0  \
1  1221_gwdocs.com            1   
2  1221_gwdocs.com            2   
3  1221_gwdocs.com            3   
4  1221_gwdocs.com            4   

                                       sentence_text        sentence_labels   
0  Privacy Policy The GW Medical Faculty Associat...  [0, 0, 0, 0, 0, 0, 0]  \
1   In general you can visit The MFA on the Web w...  [1, 1, 1, 0, 0, 0, 0]   
2   In some areas however you may choose services...  [1, 1, 1, 0, 0, 0, 0]   
3   Once any personally identifiable information ...  [1, 1, 1, 0, 0, 0, 0]   
4   The MFA uses the information collected from y...  [1, 1, 1, 0, 0, 0, 0]   

   sentence_length  is_included  
0               14         True  
1               19         True  
2               23         True  
3               25         True  
4               21         True  
Number of rows in the filtered DataFrame: 10672


In [4]:
df = df[df['is_included'] == True]


In [5]:
import ast

# Convert string representations of lists into actual lists
df['sentence_labels'] = df['sentence_labels'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Verify the conversion
print(df['sentence_labels'].head())


0    [0, 0, 0, 0, 0, 0, 0]
1    [1, 1, 1, 0, 0, 0, 0]
2    [1, 1, 1, 0, 0, 0, 0]
3    [1, 1, 1, 0, 0, 0, 0]
4    [1, 1, 1, 0, 0, 0, 0]
Name: sentence_labels, dtype: object


In [6]:
import torch

# Assuming 'sentence_labels' is already in the correct list format
labels_tensor = torch.tensor(df['sentence_labels'].tolist(), dtype=torch.float32)


In [7]:
labels_tensor.shape

torch.Size([10672, 7])

In [8]:
policies = df['sentence_text'].tolist()


In [9]:
!pip install -U transformers
!pip install accelerate -U


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [10]:
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2Model.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

assert model.config.pad_token_id is not None, "Pad token not set in model configuration."

model.resize_token_embeddings(len(tokenizer))

# Generate embeddings for each sentence
def get_gpt2_embeddings(text):
    # Encode text to get input ids and attention mask
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    model.eval()
    with torch.no_grad():
        output = model(**encoded_input)
    # Get the embeddings from the last hidden state
    embeddings = output.last_hidden_state[:, -1, :]
    return embeddings.numpy()

sentence_embeddings = np.array([get_gpt2_embeddings(sentence) for sentence in policies])


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
sentence_embeddings.shape


(10672, 1, 768)

In [12]:
# Flatten the embeddings by removing the middle dimension
sentence_embeddings = sentence_embeddings.squeeze(axis=1)
sentence_embeddings.shape


(10672, 768)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    sentence_embeddings, labels_tensor, test_size=0.2, random_state=42)


In [14]:
from torch.utils.data import DataLoader, TensorDataset
import torch

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=16)


  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
  test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))


In [15]:
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TransformerClassifier(nn.Module):
    def __init__(self, num_features, num_labels, num_layers=1, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=num_features,
            nhead=8,  # Number of attention heads
            dim_feedforward=2048,  # Dimension of the feedforward network
            dropout=dropout
        )
        self.transformer = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(num_features, num_labels)

    def forward(self, x):
        if x.dim() == 2:  # Check if the input is 2D and adjust
            x = x.unsqueeze(1)  # Add a sequence length dimension of 1
        # No need to permute for a single timestep
        x = self.transformer(x)
        x = x.mean(dim=1)  # Pooling operation - could also be max or last token
        return torch.sigmoid(self.output_layer(x))

# Initialize model
model = TransformerClassifier(num_features=768, num_labels=7, num_layers=2, dropout=0.1)
model.to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)  # Decays the learning rate by 0.95 every epoch

# Training loop
model.train()
for epoch in range(10):  # Number of epochs
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    scheduler.step()
    print(f'Epoch {epoch + 1}: Loss = {loss.item()}')




Epoch 1: Loss = 0.3144153952598572
Epoch 2: Loss = 0.378030389547348
Epoch 3: Loss = 0.5540672540664673
Epoch 4: Loss = 0.3359470069408417
Epoch 5: Loss = 0.5924244523048401
Epoch 6: Loss = 0.5172045826911926
Epoch 7: Loss = 0.4706898033618927
Epoch 8: Loss = 0.3744139075279236
Epoch 9: Loss = 0.3962293565273285
Epoch 10: Loss = 0.2517908811569214


In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = (outputs > 0.5).int()  # Threshold predictions
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='samples')
accuracy = accuracy_score(true_labels, predictions)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}')



Precision: 0.6600, Recall: 0.7500, F1 Score: 0.6772, Accuracy: 0.5199


  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# Save the entire model
torch.save(model, './model.pth')

# Or save only the state dictionary
torch.save(model.state_dict(), './model_state_dict.pth')


In [17]:
import torch

# Sample text
sample_text = "We reveal only the last four digits of your credit card numbers when confirming an order."
sample_text = "Our site includes third-party advertising and links to other Web sites."

# Encode the text using the same SBERT model used for training
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

gpt_model = GPT2Model.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

assert gpt_model.config.pad_token_id is not None, "Pad token not set in model configuration."

gpt_model.resize_token_embeddings(len(tokenizer))

gpt_model.to(device)  # Ensure BERT is on the correct device
gpt_model.eval()  # Set BERT to evaluation mode

def get_gpt_embeddings(text_list):
    # Tokenize and encode sentences for BERT input
    encoded_input = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=128)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}  # Move to the same device as model
    with torch.no_grad():
        outputs = gpt_model(**encoded_input)
    # Get the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state[:, -1, :]
    return embeddings.numpy()

sample_embedding = get_gpt_embeddings([sample_text])  # Ensure it's in a list for batch processing

# Convert to tensor
sample_tensor = torch.tensor(sample_embedding, dtype=torch.float32).to(device)

# Make prediction
with torch.no_grad():
    output = model(sample_tensor)
    predicted = (output > 0.5).int().cpu().numpy()  # Apply threshold to get binary labels

# Output the predicted labels
print(predicted[0])  



[1 1 1 0 0 0 0]
