In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('./sentences_gdpr_labels.csv')

# Display the first few rows of the DataFrame to check its contents
print(df.head(5))


       policy_name  sentence_id   
0  1221_gwdocs.com            0  \
1  1221_gwdocs.com            1   
2  1221_gwdocs.com            2   
3  1221_gwdocs.com            3   
4  1221_gwdocs.com            4   

                                       sentence_text        sentence_labels   
0  Privacy Policy The GW Medical Faculty Associat...  [0, 0, 0, 0, 0, 0, 0]  \
1   In general you can visit The MFA on the Web w...  [1, 1, 1, 0, 0, 0, 0]   
2   In some areas however you may choose services...  [1, 1, 1, 0, 0, 0, 0]   
3   Once any personally identifiable information ...  [1, 1, 1, 0, 0, 0, 0]   
4   The MFA uses the information collected from y...  [1, 1, 1, 0, 0, 0, 0]   

   sentence_length  is_included  
0               14         True  
1               19         True  
2               23         True  
3               25         True  
4               21         True  


In [2]:
# Calculate the percentage of 'is_included' that are True
percentage_included = df['is_included'].mean() * 100

# Print the result
print(f"Percentage of 'is_included' that is True: {percentage_included:.2f}%")


Percentage of 'is_included' that is True: 86.91%


In [3]:
# Filter the DataFrame to only include rows where 'is_included' is True
df_filtered = df[df['is_included'] == True]

# Check the first few rows of the new DataFrame to confirm the rows are dropped
print(df_filtered.head())

# Optionally, you can view how many rows remain in the filtered DataFrame
print("Number of rows in the filtered DataFrame:", len(df_filtered))


       policy_name  sentence_id   
0  1221_gwdocs.com            0  \
1  1221_gwdocs.com            1   
2  1221_gwdocs.com            2   
3  1221_gwdocs.com            3   
4  1221_gwdocs.com            4   

                                       sentence_text        sentence_labels   
0  Privacy Policy The GW Medical Faculty Associat...  [0, 0, 0, 0, 0, 0, 0]  \
1   In general you can visit The MFA on the Web w...  [1, 1, 1, 0, 0, 0, 0]   
2   In some areas however you may choose services...  [1, 1, 1, 0, 0, 0, 0]   
3   Once any personally identifiable information ...  [1, 1, 1, 0, 0, 0, 0]   
4   The MFA uses the information collected from y...  [1, 1, 1, 0, 0, 0, 0]   

   sentence_length  is_included  
0               14         True  
1               19         True  
2               23         True  
3               25         True  
4               21         True  
Number of rows in the filtered DataFrame: 10672


In [4]:
df = df[df['is_included'] == True]


In [5]:
import ast

# Convert string representations of lists into actual lists
df['sentence_labels'] = df['sentence_labels'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Verify the conversion
print(df['sentence_labels'].head())


0    [0, 0, 0, 0, 0, 0, 0]
1    [1, 1, 1, 0, 0, 0, 0]
2    [1, 1, 1, 0, 0, 0, 0]
3    [1, 1, 1, 0, 0, 0, 0]
4    [1, 1, 1, 0, 0, 0, 0]
Name: sentence_labels, dtype: object


In [10]:
import torch

# Assuming 'sentence_labels' is already in the correct list format
labels_tensor = torch.tensor(df['sentence_labels'].tolist(), dtype=torch.float32)


In [5]:
labels_tensor.shape

torch.Size([12279, 7])

In [7]:
policies = df['sentence_text'].tolist()


In [8]:
!pip install sentence-transformers


Defaulting to user installation because normal site-packages is not writeable


In [9]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each sentence
sentence_embeddings = model.encode(policies)


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
sentence_embeddings.shape


(12279, 384)

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    sentence_embeddings, labels_tensor, test_size=0.2, random_state=42)


In [21]:
from torch.utils.data import DataLoader, TensorDataset
import torch

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=16)


  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
  test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))


In [22]:
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

class TransformerClassifier(nn.Module):
    def __init__(self, num_features, num_labels, num_layers=1, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=num_features,
            nhead=8,  # Number of attention heads
            dim_feedforward=2048,  # Dimension of the feedforward network
            dropout=dropout
        )
        self.transformer = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(num_features, num_labels)

    def forward(self, x):
        if x.dim() == 2:  # Check if the input is 2D and adjust
            x = x.unsqueeze(1)  # Add a sequence length dimension of 1
        # No need to permute for a single timestep
        x = self.transformer(x)
        x = x.mean(dim=1)  # Pooling operation - could also be max or last token
        return torch.sigmoid(self.output_layer(x))

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model Initialization
model = TransformerClassifier(num_features=384, num_labels=7, num_layers=2, dropout=0.1)
model.to(device)

# Data preparation (assuming X_train, y_train are ready and loaded as tensors)
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Optimizer
optimizer = Adam(model.parameters(), lr=1e-4)
criterion = nn.BCELoss()
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)  # Decays the learning rate by 0.95 every epoch

# Training loop
model.train()
for epoch in range(10):  # Modify epochs as needed
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    scheduler.step()
    print(f'Epoch {epoch + 1}: Loss = {loss.item()}')


  train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))


In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = (outputs > 0.5).int()  # Apply a threshold to get binary predictions
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='samples')
accuracy = accuracy_score(true_labels, predictions)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}')



Precision: 0.7245, Recall: 0.7011, F1 Score: 0.6914, Accuracy: 0.6047


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save the entire model
torch.save(model, './model.pth')

# Or save only the state dictionary
torch.save(model.state_dict(), './model_state_dict.pth')


In [19]:
from sentence_transformers import SentenceTransformer
import torch

# Load the model (ensure it's loaded or define it if it's not already in the environment)
model.eval()  # Set the model to evaluation mode

# Sample text
sample_text = "We reveal only the last four digits of your credit card numbers when confirming an order."
sample_text = "Our site includes third-party advertising and links to other Web sites."

# Encode the text using the same SBERT model used for training
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sample_embedding = sbert_model.encode([sample_text])  # Ensure it's in a list for batch processing

# Convert to tensor
sample_tensor = torch.tensor(sample_embedding, dtype=torch.float32).to(device)

# Make prediction
with torch.no_grad():
    output = model(sample_tensor)
    predicted = (output > 0.5).int().cpu().numpy()  # Apply threshold to get binary labels

# Output the predicted labels in the format [1, 1, 1, 0, 0, 0, 0]
print(predicted[0])




[0 0 0 0 0 0 0]
