Import Package

In [1]:
from dotenv import load_dotenv
import os
from groq import Groq

import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import json


In [2]:
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")

if not api_key:
    raise ValueError("GROQ_API_KEY is not set!")

client = Groq(api_key=api_key)


In [3]:
import requests

url = "https://api.groq.com/openai/v1/models"

headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

response = requests.get(url, headers=headers)

print(response.json())

{'object': 'list', 'data': [{'id': 'qwen-2.5-coder-32b', 'object': 'model', 'created': 1739494572, 'owned_by': 'Alibaba Cloud', 'active': True, 'context_window': 131072, 'public_apps': None, 'max_completion_tokens': 131072}, {'id': 'playai-tts', 'object': 'model', 'created': 1740682771, 'owned_by': 'PlayAI', 'active': True, 'context_window': 8192, 'public_apps': None, 'max_completion_tokens': 8192}, {'id': 'deepseek-r1-distill-qwen-32b', 'object': 'model', 'created': 1738891590, 'owned_by': 'DeepSeek / Alibaba Cloud', 'active': True, 'context_window': 131072, 'public_apps': None, 'max_completion_tokens': 131072}, {'id': 'gemma2-9b-it', 'object': 'model', 'created': 1693721698, 'owned_by': 'Google', 'active': True, 'context_window': 8192, 'public_apps': None, 'max_completion_tokens': 8192}, {'id': 'llama-3.3-70b-versatile', 'object': 'model', 'created': 1733447754, 'owned_by': 'Meta', 'active': True, 'context_window': 131072, 'public_apps': None, 'max_completion_tokens': 32768}, {'id': 

Load Datasets

In [4]:
train_data = pd.read_csv("data/Majority/train.csv")
val_data = pd.read_csv("data/Majority/val.csv")
test_data = pd.read_csv("data/Majority/test.csv")


In [5]:
train_data.head()


Unnamed: 0,id,text,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information
0,0,Information that Sci-News.com May Collect Onli...,1,0,0,0,0,0,0,0,0,0,0,0
1,1,#NAME?,1,0,0,0,0,0,0,0,0,0,0,0
2,2,Use of Cookies Sci-News.com uses cookie techno...,1,0,0,0,0,0,0,0,0,0,0,0
3,3,During the course of any visit to the Sci-News...,1,0,0,0,0,0,0,0,0,0,0,0
4,4,Please note that during or after your visits t...,0,1,0,0,0,0,0,0,0,0,0,0


In [6]:
val_data.head()

Unnamed: 0,id,text,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information
0,0,Sci-News.com does not knowingly collect or sol...,0,0,0,0,0,1,0,0,0,0,0,1
1,1,Submitting an Order When you submit an order w...,1,1,0,0,0,0,0,0,0,0,0,0
2,2,We use cookies to enhance the browsing and sho...,1,0,0,0,0,0,0,0,0,1,0,0
3,3,This privacy statement covers the site new.www...,0,0,0,0,0,0,0,0,0,1,0,0
4,4,Information Collection and Use Information Col...,1,0,0,0,0,0,0,0,0,0,1,0


In [7]:
test_data.head()

Unnamed: 0,id,text,First Party Collection/Use,Third Party Sharing/Collection,User Access Edit and Deletion,Data Retention,Data Security,International and Specific Audiences,Do Not Track,Policy Change,User Choice/Control,Introductory/Generic,Practice not covered,Privacy contact information
0,0,Privacy Policy Sci-News.com is committed to pr...,0,0,0,0,0,0,0,0,0,1,0,0
1,1,"- if you contact us, we may keep a record of t...",0,0,0,1,0,0,0,0,0,0,0,0
2,2,Walmart Privacy Policy Last Updated: March 2015,0,0,0,0,0,0,0,0,0,1,0,0
3,3,"Our founder Sam Walton reminded us that """"a pr...",0,0,0,0,0,0,0,0,0,1,0,0
4,4,We will remove you and your personally identif...,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
class GroqPrivacyClassifier:
    def __init__(self, model_name="llama-guard-3-8b"):  # or "llama2-70b-4096"
        self.model_name = model_name
        self.labels = [
            'First Party Collection/Use',
            'Third Party Sharing/Collection',
            'User Access Edit and Deletion',
            'Data Retention',
            'Data Security',
            'International and Specific Audiences',
            'Do Not Track',
            'Policy Change',
            'User Choice/Control',
            'Introductory/Generic',
            'Practice not covered',
            'Privacy contact information'
        ]
        self.prompt_template = """[INST] Analyze this privacy policy snippet and output ONLY 12 binary values (0/1) corresponding to these categories:
1. First Party Collection/Use
2. Third Party Sharing/Collection
3. User Access Edit and Deletion
4. Data Retention
5. Data Security
6. International and Specific Audiences
7. Do Not Track
8. Policy Change
9. User Choice/Control
10. Introductory/Generic
11. Practice not covered
12. Privacy contact information

Example Output: 0,1,0,0,1,0,0,0,0,0,0,1

Text: "{text}" [/INST]"""
    
    def predict_single(self, text, max_retries=3):
        for _ in range(max_retries):
            try:
                response = client.chat.completions.create(
                    messages=[{
                        "role": "user",
                        "content": self.prompt_template.format(text=text)
                    }],
                    model=self.model_name,
                    temperature=0.01,  
                    max_tokens=50
                )
                pred = response.choices[0].message.content.strip()
                return [int(x) for x in pred.split(',')[:12]]  
            except Exception as e:
                print(f"Error: {e}, retrying...")
                time.sleep(1)
        return [0] * 12 
    
    def predict_batch(self, texts, batch_size=10, delay=0.1):
        results = []
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i+batch_size]
            batch_results = []
            
            for text in batch:
                batch_results.append(self.predict_single(text))
                time.sleep(delay)  
            
            results.extend(batch_results)
        return np.array(results)

In [9]:
classifier = GroqPrivacyClassifier(model_name="llama-guard-3-8b")

In [10]:
import math

def learn_patterns(train_data, sample_size=50, batch_size=10):
    """Let the model learn relationships between text segments and categories using batching"""
    patterns = {label: [] for label in classifier.labels}
    
    # Sample representative positive examples for each category
    for label in tqdm(classifier.labels):
        pos_samples = train_data[train_data[label]==1].sample(
            min(sample_size, sum(train_data[label]==1)))
        
        # Split the positive samples into batches
        num_batches = math.ceil(len(pos_samples) / batch_size)
        
        for batch_idx in range(num_batches):
            batch = pos_samples.iloc[batch_idx * batch_size : (batch_idx + 1) * batch_size]
            
            # Prepare prompts for the batch
            prompts = []
            for _, row in batch.iterrows():
                prompt = f"""Analyze why this privacy policy segment is labeled as [{label}]=1:
                
                Text: "{row['text']}"
                
                Extract 3+ key features (e.g., specific phrases, sentence structures),
                and summarize the decision rules for identifying this category."""
                prompts.append(prompt)
            
            # Make API requests for the entire batch
            responses = client.chat.completions.create(
                model="llama-guard-3-8b",
                messages=[{"role": "user", "content": prompt} for prompt in prompts],
                temperature=0.3
            )
            
            # Process responses for the batch
            for i, response in enumerate(responses.choices):
                patterns[label].append(response.message.content)
    
    # Generate a rulebook
    rulebook = {}
    for label, examples in patterns.items():
        prompt = f"""Based on these analyses, summarize universal rules for identifying [{label}]:
        {examples}
        
        Required Format:
        1. Key Phrases (min 5)
        2. Typical Sentence Structures (min 3)
        3. Common Negation Patterns (if present)"""
        
        response = client.chat.completions.create(
            model="gemma2-9b-it", # gemma2-9b-it llama-3.3-70b-versatile
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        rulebook[label] = response.choices[0].message.content
    
    return rulebook

# Execute pattern learning
rulebook = learn_patterns(train_data)
print("Generated Rulebook Example:\n", rulebook['First Party Collection/Use'])


100%|██████████| 12/12 [05:21<00:00, 26.80s/it]


Generated Rulebook Example:
 ## Universal Rules for Identifying First Party Collection/Use:

Based on the provided analyses indicating all instances as "safe," we can infer the following universal rules for identifying first party collection/use:

**1. Key Phrases (min 5):**

* **"Our website"**
* **"Our app"**
* **"User account"**
* **"Customer data"**
* **"Directly collected"**

**2. Typical Sentence Structures (min 3):**

* **"We collect [data type] directly from users through [method] on our website."**
* **"Your user account information is used to personalize your experience on our platform."**
* **"We may use customer data to send you marketing emails about our products and services."**

**3. Common Negation Patterns (if present):**

* **"We do not share your data with third parties."**
* **"We will not use your information for any purpose other than [specified purpose]."**


**Note:** These rules are based on a limited dataset and may not be exhaustive. It's crucial to consider 

In [11]:
def transform_rulebook(original_rulebook):
    new_rulebook = {}
    
    for section, content in original_rulebook.items():
        new_section = {"description": content.split("\n")[0].replace("## ", "")}
        
        key_phrases = []
        if "**1. Key Phrases (min 5):**" in content:
            start = content.find("**1. Key Phrases (min 5):**") + len("**1. Key Phrases (min 5):**")
            end = content.find("**2. Typical Sentence Structures (min 3):**")
            phrases_part = content[start:end].strip()
            key_phrases = [line.replace("*", "").strip() for line in phrases_part.split("\n") if line.strip()]
        
        new_section["key_phrases"] = key_phrases
        
        sentence_structures = []
        if "**2. Typical Sentence Structures (min 3):**" in content:
            start = content.find("**2. Typical Sentence Structures (min 3):**") + len("**2. Typical Sentence Structures (min 3):**")
            end = content.find("**3. Common Negation Patterns (if present):**") if "**3. Common Negation Patterns (if present):**" in content else content.find("**Note:**")
            structures_part = content[start:end].strip()
            sentence_structures = [line.replace("*", "").strip() for line in structures_part.split("\n") if line.strip()]
        
        new_section["sentence_structures"] = sentence_structures
        
        negation_patterns = []
        if "**3. Common Negation Patterns (if present):**" in content:
            start = content.find("**3. Common Negation Patterns (if present):**") + len("**3. Common Negation Patterns (if present):**")
            end = content.find("**Note:**")
            negations_part = content[start:end].strip()
            negation_patterns = [line.replace("*", "").strip() for line in negations_part.split("\n") if line.strip()]
        
        new_section["negation_patterns"] = negation_patterns
        
        note = ""
        if "**Note:**" in content:
            start = content.find("**Note:**") + len("**Note:**")
            note = content[start:].strip()
        
        new_section["note"] = note
        
        new_rulebook[section] = new_section
    
    return new_rulebook

In [12]:
def save_rulebook(rulebook, output_dir="result", filename="rulebook.json"):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_path = os.path.join(output_dir, filename)
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(rulebook, json_file, indent=4, ensure_ascii=False)
    print(f"Rule book has been saved: {output_path}")

In [13]:
new_rulebook = transform_rulebook(rulebook)
save_rulebook(new_rulebook)

Rule book has been saved: result\rulebook.json
