In [2]:
import os
import json
import re

from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from openai import OpenAI

Classify privacy policy documents based on 12 categories.

In [3]:
# Configure API access.
# Secrets are read from the environment (export them in the shell, or load them
# from a git-ignored .env file, before starting Jupyter) so that no credential
# is ever stored in the notebook itself.
# NOTE: any key that was previously committed in this cell must be treated as
# compromised and rotated with the provider.
os.environ["LANGSMITH_TRACING"] = "true"

REQUIRED_API_KEYS = ("LANGSMITH_API_KEY", "OPENAI_API_KEY", "SERPAPI_API_KEY")
missing_api_keys = [name for name in REQUIRED_API_KEYS if not os.environ.get(name)]
if missing_api_keys:
    # Fail fast with the exact variable names instead of failing later inside an API call.
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_api_keys)
    )

In [None]:
# Category labels exported from the paper's prompts, one entry per category.
# NOTE(review): some labels here are spelled differently from the category names
# used in the classification prompt (e.g. "User Access Edit and Deletion" vs.
# "User Access/Edit/Deletion") - confirm which spelling downstream code expects.
prompt_cols = [
    "First Party Collection/Use",
    "Third Party Sharing/Collection",
    "User Access Edit and Deletion",
    "Data Retention",
    "Data Security",
    "International and Specific Audiences",
    "Do Not Track",
    "Policy Change",
    "User Choice/Control",
    "Introductory/Generic",
    "Practice not covered",
    "Privacy contact information",
]

Test whether the OpenAI API is available

In [None]:
# Smoke test via LangChain: send a trivial greeting and display the reply text.
model = init_chat_model("gpt-4o-mini", model_provider="openai")
greeting = HumanMessage(content="hi!")
response = model.invoke([greeting])
response.content

'Hello! How can I assist you today?'

In [None]:
# Smoke test via the OpenAI SDK directly: request a single short completion
# and print the text of the first choice.
client = OpenAI()

story_request = {
    "role": "user",
    "content": "Write a one-sentence bedtime story about a unicorn.",
}
completion = client.chat.completions.create(model="gpt-4o", messages=[story_request])

print(completion.choices[0].message.content)

Under the twinkling starlit sky, Luna the unicorn gently trotted through the shimmering forest, her mane sparkling like a thousand tiny galaxies, as she silently wished sweet dreams upon every sleeping creature she passed.


Load privacy policy files

In [4]:
# Load privacy policy texts into a dictionary keyed by file name
def load_text_files(folder_path):
    """Read every privacy-policy ``.txt`` file in ``folder_path``.

    Files whose names end in ``_real_url.txt`` are skipped, as are entries
    that are not regular files (for example a directory named ``x.txt``).

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping file name -> file content (decoded as UTF-8), with keys
        inserted in sorted file-name order.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    file_dict = {}
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # dict (and therefore the processing order) is reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt") or filename.endswith("_real_url.txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory can also match the suffix test; opening it would raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            file_dict[filename] = file.read()
    return file_dict

In [5]:
# Preview: load the policy texts from the data folder and display the result.
folder_path = "companies_privacy_policy/"
load_text_files(folder_path=folder_path)

{'privacy_policy_appletext.txt': '\n\n  * [Apple](/)\n  *     * [Store](/us/shop/goto/store)\n    * ## Shop\n\n    * [Shop the Latest](/us/shop/goto/store)\n    * [Mac](/us/shop/goto/buy_mac)\n    * [iPad](/us/shop/goto/buy_ipad)\n    * [iPhone](/us/shop/goto/buy_iphone)\n    * [Apple Watch](/us/shop/goto/buy_watch)\n    * [Apple Vision Pro](/us/shop/goto/buy_vision)\n    * [Accessories](/us/shop/goto/buy_accessories)\n\n## Quick Links\n\n    * [Find a Store](/retail/)\n    * [Order Status](/us/shop/goto/order/list)\n    * [Apple Trade In](/us/shop/goto/trade_in)\n    * [Financing](/us/shop/goto/payment_plan)\n    * [Personal Setup](/us/shop/goto/personal_setup)\n\n## Shop Special Stores\n\n    * [Certified Refurbished](/us/shop/goto/special_deals)\n    * [Education](/us/shop/goto/educationrouting)\n    * [Business](/retail/business/)\n    * [Veterans and Military](/us/shop/goto/eppstore/veteransandmilitary)\n    * [Government](/r/store/government/)\n\n    * [Mac](/mac/)\n    * ## Expl

Clean the generated JSON output (strip Markdown code fences)

In [6]:
def clean_json_string(json_str):
    """Strip Markdown code fences from a model response so it can be JSON-parsed.

    If the text contains a complete fenced block (```` ``` ```` or
    ```` ```json ````), only the content of the first such block is returned,
    so explanatory prose before or after the fence is dropped. If there is no
    complete fenced block (no fence at all, or an unterminated one), every
    fence marker is removed and the remaining text is returned.

    Args:
        json_str: Raw text returned by the model.

    Returns:
        The candidate JSON text with surrounding whitespace removed.
    """
    fenced_block = re.search(
        r"```(?:json)?\s*(.*?)```", json_str, flags=re.DOTALL | re.IGNORECASE
    )
    if fenced_block:
        return fenced_block.group(1).strip()

    # No complete fence: fall back to removing any stray fence markers.
    json_str = re.sub(r"```json\s*", "", json_str)
    json_str = re.sub(r"```", "", json_str)
    return json_str.strip()

Process privacy policy

In [12]:
def _build_classification_prompt(privacy_text):
    """Return the 12-category classification prompt with ``privacy_text`` embedded."""
    return f"""
Below is a privacy policy text:

{privacy_text}

Please classify this privacy policy based on the following 12 categories. Multiple categories may apply:

1. **First Party Collection/Use** - how and why the information is collected.
2. **Third Party Sharing/Collection** - how the information may be used or collected by third parties.
3. **User Access/Edit/Deletion** - if users can modify their information and how.
4. **Data Retention** - how long the information is stored.
5. **Data Security** - how is users’ data secured.
6. **International/Specific Audiences** - practices that target a specific group of users (e.g., children, Europeans, etc.)
7. **Do Not Track** - if and how Do Not Track signals is honored.
8. **Policy Change** - if the service provider will change their policy and how the users are informed.
9. **User Choice/Control** - choices and controls available to to users.
10. **Introductory/Generic** - Does it contain general or introductory information about the privacy policy?
11. **Practice not covered** - Does it mention any privacy practices not covered by the above categories?
12. **Privacy contact information** - Does it provide contact information for users to inquire about privacy-related issues?

### **Instructions**
- Extract relevant sections from the policy and classify them under the appropriate categories.
- If a section belongs to multiple categories, list all applicable categories.
- Return the output as a JSON object, where each category contains a list of text excerpts from the policy.

### **Example Output Format**
{{
    "First Party Collection/Use": ["We collect user data to improve our services..."],
    "Third Party Sharing/Collection": ["We may share your data with third-party advertisers..."],
    "User Choice/Control": ["Users can opt out of data collection by adjusting their settings..."],
    "User Access/Edit/Deletion": ["You can update or delete your information by contacting support..."],
    "Data Retention": ["We retain user data for up to 2 years..."],
    "Data Security": ["We implement encryption and access control to protect data..."],
    "Policy Change": ["We may update this policy, and users will be notified via email..."],
    "Do Not Track": ["We respect Do Not Track signals in supported browsers..."],
    "International/Specific Audiences": ["Our service complies with GDPR for European users..."]
}}
"""


def _parse_classification(text_name, raw_response):
    """Parse the model's reply into JSON, or return an error record that keeps the raw text."""
    try:
        return json.loads(clean_json_string(raw_response))
    except json.JSONDecodeError:
        print(f"Failed to parse JSON for {text_name}, raw response: {raw_response}")
        # Keep the raw reply so a paid-for response is not lost on a parse failure.
        return {"error": "Invalid JSON response from API", "raw_response": raw_response}


def _save_classification(text_name, result_json, output_dir):
    """Write one classification to ``<output_dir>/<stem>_category_result.json``; return the path."""
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(text_name)[0]
    output_filename = os.path.join(output_dir, f"{base_name}_category_result.json")
    with open(output_filename, "w", encoding="utf-8") as json_file:
        json.dump(result_json, json_file, indent=4, ensure_ascii=False)
    return output_filename


def process_privacy_policy(privacy_text_dict, model="gpt-4o-mini", output_dir="result", client=None):
    """Classify each privacy policy into the 12 categories and save the results.

    For every entry, one chat-completion request is sent, the reply is parsed
    as JSON, and the parsed result is written to its own file in
    ``output_dir``. Progress is printed as each policy is processed.

    Args:
        privacy_text_dict: Mapping of file name -> privacy policy text.
        model: Chat model name passed to the completions API.
        output_dir: Directory for the per-policy JSON files (created if needed).
        client: Optional pre-built client exposing ``chat.completions.create``;
            a new ``OpenAI()`` client is created when omitted.

    Returns:
        dict mapping file name -> parsed classification. When the reply is not
        valid JSON the value is ``{"error": ..., "raw_response": ...}``.

    Raises:
        Errors raised by the API client are not caught and propagate to the
        caller; results for policies processed before the failure are already
        saved on disk.
    """
    if client is None:
        client = OpenAI()

    results = {}
    for text_name, privacy_text in privacy_text_dict.items():
        # classify the privacy policy text
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": _build_classification_prompt(privacy_text),
                }
            ],
        )
        # The message content may be None (e.g. a reply without text); treat it
        # as an empty reply so it is reported as invalid JSON instead of crashing.
        result = completion.choices[0].message.content or ""
        print(f"Classification for {text_name}: {result}")

        # parse the JSON result
        result_json = _parse_classification(text_name, result)
        print(f"Classification for {text_name}: {json.dumps(result_json, indent=4)}")
        results[text_name] = result_json

        # save the results to a JSON file
        output_filename = _save_classification(text_name, result_json, output_dir)
        print(f"Results saved to {output_filename}")

    return results

Run the functions to process the companies' privacy policies

In [13]:
# End-to-end run: load every policy text from the data folder, classify each
# one, and display the returned mapping of file name -> classification.
folder_path = "companies_privacy_policy/"
privacy_text_dict = load_text_files(folder_path=folder_path)
process_privacy_policy(privacy_text_dict=privacy_text_dict)

Classification for privacy_policy_appletext.txt: ```json
{
    "First Party Collection/Use": [
        "Apple uses personal data to power our services, to process your transactions, to communicate with you, for security and fraud prevention, and to comply with law.",
        "Apple strives to collect only the personal data that we need."
    ],
    "Third Party Sharing/Collection": [
        "Apple may share personal data with Apple-affiliated companies, service providers who act on our behalf, our partners, developers, and publishers, or others at your direction.",
        "Apple does not share personal data with third parties for their own marketing purposes."
    ],
    "User Access/Edit/Deletion": [
        "At Apple, we respect your ability to know, access, correct, transfer, restrict the processing of, and delete your personal data.",
        "To exercise your privacy rights and choices, including where a third-party service provider is acting on Apple’s behalf, visit the Apple D

{'privacy_policy_appletext.txt': {'First Party Collection/Use': ['Apple uses personal data to power our services, to process your transactions, to communicate with you, for security and fraud prevention, and to comply with law.',
   'Apple strives to collect only the personal data that we need.'],
  'Third Party Sharing/Collection': ['Apple may share personal data with Apple-affiliated companies, service providers who act on our behalf, our partners, developers, and publishers, or others at your direction.',
   'Apple does not share personal data with third parties for their own marketing purposes.'],
  'User Access/Edit/Deletion': ['At Apple, we respect your ability to know, access, correct, transfer, restrict the processing of, and delete your personal data.',
   'To exercise your privacy rights and choices, including where a third-party service provider is acting on Apple’s behalf, visit the Apple Data and Privacy page.'],
  'Data Retention': ['Apple retains personal data only for s