In [41]:
!pip install transformers torch pypdf




In [42]:
from google.colab import files

# Ask for a file upload and keep the result in `uploaded`; a later cell reads
# its keys to find the PDF to process.
uploaded = files.upload()


Saving google_privacy_policy_en.pdf to google_privacy_policy_en (2).pdf


In [43]:
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The file is opened with ``PdfReader``; pages are visited in order and the
    text extracted from each page is followed by a newline.
    """
    pages = PdfReader(pdf_path).pages
    return "".join(page.extract_text() + "\n" for page in pages)

# If nothing was uploaded, `uploaded` has no keys and taking the first one
# would fail with an unhelpful IndexError; stop with a clear message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a PDF.")

# Process the first uploaded file (any additional uploads are ignored).
pdf_path = next(iter(uploaded))
policy_text = extract_text_from_pdf(pdf_path)

print(policy_text[:1000])  # preview

# This cell defines a function to extract text from the uploaded PDF file using `pypdf`.
# It then calls this function with the uploaded PDF and prints the first 1000 characters of the extracted text as a preview.



GOOGLE PRIVACY POLICY
When you use our services, you’re trusting us
with your information. We understand this is a big
responsibility and work hard to protect your
information and put you in control.
This Privacy Policy is meant to help you understand what information we collect, why we
collect it, and how you can update, manage, export, and delete your information.
Privacy Checkup
Looking to change your privacy settings?
Take the Privacy Checkup
Effective July 1, 2025 | Archived versions
We build a range of services that help millions of people daily to explore and interact with
the world in new ways. Our services include:
Google apps, sites, and devices, like Search, YouTube, and Google Home
Platforms like the Chrome browser and Android operating system
Products that are integrated into third-party apps and sites, like ads, analytics, and
embedded Google Maps
You can use our services in a variety of ways to manage your privacy. For example, you
can sign up for a Google Account if you

In [44]:
import re

def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Newlines, tabs and repeated spaces are all replaced, so the result is a
    single line of space-separated words.
    """
    # A separate pass that squeezes repeated newlines is unnecessary: "\n" is
    # matched by \s, so this single substitution yields the same string.
    return re.sub(r'\s+', ' ', text).strip()

# Overwrite `policy_text` with its cleaned form; the number printed is the
# character count of the cleaned text.
policy_text = clean_text(policy_text)
print(len(policy_text))

# This cell defines a `clean_text` function using regular expressions to remove extra newlines and spaces from the extracted PDF text.
# It then applies this cleaning to `policy_text` and prints the length of the cleaned text.


52639


In [45]:
from transformers import pipeline

# Checkpoint loaded by the summarization pipeline below.
model_name = "sshleifer/distilbart-cnn-12-6"  # much faster, same task

# Build the pipeline once; later cells call `summarizer` on each text chunk.
summarizer = pipeline(
    "summarization",
    model=model_name,
    device=-1  # -1 selects the CPU
)

# This cell initializes a summarization pipeline from the transformers library, loading the sshleifer/distilbart-cnn-12-6 model.
# It also explicitly sets the device to CPU (device=-1) to avoid potential GPU memory issues.


Device set to use cpu


In [46]:
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; each piece is
    re-joined with single spaces. The final piece may hold fewer words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    return [" ".join(tokens[start:start + chunk_size]) for start in starts]

# Split the cleaned policy text using the default chunk size and report how
# many chunks were produced.
chunks = chunk_text(policy_text)
print("Total chunks:", len(chunks))

# This cell defines a chunk_text function to split the long policy_text into smaller chunks of 500 words each.
# This is necessary because summarization models have a maximum input length. It then prints the total number of chunks created.

Total chunks: 18


In [47]:
def adaptive_lengths(text):
    """Derive summary length bounds from the word count of ``text``.

    Returns:
        A ``(max_len, min_len)`` tuple. ``max_len`` is half the word count
        clamped to the range 20-60; ``min_len`` is half of ``max_len`` but
        never less than 10.
    """
    upper = len(text.split()) // 2

    # Clamp the upper bound into [20, 60].
    if upper > 60:
        upper = 60
    if upper < 20:
        upper = 20

    # The lower bound tracks the upper bound, with a floor of 10.
    lower = upper // 2
    if lower < 10:
        lower = 10

    return upper, lower

    # This cell defines a helper function `adaptive_lengths` that calculates dynamic `max_length` and `min_length` for the summarizer based on the word count of each text chunk.
    # This helps generate more relevant summaries for varying chunk sizes.


In [48]:
# One summary string per sufficiently long chunk, in chunk order.
summaries = []

for chunk in chunks:
    # Ignore chunks whose stripped text is shorter than 50 characters.
    if len(chunk.strip()) < 50:
        continue

    # Length bounds are derived from this chunk's word count.
    max_len, min_len = adaptive_lengths(chunk)

    out = summarizer(
        chunk,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True
    )

    # Keep only the generated text of the first result.
    summaries.append(out[0]["summary_text"])

    # This cell iterates through each `chunk` of the `policy_text`. For each chunk, it uses the `summarizer` pipeline to generate a summary with adaptive `max_length` and `min_length` calculated by the `adaptive_lengths` function.
    # These individual chunk summaries are then collected into the `summaries` list.


In [49]:
import re

# NOTE(review): `final_summary` is assigned in a later cell (In [51]). On a
# fresh kernel run top to bottom this cell raises NameError; on a reused
# kernel it prints whatever value a previous run left behind. Run the cell
# that builds `final_summary` before this one.
# Split after '.', '!' or '?' when followed by whitespace.
sentences = re.split(r'(?<=[.!?])\s+', final_summary)

print("==== Privacy Policy TL;DR ====\n")
# Print the sentences three at a time, with a blank line after each group.
for i in range(0, len(sentences), 3):
    print(" ".join(sentences[i:i+3]))
    print()
# This cell takes the final_summary, splits it into sentences, and then prints it out in a more readable 'TL;DR' format.


==== Privacy Policy TL;DR ====

The Universal Declaration of Human Rights states that people have rights to both privacy and to safety . But discourse around privacy, speech, and safety can sometimes pit these values against each other . For people who use Facebook, Instagram, WhatsApp and Messenger, individual privacy protections must coexist alongside the voice Meta’s Privacy Review offers a process to analyze privacy alongside other safety, security, and integrity concerns .

The regulatory environment for privacy, free speech, and safety is shifting . We hope this format creates an open dialogue to discuss what people want out of new and existing Meta services . Meta is committed to reducing bad experiences on our services .

The expansion of digital spaces in which we increasingly interact have created new opportunities for bad actors to exploit peoples’ safety, security, and well-being online . For Meta and others that want to decrease these negative experiences online, we The re

In [50]:
def word_count(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

def adaptive_summary_limit(input_text):
    """Return the word budget for a summary of ``input_text``.

    The budget is a share of the input's word count (25% below 500 words,
    18% below 2000 words, 10% otherwise), truncated to an integer and then
    clamped to the range 120-600.
    """
    total = word_count(input_text)

    # Share of the input that the summary may keep, by input size.
    if total < 500:
        ratio = 0.25
    elif total < 2000:
        ratio = 0.18
    else:
        ratio = 0.10

    budget = int(total * ratio)

    # safety bounds
    return max(120, min(budget, 600))
# This cell defines `word_count` and `adaptive_summary_limit` functions. `adaptive_summary_limit` determines an appropriate word limit for the overall final summary based on the length of the original `policy_text`.


In [51]:
# Word budget for the combined summary, derived from the policy text.
max_words = adaptive_summary_limit(policy_text)

# Join the per-chunk summaries and keep only the first `max_words` words.
# The cut is made by word count, so the text can end in the middle of a sentence.
final_summary = " ".join(" ".join(summaries).split()[:max_words])

# This cell first calculates the `max_words` for the final summary using `adaptive_summary_limit`. It then concatenates all individual chunk summaries into a single `final_summary` string and truncates it to the calculated `max_words`.


In [52]:
import re

# Terms that mark a sentence as relevant to data collection and use.
# Sentences are matched by case-insensitive substring search.
KEYWORDS = [
    "collect", "data", "information", "share", "third party",
    "location", "device", "cookies", "retain", "store",
    "use", "advertising", "analytics"
]

def adaptive_bullet_count(text):
    """Return how many bullets to show: one per 50 words, between 8 and 20."""
    words = len(text.split())
    return min(20, max(8, words // 50))

def _mentions_keyword(sentence):
    """Return True when ``sentence`` contains any entry of ``KEYWORDS``."""
    # Hyphens are treated as spaces so that "third-party" is recognised by
    # the "third party" keyword.
    normalized = sentence.lower().replace("-", " ")
    return any(keyword in normalized for keyword in KEYWORDS)

def to_bullets(text):
    """Turn ``text`` into a newline-separated list of "- sentence" bullets.

    The text is split into sentences after '.', '!' or '?'. Sentences that
    mention a keyword are preferred; if none do, the leading sentences are
    used instead. At most ``adaptive_bullet_count(text)`` bullets are
    returned.

    Returns:
        The bullet list as one string, or an empty string when ``text``
        contains no sentences.
    """
    # Drop empty pieces so blank input does not produce a bare "- " bullet.
    sentences = [
        piece.strip()
        for piece in re.split(r'(?<=[.!?])\s+', text)
        if piece.strip()
    ]

    # Prefer sentences with privacy keywords
    important = [s for s in sentences if _mentions_keyword(s)]

    max_points = adaptive_bullet_count(text)

    selected = important[:max_points] if important else sentences[:max_points]

    return "\n".join(f"- {s}" for s in selected)

# This cell defines `KEYWORDS` relevant to data privacy and two functions: `adaptive_bullet_count` (to decide how many bullet points to generate) and `to_bullets`
# (to convert a given text into a bulleted list, prioritizing sentences containing the defined keywords).

In [53]:
# Turn the combined summary into bullet points and print them under a heading.
data_collection_bullets = to_bullets(final_summary)
print("\n==== WHAT DATA DO THEY COLLECT? ====\n")
print(data_collection_bullets)

# This cell uses the `to_bullets` function on the `final_summary` to generate a bulleted list of key points related to data collection. It then prints this bulleted list under a clear heading.



==== WHAT DATA DO THEY COLLECT? ====

- This Privacy Policy is meant to help you understand what information we collect, why we collect it, and how you can update, manage, export, and delete your information .
- We collect information to provide better services to all our users .
- The information Google collects depends on how you use our services We collect information about the apps, browsers, and devices you use to access Google services .
- We also collect the content you create, upload, or receive from others when using our services .
- This includes things like email you write and receive, photos and videos you save, docs and spreadsheets Location data we collect depends in part on device and account settings .
- Location data includes GPS and other sensor data from your device IP address .
- Google also collects information about you from publicly accessible sources .
- We use data to build better services .
- You can control what information we use to show you ads by visiting