<a href="https://colab.research.google.com/github/RohithJ11/NLP_Privacy_Policies/blob/main/BERTmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformers datasets torch



In [None]:
%pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.30.1


In [None]:
# Quote the requirement so the shell cannot treat "[torch]" as a glob pattern.
%pip install -U "transformers[torch]"



In [None]:
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
from datasets import load_dataset

# Fetch the privacy-policy summarization corpus from the Hugging Face Hub.
dataset = load_dataset(path="RohithJ/cleaned_dataset_prvy_plcy")

# WordPiece tokenizer shared by the encoder and the decoder side of the model.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

# Function to tokenize the data
def preprocess_function(examples):
    """Tokenize a batch of documents and summaries for encoder-decoder training.

    Args:
        examples: Batch mapping containing the columns 'Content' (source
            texts) and 'Summary_of_Content' (target summaries).

    Returns:
        dict with four parallel lists:
          * 'input_ids' / 'attention_mask': encoder inputs, padded or
            truncated to 512 tokens.
          * 'decoder_input_ids': summary token ids, padded or truncated to
            128 tokens (unchanged tokenizer output).
          * 'labels': next-token targets, i.e. the summary ids shifted left by
            one position, with every padding position replaced by -100.
    """
    inputs = tokenizer(examples['Content'], padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(examples['Summary_of_Content'], padding="max_length", truncation=True, max_length=128)

    # EncoderDecoderModel (transformers >= 4.12) compares logits[i] with
    # labels[i] directly, without shifting. Feeding the same sequence as both
    # decoder input and label therefore teaches the decoder to copy the token
    # it is currently looking at. Shift the targets so position i predicts
    # token i + 1, and use -100 (the loss ignore index) on padding so padded
    # positions do not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    labels = []
    for ids in outputs['input_ids']:
        shifted = list(ids[1:]) + [pad_id]
        labels.append([tok if tok != pad_id else -100 for tok in shifted])

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'decoder_input_ids': outputs['input_ids'],
        'labels': labels,
    }

# Apply preprocessing to every split; batched=True hands preprocess_function
# whole column batches instead of single rows.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Initialize BERT-to-BERT model for sequence-to-sequence summarization.
# Both the encoder and the decoder start from the same bert-base-uncased checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

# BERT has no dedicated BOS/EOS tokens, so reuse [CLS] as the decoder start
# token and [SEP] as the end-of-sequence token.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# Mirror the decoder's vocabulary size on the top-level config.
model.config.vocab_size = model.config.decoder.vocab_size

# Sanity check of the special-token wiring.
print(f"decoder_start_token_id: {model.config.decoder_start_token_id}")
print(f"pad_token_id: {model.config.pad_token_id}")
print(f"bos_token_id: {model.config.bos_token_id}")
print(f"eos_token_id: {model.config.eos_token_id}")


# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    # Log once per epoch as well: with the default step-based logging interval
    # a short run never reaches the first logging step, and the results table
    # shows "No log" instead of a training loss.
    logging_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

# Start training
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded later.
model_save_path = './bert_summarization_model3'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

decoder_start_token_id: 101
pad_token_id: 0
bos_token_id: None
eos_token_id: 102




Epoch,Training Loss,Validation Loss
1,No log,0.60986
2,No log,0.319807
3,No log,0.223393


('./bert_summarization_model3/tokenizer_config.json',
 './bert_summarization_model3/special_tokens_map.json',
 './bert_summarization_model3/vocab.txt',
 './bert_summarization_model3/added_tokens.json')

In [None]:
%pip install transformers torch




In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import re
import torch

# Function to load the model and tokenizer
def load_model(model_path):
    """Load a BART summarization model and its tokenizer.

    Args:
        model_path: Identifier or directory passed unchanged to both
            ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        BartForConditionalGeneration.from_pretrained(model_path),
        BartTokenizer.from_pretrained(model_path),
    )

# Function to clean the input text
def clean_text(text):
    """Normalize raw text before it is handed to the tokenizer.

    Removes every character that is neither a word character nor whitespace,
    collapses each run of whitespace into a single space, and trims leading
    and trailing whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # NOTE(review): this drops all punctuation, including sentence-ending
    # periods, apostrophes and hyphens ("self-subscribed" -> "selfsubscribed").
    # Confirm that this is what the downstream summarizer should receive.
    #
    # Strip symbols first: removing a symbol that sits between two spaces
    # ("a - b") after whitespace was already collapsed would leave a double
    # space behind.
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Reduce whitespace
    return text.strip()

# Function to generate the summary
def summarize(text, model, tokenizer, max_input_length=1024, max_length=150, num_beams=5):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: Text to summarize.
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_length: Upper bound passed to ``model.generate`` for the summary.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # padding=True pads only up to the longest sequence in the batch. The
    # previous padding="max_length" inflated every input to the full window,
    # so a short text still paid for a full-length encoder pass.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding=True,
    )
    # Keep the tensors on the same device as the model weights.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to summarize the given text
def summarize_text(model_path, text):
    """Load a model, clean ``text`` and return its generated summary.

    Args:
        model_path: Forwarded to ``load_model``.
        text: Raw text; it is passed through ``clean_text`` before summarizing.

    Returns:
        Whatever ``summarize`` returns for the cleaned text.
    """
    bart, bart_tokenizer = load_model(model_path)
    return summarize(clean_text(text), bart, bart_tokenizer)

# Example Usage
# End-to-end demo: load the checkpoint named by model_path, clean the sample
# privacy notice below, and print the summary the model generates for it.
model_path = 'facebook/bart-large-cnn'  # Use a pretrained BART model or your fine-tuned BART model path
# Sample input: a privacy notice describing an email subscription service.
text = """The Department’s Bureau of Global Public Affairs (GPA) uses the GovDelivery service to deliver email bulletin messages to self-subscribed users.  GPA’s Office of Global Web Platforms serves as the executive agent for the Department’s GovDelivery Service and controls who at the Department has access to send email bulletins, create or delete topics.  GovDelivery is a web-based e-mail subscription management system that allows a member of the public (user) to subscribe to news and information on www.state.gov. The GovDelivery user selects specific topics that interest them. Whenever information on that topic is made available by the Department, the user that has subscribed to that topic receives an email.  The user’s subscription profile consists of their email address and the topics they wish to receive email updates for.  The user may customize and manage their subscription profile in order to receive exactly the types of information they desire, and they may cancel their subscriptions at any time.  Users engaging the Department’s GovDelivery system expect privacy protections while interacting with the Department. We will only use the email addresses provided by the users to send email messages related to the topics selected by the user in the GovDelivery system. We will not use the GovDelivery service to: 1) send email messages not related to the topics selected by the user; 2) actively seek personally identifiable information; and 3) search for or by personally identifiable information without a waiver from our Privacy Office.  To the extent a user posts or sends personally identifiable information to the Department’s GovDelivery system, we will use the minimum amount necessary to accomplish a purpose authorized by statute, executive order, or regulation.  Neither the Department nor GovDelivery may share a user’s subscription profile (including email address) without a waiver from the Privacy Office."""
# summarize_text reloads the model on every call; fine for a one-off demo.
summary = summarize_text(model_path, text)
print("Generated Summary:", summary)




config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generated Summary: The Departments Bureau of Global Public Affairs uses the GovDelivery service to deliver email bulletin messages to selfsubscribed users GPAs Office of Global Web Platforms serves as the executive agent for the Departments GovDelivery Service and controls who at the Department has access to send email bulletins create or delete topics GovDelivery is a webbased email subscription management system that allows a member of the public user to subscribe to news and information.


# ProphetNet