In [None]:

!pip install spacy transformers nltk


!python -m spacy download en_core_web_sm


import spacy
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


nltk.download('punkt_tab')
nltk.download('stopwords')
try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model 'en_core_web_sm' loaded successfully!")
except OSError as e:
    print(f"Error loading SpaCy model: {e}")
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)


def summarize_text(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
    return summary[0]['summary_text']


def analyze_risk(text):
    doc = nlp(text)
    risk_keywords = ["breach", "penalty", "fine", "risk", "litigation", "obligation", "liability"]
    risks = [sent.text for sent in doc.sents if any(keyword in sent.text.lower() for keyword in risk_keywords)]
    return risks


def process_legal_document(text):
    cleaned_text = preprocess_text(text)
    summary = summarize_text(cleaned_text)
    risks = analyze_risk(text)
    return summary, risks


legal_text = """
This agreement contains clauses about data protection and outlines the penalties for breach of contract.
The liability for failure to deliver services will be limited. Any litigation arising from this contract
must be addressed within the jurisdiction. Additional obligations include compliance with applicable laws
and risk management policies.
"""

summary, risks = process_legal_document(legal_text)

print("Summary of the Document:")
print(summary)
print("\nPotential Risks Identified:")
for risk in risks:
    print(f"- {risk}")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


SpaCy model 'en_core_web_sm' loaded successfully!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


Summary of the Document:
 agreement contains clauses data protection outlines penalties breach contract liability failure deliver services. limited litigation arising contract must addressed within jurisdiction. additional obligations include compliance applicable laws risk management policies. contract must be signed by both parties by December 31, 2015.

Potential Risks Identified:
- This agreement contains clauses about data protection and outlines the penalties for breach of contract.

- The liability for failure to deliver services will be limited.
- Any litigation arising from this contract
must be addressed within the jurisdiction.
- Additional obligations include compliance with applicable laws
and risk management policies.



In [None]:

!pip install spacy transformers nltk textblob

!python -m spacy download en_core_web_sm

import spacy
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import nltk

nltk.download('punkt')
nltk.download('stopwords')

try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model 'en_core_web_sm' loaded successfully!")
except OSError as e:
    print(f"Error loading SpaCy model: {e}")

def preprocess_text(text):
    """Cleans and tokenizes the text."""
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

def summarize_text(text):
    """Summarizes the text using a Pegasus transformer model."""
    summarizer = pipeline("summarization", model="google/pegasus-xsum")
    summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
    return summary[0]['summary_text']

def analyze_risk(text):
    """Identifies potential risks and provides sentiment analysis."""
    doc = nlp(text)
    risk_keywords = ["breach", "penalty", "fine", "risk", "litigation", "obligation", "liability", "compliance"]
    risks = [sent.text for sent in doc.sents if any(keyword in sent.text.lower() for keyword in risk_keywords)]

    risks_with_sentiment = []
    for risk in risks:
        sentiment = TextBlob(risk).sentiment
        risks_with_sentiment.append({
            "text": risk,
            "polarity": sentiment.polarity,
            "subjectivity": sentiment.subjectivity
        })

    return risks_with_sentiment

def extract_keywords(text):
    """Extracts key phrases using SpaCy's noun chunking."""
    doc = nlp(text)
    keywords = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]
    return list(set(keywords))

def process_legal_document(text):
    """Processes the legal document for summarization and risk analysis."""
    cleaned_text = preprocess_text(text)

    summary = summarize_text(cleaned_text)

    risks = analyze_risk(text)

    keywords = extract_keywords(text)

    return summary, risks, keywords

legal_text = """
This agreement contains clauses about data protection and outlines the penalties for breach of contract.
The liability for failure to deliver services will be limited. Any litigation arising from this contract
must be addressed within the jurisdiction. Additional obligations include compliance with applicable laws
and risk management policies. Non-compliance may result in fines or other penalties. Risk mitigation strategies
are advised to ensure adherence to the contractual terms.
"""

summary, risks, keywords = process_legal_document(legal_text)

print("Summary of the Document:")
print(summary)

print("\nPotential Risks Identified (with Sentiment):")
for risk in risks:
    print(f"- {risk['text']}\n  Polarity: {risk['polarity']}, Subjectivity: {risk['subjectivity']}")

print("\nExtracted Keywords:")
print(", ".join(keywords))


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


SpaCy model 'en_core_web_sm' loaded successfully!


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


Summary of the Document:
The BBC News website takes a look at some of the key points of the UK's new Data Protection Act, which came into force on 1 July 2016... and what it means for you.

Potential Risks Identified (with Sentiment):
- This agreement contains clauses about data protection and outlines the penalties for breach of contract.

  Polarity: 0.0, Subjectivity: 0.0
- The liability for failure to deliver services will be limited.
  Polarity: -0.19404761904761908, Subjectivity: 0.22142857142857142
- Any litigation arising from this contract
must be addressed within the jurisdiction.
  Polarity: 0.0, Subjectivity: 0.0
- Additional obligations include compliance with applicable laws
and risk management policies.
  Polarity: 0.0, Subjectivity: 0.0
- Non-compliance may result in fines or other penalties.
  Polarity: -0.125, Subjectivity: 0.375
- Risk mitigation strategies
are advised to ensure adherence to the contractual terms.

  Polarity: 0.0, Subjectivity: 0.0

Extracted Keywor

In [None]:

!pip install spacy transformers nltk textblob PyPDF2


!python -m spacy download en_core_web_sm

import spacy
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import nltk
import PyPDF2

nltk.download('punkt')
nltk.download('stopwords')

try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model 'en_core_web_sm' loaded successfully!")
except OSError as e:
    print(f"Error loading SpaCy model: {e}")

def extract_text_from_pdf(pdf_file_path):
    """Extracts text from a PDF file."""
    text = ""
    with open(pdf_file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

def preprocess_text(text):
    """Cleans and tokenizes the text."""
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

def summarize_text(text):
    """Summarizes the text using a Pegasus transformer model."""
    summarizer = pipeline("summarization", model="google/pegasus-xsum")
    summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
    return summary[0]['summary_text']

def analyze_risk(text):
    """Identifies potential risks and provides sentiment analysis."""
    doc = nlp(text)
    risk_keywords = ["breach", "penalty", "fine", "risk", "litigation", "obligation", "liability", "compliance"]
    risks = [sent.text for sent in doc.sents if any(keyword in sent.text.lower() for keyword in risk_keywords)]

    risks_with_sentiment = []
    for risk in risks:
        sentiment = TextBlob(risk).sentiment
        risks_with_sentiment.append({
            "text": risk,
            "polarity": sentiment.polarity,
            "subjectivity": sentiment.subjectivity
        })

    return risks_with_sentiment
def extract_keywords(text):
    """Extracts key phrases using SpaCy's noun chunking."""
    doc = nlp(text)
    keywords = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]
    return list(set(keywords))

def process_legal_document_from_pdf(pdf_file_path):
    """Processes the legal document from a PDF file."""
    text = extract_text_from_pdf(pdf_file_path)

    cleaned_text = preprocess_text(text)

    summary = summarize_text(cleaned_text)

    risks = analyze_risk(text)

    keywords = extract_keywords(text)

    return summary, risks, keywords

pdf_file_path = "sample_legal_document.pdf"
try:
    summary, risks, keywords = process_legal_document_from_pdf(pdf_file_path)

    # Display results
    print("Summary of the Document:")
    print(summary)

    print("\nPotential Risks Identified (with Sentiment):")
    for risk in risks:
        print(f"- {risk['text']}\n  Polarity: {risk['polarity']}, Subjectivity: {risk['subjectivity']}")

    print("\nExtracted Keywords:")
    print(", ".join(keywords))
except Exception as e:
    print(f"An error occurred: {e}")



Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


SpaCy model 'en_core_web_sm' loaded successfully!
An error occurred: [Errno 2] No such file or directory: 'sample_legal_document.pdf'


In [None]:

!pip install spacy transformers nltk textblob PyPDF2

!python -m spacy download en_core_web_sm

import spacy
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import nltk
import PyPDF2
from ipywidgets import FileUpload
from io import BytesIO

nltk.download('punkt')
nltk.download('stopwords')

try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model 'en_core_web_sm' loaded successfully!")
except OSError as e:
    print(f"Error loading SpaCy model: {e}")

def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    text = ""
    pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_file))
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

def preprocess_text(text):
    """Cleans and tokenizes the text."""
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

def summarize_text(text, chunk_size=512):
    """Summarizes the text using a Pegasus transformer model with chunking."""
    summarizer = pipeline("summarization", model="google/pegasus-xsum")

    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    summaries = []

    for chunk in chunks:
        summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    full_summary = " ".join(summaries)
    return full_summary

def analyze_risk(text):
    """Identifies potential risks and provides sentiment analysis."""
    doc = nlp(text)
    risk_keywords = ["breach", "penalty", "fine", "risk", "litigation", "obligation", "liability", "compliance"]
    risks = [sent.text for sent in doc.sents if any(keyword in sent.text.lower() for keyword in risk_keywords)]

    risks_with_sentiment = []
    for risk in risks:
        sentiment = TextBlob(risk).sentiment
        risks_with_sentiment.append({
            "text": risk,
            "polarity": sentiment.polarity,
            "subjectivity": sentiment.subjectivity
        })

    return risks_with_sentiment

def extract_keywords(text):
    """Extracts key phrases using SpaCy's noun chunking."""
    doc = nlp(text)
    keywords = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]
    return list(set(keywords))


def process_legal_document_from_pdf(pdf_file):
    """Processes the legal document from a PDF file."""

    text = extract_text_from_pdf(pdf_file)

    cleaned_text = preprocess_text(text)

    summary = summarize_text(cleaned_text)

    risks = analyze_risk(text)

    keywords = extract_keywords(text)

    return summary, risks, keywords

def upload_pdf():
    """Allows the user to upload a PDF file for processing."""
    uploader = FileUpload(accept=".pdf", multiple=False)
    display(uploader)


    uploader.observe(lambda change: process_uploaded_pdf(change, uploader), names='value')

def process_uploaded_pdf(change, uploader):
    """Processes the uploaded PDF and displays the results."""
    if uploader.value:

        uploaded_file = list(uploader.value.values())[0]
        pdf_file = uploaded_file['content']


        summary, risks, keywords = process_legal_document_from_pdf(pdf_file)

        print("Summary of the Document:")
        print(summary)

        print("\nPotential Risks Identified (with Sentiment):")
        for risk in risks:
            print(f"- {risk['text']}\n  Polarity: {risk['polarity']}, Subjectivity: {risk['subjectivity']}")

        print("\nExtracted Keywords:")
        print(", ".join(keywords))


upload_pdf()


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


SpaCy model 'en_core_web_sm' loaded successfully!


FileUpload(value={}, accept='.pdf', description='Upload')

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 150, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is 

Summary of the Document:
A daily guide to the key stories, newspaper headlines and quotes from the week in politics, business, sport and the arts from BBC News, BBC World News, BBC Radio 4 and BBC Radio 5 live. BBC News takes a look at some of the key stories from the past 24 hours in the world of politics and public life, with a focus on the UK's vote to leave the European Union. All terms and conditions of use are subject to change at any time by the BBC and/or its partners, unless otherwise agreed by the BBC and/or its partners, in which case all terms and conditions of use will cease to apply. A look at some of the key terms and phrases used on Google's social media sites, as well as some of the key features of the company's core services, such as search and Maps. Anti-spam software developed by Spamhaus, a Spamhaus company, is used by millions of people around the world to prevent spam emails from getting into their spam folders and into their email accounts. BBC Sport takes a loo