In [1]:
!pip install transformers sumy spaCy nltk PyPDF2 pdfminer.six torch spacy-llm PyMuPDF sentence_transformers rouge # Alternative PDF text extraction library
!python -m spacy download en_core_web_sm  # or a larger model if needed


Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting spacy-llm
  Downloading spacy_llm-0.7.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-

In [2]:
# Mount Google Drive at /content/drive; the PDF paths used in the cells below
# point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# For text cleaning
import spacy  # SpaCy for NER and Relation Extraction
from transformers import T5ForConditionalGeneration, T5Tokenizer
import re
import fitz  # PyMuPDF

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page joined with no separator
        added; an empty string when the document has no pages.
    """
    # The context manager closes the document even if extracting a page
    # raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Preprocess the text
def preprocess_text(text):
    """Normalise extracted text before it is split into sentences.

    Removes every character that is not a word character, whitespace or one
    of ``. , ; : ! ? -``, then collapses each run of whitespace to a single
    space and trims both ends.

    Args:
        text: Raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove unwanted characters first. Doing it after the whitespace pass
    # (as before) left double spaces, or a leading/trailing space, wherever a
    # removed character had been surrounded by spaces.
    text = re.sub(r'[^\w\s.,;:!?-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Example usage
# Contract PDF located under the /content/drive mount.
pdf_path = "/content/drive/MyDrive/SPC_Assist/Contracts/New_contract.pdf"
# `text` holds the raw page text; `cleaned_text` is the normalised single
# string that the following cells read.
text = extract_text_from_pdf(pdf_path)
cleaned_text = preprocess_text(text)


In [None]:
# Print the whole cleaned text (the string returned by preprocess_text).
print(cleaned_text)

MASTER SUBCONTRACT AGREEMENT Page 1 of PMSC07_110 Rev. Aug 2017 15 A. B. C. D. E. F. 1. 2. 3. 4. 5. 1. 1. 2. That General Contractor and Subcontractor enter into this Agreement to facilitate future projects in which General Contractor may choose to retain Subcontractor. The scope of work and other project specific terms and conditions shall be set forth in a work order Work Order for each specific project. The Subcontract Work or Work shall include all labor, equipment, materials and services to be performed by Subcontractor, and all other obligations of Subcontractor required by the Subcontract Documents, as defined below. That General Contractor may retain Subcontractor from time to time on various constructions projects, as determined by General Contractor in its sole discretion pursuant to Work Orders to be executed in connection with each such project. That this Agreement shall apply to all future Work to be provided by Subcontractor on any project pursuant to such Work Orders, pr

In [None]:
from transformers import pipeline

# Load Hugging Face NER model.
# `aggregation_strategy="simple"` groups word-piece tokens into whole entities
# (each result then carries an `entity_group` key); it replaces the deprecated
# `grouped_entities=True` flag, which maps to the same strategy.
hf_ner_model = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

def perform_ner_with_hf_model(paragraph, model):
    """Apply an NER model to one piece of text.

    Args:
        paragraph: Input handed to ``model`` unchanged.
        model: Callable invoked with ``paragraph`` as its only argument.

    Returns:
        Whatever ``model(paragraph)`` returns.
    """
    entities = model(paragraph)
    return entities

# Process each paragraph with the NER model
def ners_from_paragraph(cleaned_text):
    """Run the NER model over text, one chunk at a time.

    Args:
        cleaned_text: Either a single string, which is first split into
            sentence-sized chunks at whitespace following ``.``, ``!`` or
            ``?``, or an iterable of strings, each processed as given.

    Returns:
        list: One result of ``perform_ner_with_hf_model`` per chunk, in input
        order. For a string input, blank chunks are skipped, so an empty
        string yields an empty list.
    """
    import re  # local import: keeps the function independent of other cells' imports

    if isinstance(cleaned_text, str):
        # Iterating a str yields single characters, which made the original
        # loop call the model once per character. Split into sentence-sized
        # chunks instead.
        paragraphs = [
            chunk
            for chunk in re.split(r'(?<=[.!?])\s+', cleaned_text)
            if chunk.strip()
        ]
    else:
        paragraphs = cleaned_text

    return [
        perform_ner_with_hf_model(paragraph, hf_ner_model)
        for paragraph in paragraphs
    ]

# Process paragraphs and print results
paragraph_results = ners_from_paragraph(cleaned_text)
# paragraph_results holds one list of entities per processed chunk; each
# entity is read as a mapping with 'word' and 'entity_group' keys.
for i, paragraph_ner in enumerate(paragraph_results):
    print(f"\nParagraph {i+1} entities:")
    for entity in paragraph_ner:
        print(f"Entity: {entity['word']}, Label: {entity['entity_group']}")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

In [4]:
import fitz  # PyMuPDF
import re
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page joined with no separator
        added; an empty string when the document has no pages.
    """
    # The context manager closes the document even if extracting a page
    # raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Preprocess the text
def preprocess_text(text):
    """Normalise extracted text before it is split into sentences.

    Removes every character that is not a word character, whitespace or one
    of ``. , ; : ! ? -``, then collapses each run of whitespace to a single
    space and trims both ends.

    Args:
        text: Raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove unwanted characters first. Doing it after the whitespace pass
    # (as before) left double spaces, or a leading/trailing space, wherever a
    # removed character had been surrounded by spaces.
    text = re.sub(r'[^\w\s.,;:!?-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Summarize using vector embeddings and a custom prompt
def summarize_with_prompt(pdf_path, custom_prompt, num_sentences=5):
    """Build an extractive, prompt-guided summary of a PDF.

    The PDF text is cleaned and split into sentences on ``'. '``. Each
    sentence is scored by the cosine similarity between its embedding and the
    embedding of ``custom_prompt``, and the best-scoring sentences are kept.

    Args:
        pdf_path: Path of the PDF to summarise.
        custom_prompt: Text the sentences are compared against.
        num_sentences: Maximum number of sentences to keep; capped at the
            number of sentences found.

    Returns:
        str: The selected sentences joined with ``'. '``, highest similarity
        first, or ``"No valid sentences found for summarization."`` when the
        text has no non-blank sentence or ``num_sentences`` is not positive.
    """
    # Load and preprocess text
    text = extract_text_from_pdf(pdf_path)
    cleaned_text = preprocess_text(text)

    # str.split always yields at least one element ('' for empty text), so
    # blank fragments are dropped before counting.
    sentences = [s for s in cleaned_text.split('. ') if s.strip()]

    # topk raises when asked for more items than exist, so cap the request.
    num_sentences = min(num_sentences, len(sentences))
    if num_sentences <= 0:
        return "No valid sentences found for summarization."

    # Load the sentence-embedding model
    model = SentenceTransformer('all-MiniLM-L6-v2') # all-MiniLM-L6-v2 or all-mpnet-base-v2

    # Generate embeddings for sentences and prompt
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    prompt_embedding = model.encode(custom_prompt, convert_to_tensor=True)

    # Row 0 holds the similarity of the prompt to every sentence
    similarities = util.pytorch_cos_sim(prompt_embedding, sentence_embeddings)[0]

    # Indices of the best-scoring sentences, as plain ints
    top_sentence_indices = similarities.topk(num_sentences)[1].tolist()

    return '. '.join(sentences[idx] for idx in top_sentence_indices)

# Example usage
pdf_path = "/content/drive/MyDrive/SPC_Assist/Contracts/New_contract.pdf"
# The prompt is written as two adjacent literals inside parentheses because a
# double-quoted string cannot contain a raw line break; the explicit "\n"
# keeps the line break that separated the two parts.
custom_prompt = (
    "Please provide a comprehensive summary of the supply chain management processes, focusing on procurement, production, logistics, inventory management, and distribution. For each component, describe its role and significance, and address common challenges and strategic impacts. In procurement, include aspects such as supplier selection and contract management, and discuss challenges like supplier reliability and cost fluctuations. In production, highlight manufacturing processes and quality control, along with challenges such as downtime and quality issues. For logistics, cover transportation and warehousing, addressing delays and inefficiencies, and their impact on cost reduction and delivery performance. In inventory management, focus on stock optimization and demand forecasting, noting challenges like stockouts and overstocking and their effects on carrying costs and cash flow. Discuss distribution processes, including order fulfillment and delivery, and the associated challenges and impacts on customer service and market reach. Additionally, summarize risk management strategies, including risk assessment and mitigation practices, and the challenges related to disruptions and regulatory changes. Highlight the importance of compliance with regulations and standards, discussing challenges such as regulatory changes and documentation requirements, and their impact on operational legitimacy. Describe sustainability practices, focusing on environmental impact reduction and ethical sourcing, and the challenges of balancing cost and sustainability. Discuss the implications of technological integration, including automation and data analytics, and the challenges related to adoption and cybersecurity. Finally, address key legal considerations such as insurance, penalties, obligations, change of work, indemnification, default, and termination clauses. \n"
    "This summary should distill all critical points, including roles, challenges, impacts, and legal considerations, into a cohesive overview."
)
summary = summarize_with_prompt(pdf_path, custom_prompt, num_sentences=40)
print("Summary:", summary)


Summary: The General Contractor has the right to contact Subcontractors suppliers or vendors and visit the office, shops, and yards of the Subcontractor and his suppliers and vendors in order to verify compliance with the schedule. All shipping and delivery costs are included and Subcontractor shall provide a minimum 48-hour advance notice to the General Contractor for all deliveries. Subcontractor is responsible for providing the equipment and labor necessary to unload, store, and distribute its material, inclusive of any F.O.B. Subcontractor is responsible to maintain strict compliance with the General Contractors gate usage procedures for his labor, suppliers, and Sub-Subcontractors. Subcontractor shall enforce discipline and good order among its employees, suppliers, and Sub-Subcontractors engaged in the work. Updated Material Procurement Log. Waste which cannot be identified as being attributable to a specific trade food wrappings, beverage containers, newspaper, general packing m

In [7]:
# Latest working model that works with custom word embeddings -- 30th August


import fitz  # PyMuPDF
import re
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page joined with no separator
        added; an empty string when the document has no pages.
    """
    # The context manager closes the document even if extracting a page
    # raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Preprocess the text
def preprocess_text(text):
    """Normalise extracted text before it is split into sentences.

    Removes every character that is not a word character, whitespace or one
    of ``. , ; : ! ? -``, then collapses each run of whitespace to a single
    space and trims both ends.

    Args:
        text: Raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove unwanted characters first. Doing it after the whitespace pass
    # (as before) left double spaces, or a leading/trailing space, wherever a
    # removed character had been surrounded by spaces.
    text = re.sub(r'[^\w\s.,;:!?-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Summarize using vector embeddings and a custom prompt
def summarize_with_prompt(pdf_path, custom_prompt, num_sentences=5, output_file="summary.txt"):
    """Build an extractive, prompt-guided summary of a PDF and save it.

    The PDF text is cleaned and split into sentences on ``'. '``. Each
    sentence is scored by the cosine similarity between its embedding and the
    embedding of ``custom_prompt``; the best-scoring sentences form the
    summary, which is also written to ``output_file``.

    Args:
        pdf_path: Path of the PDF to summarise.
        custom_prompt: Text the sentences are compared against.
        num_sentences: Maximum number of sentences to keep; capped at the
            number of sentences found.
        output_file: Path of the text file the summary is written to.

    Returns:
        str: The selected sentences joined with ``'. '``, highest similarity
        first. When the text has no non-blank sentence or ``num_sentences``
        is not positive, returns
        ``"No valid sentences found for summarization."`` and writes no file.
    """
    # Load and preprocess text
    text = extract_text_from_pdf(pdf_path)
    cleaned_text = preprocess_text(text)

    # str.split always yields at least one element ('' for empty text), so
    # blank fragments must be dropped for the guard below to ever fire.
    sentences = [s for s in cleaned_text.split('. ') if s.strip()]

    # Handle case where there are not enough sentences
    num_sentences = min(num_sentences, len(sentences))

    # If no valid sentences found, return without loading the model
    if num_sentences <= 0:
        return "No valid sentences found for summarization."

    # Load the sentence-embedding model
    model = SentenceTransformer('all-mpnet-base-v2') # all-MiniLM-L6-v2 or all-mpnet-base-v2 or all-MiniLM-L12-v2

    # Generate embeddings for sentences and prompt
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    prompt_embedding = model.encode(custom_prompt, convert_to_tensor=True)

    # Row 0 holds the similarity of the prompt to every sentence
    similarities = util.pytorch_cos_sim(prompt_embedding, sentence_embeddings)[0]

    # Indices of the best-scoring sentences, as plain ints
    top_sentence_indices = similarities.topk(num_sentences)[1].tolist()
    summary = '. '.join(sentences[idx] for idx in top_sentence_indices)

    # Explicit encoding: the cleaned text may contain non-ASCII word
    # characters, which a platform-default codec might not be able to write.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(summary)

    return summary

# Example usage
pdf_path = "/content/drive/MyDrive/SPC_Assist/Contracts/New_contract.pdf"
custom_prompt = "Summarize the key elements and critical processes involved in the supply chain, including procurement, production, logistics, inventory management, and distribution. The summary should cover potential challenges, efficiencies, and strategic impacts, while also addressing any penalties, termination clauses, legal considerations, and intricate details that might affect operations. Ensure that the summary captures the essence of all sections, including risk management strategies, compliance with regulations, sustainability practices, and the implications of technological integration. The final summary should distill all pages of content into a comprehensive overview, retaining all critical points and nuances."
# File the summary is written to by summarize_with_prompt.
output_file = "/content/new_contract_summary_all-mpnet-base-v2.txt"
# NOTE(review): the path printed below differs from output_file and is not
# written by this cell — looks like a leftover from an earlier run; confirm.
print("/content/summary_miniLLM_L12.txt")
summary = summarize_with_prompt(pdf_path, custom_prompt, num_sentences=25, output_file=output_file)
print("Summary saved to:", output_file)

/content/summary_miniLLM_L12.txt


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Summary saved to: /content/new_contract_summary_all-mpnet-base-v2.txt


In [12]:
import fitz  # PyMuPDF
import re
import torch
from sentence_transformers import SentenceTransformer, util

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page joined with no separator
        added; an empty string when the document has no pages.
    """
    # The context manager closes the document even if extracting a page
    # raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def preprocess_text(text):
    """Normalise extracted text before it is split into sentences.

    Removes every character that is not a word character, whitespace or one
    of ``. , ; : ! ? -``, then collapses each run of whitespace to a single
    space and trims both ends.

    Args:
        text: Raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove unwanted characters first. Doing it after the whitespace pass
    # (as before) left double spaces, or a leading/trailing space, wherever a
    # removed character had been surrounded by spaces.
    text = re.sub(r'[^\w\s.,;:!?-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def assign_positional_weights(sentences, max_weight=1.5, min_weight=0.5):
    """Compute one position-based weight per sentence.

    An index in the first or last tenth of the list (``i < 0.1 * n`` or
    ``i > 0.9 * n``) gets ``max_weight``. Every other index gets a value that
    equals ``max_weight`` at the midpoint ``n / 2`` and falls linearly
    towards ``min_weight`` as the index moves away from it.

    Args:
        sentences: Sequence whose length determines the number of weights.
        max_weight: Weight for the edge zones and the midpoint.
        min_weight: Lower end of the linear ramp.

    Returns:
        list: ``len(sentences)`` weights, in index order.
    """
    count = len(sentences)
    half = count / 2
    spread = max_weight - min_weight

    def weight_at(position):
        # Edge zones are checked first, so the ramp is only evaluated for
        # interior positions.
        in_edge_zone = position < count * 0.1 or position > count * 0.9
        if in_edge_zone:
            return max_weight
        return min_weight + spread * (1 - abs(position - half) / half)

    return [weight_at(position) for position in range(count)]

def extract_structure(pdf_path, heading_min_size=12):
    """Classify every text span of a PDF as heading or body text by font size.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        heading_min_size: A span whose ``size`` is strictly greater than this
            value is labelled a heading. Defaults to 12, the threshold that
            was previously hard-coded.

    Returns:
        list[dict]: One entry per span, in reading order, with keys
        ``"type"`` (``"heading"`` or ``"text"``), ``"text"`` (the span text)
        and ``"page"`` (``page.number`` of the page the span is on).
    """
    structure = []
    # The context manager closes the document even if parsing a page raises;
    # the original never closed it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # type 0 = text block; others are skipped
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Assumes a larger font size indicates a heading
                        kind = "heading" if span["size"] > heading_min_size else "text"
                        structure.append({"type": kind, "text": span["text"], "page": page.number})
    return structure

def summarize_with_structure(pdf_path, custom_prompt, num_sentences=40, output_file="summary.txt"):
    """Summarise a PDF section by section and save the result.

    Spans returned by ``extract_structure`` are grouped under the most recent
    heading. Within each section, sentences (split on ``'. '``) are scored by
    cosine similarity to ``custom_prompt`` multiplied by the weights from
    ``assign_positional_weights``. Each section receives a share of
    ``num_sentences`` proportional to its sentence count, with at least one
    sentence and never more than the section contains.

    Args:
        pdf_path: Path of the PDF to summarise.
        custom_prompt: Text the sentences are compared against.
        num_sentences: Approximate total number of sentences to keep.
        output_file: Path of the text file the summary is written to.

    Returns:
        str: One ``"<heading>: <sentences>"`` entry per section, separated by
        blank lines; an empty string when no section has any text.
    """
    structure = extract_structure(pdf_path)

    # Group text by sections
    sections = []
    current_section = {"heading": "", "text": ""}
    for item in structure:
        if item["type"] == "heading":
            if current_section["text"]:
                sections.append(current_section)
            current_section = {"heading": item["text"], "text": ""}
        else:
            current_section["text"] += item["text"] + " "
    if current_section["text"]:
        sections.append(current_section)

    # Clean and split every section once; the result feeds both the total
    # count and the per-section scoring below.
    sentences_by_section = [
        preprocess_text(section["text"]).split('. ') for section in sections
    ]
    total_sentences = sum(len(sentences) for sentences in sentences_by_section)

    section_summaries = []
    if sections:  # nothing to score otherwise, so skip loading the model
        model = SentenceTransformer('all-MiniLM-L6-v2')

        # Check if CUDA (GPU) is available and set the device
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        # The prompt is the same for every section: embed it once.
        prompt_embedding = model.encode(custom_prompt, convert_to_tensor=True).to(device)

        for section, sentences in zip(sections, sentences_by_section):
            sentence_embeddings = model.encode(sentences, convert_to_tensor=True).to(device)
            similarities = util.pytorch_cos_sim(prompt_embedding, sentence_embeddings)[0]

            # Apply positional weighting
            positional_weights = torch.tensor(
                assign_positional_weights(sentences),
                dtype=similarities.dtype,
                device=similarities.device,
            )
            weighted_similarities = similarities * positional_weights

            # Proportional share, at least 1, and capped at the section size:
            # topk raises when asked for more items than the section has,
            # which happened whenever num_sentences exceeded the total.
            share = int(num_sentences * len(sentences) / total_sentences)
            section_size = min(len(sentences), max(1, share))
            top_sentence_indices = weighted_similarities.topk(section_size)[1].tolist()
            section_summary = '. '.join(sentences[idx] for idx in top_sentence_indices)
            section_summaries.append(f"{section['heading']}: {section_summary}")

    # Combine section summaries
    final_summary = '\n\n'.join(section_summaries)

    # Explicit encoding: the text may contain non-ASCII word characters,
    # which a platform-default codec might not be able to write.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(final_summary)

    return final_summary

# Example usage
pdf_path = "/content/drive/MyDrive/SPC_Assist/Contracts/New_contract.pdf"
custom_prompt = "Summarize the key elements and critical processes involved in the supply chain, including procurement, production, logistics, inventory management, and distribution. The summary should cover potential challenges, efficiencies, and strategic impacts, while also addressing any penalties, termination clauses, legal considerations, and intricate details that might affect operations. Ensure that the summary captures the essence of all sections, including risk management strategies, compliance with regulations, sustainability practices, and the implications of technological integration. The final summary should distill all pages of content into a comprehensive overview, retaining all critical points and nuances."
# File the section-by-section summary is written to.
output_file = "/content/new_contract_summary_structured.txt"

# num_sentences=25 overrides the function's default of 40.
summary = summarize_with_structure(pdf_path, custom_prompt, num_sentences=25, output_file=output_file)
print("Summary saved to:", output_file)


Summary saved to: /content/new_contract_summary_structured.txt


In [18]:
import fitz  # PyMuPDF
import re
import torch
from sentence_transformers import SentenceTransformer, util

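# NOTE: this cell reuses extract_structure, preprocess_text and
# assign_positional_weights exactly as defined in the previous cell; run that
# cell first.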

def summarize_with_structure(pdf_path, custom_prompt, sentences_per_section=8, output_file="summary.txt"):
    """Summarise a PDF per section and per sub-paragraph, and save the result.

    Spans returned by ``extract_structure`` are grouped under the most recent
    heading and cut into sub-paragraphs of just over 100 words. Within each
    sub-paragraph, sentences (split on ``'. '``) are scored by cosine
    similarity to ``custom_prompt`` multiplied by the weights from
    ``assign_positional_weights``, and the best ones are kept. Sub-paragraphs
    with fewer than two sentences are skipped.

    Args:
        pdf_path: Path of the PDF to summarise.
        custom_prompt: Text the sentences are compared against.
        sentences_per_section: Maximum number of sentences kept per
            sub-paragraph (capped at the sub-paragraph's sentence count).
        output_file: Path of the text file the summary is written to.

    Returns:
        str: Markdown-style text with a ``## <heading>`` line per section and
        a ``### Sub-paragraph <n>`` entry per summarised sub-paragraph; an
        empty string when no section has any text.
    """
    structure = extract_structure(pdf_path)

    # Group text by sections and sub-paragraphs
    sections = []
    current_section = {"heading": "", "paragraphs": []}
    current_paragraph = ""

    for item in structure:
        if item["type"] == "heading":
            # Flush pending text into the section it was read under BEFORE
            # opening the next one. Previously the leftover paragraph was
            # attached to the new heading, and a section holding only a short
            # paragraph was dropped altogether.
            if current_paragraph:
                current_section["paragraphs"].append(current_paragraph)
                current_paragraph = ""
            if current_section["paragraphs"]:
                sections.append(current_section)
            current_section = {"heading": item["text"], "paragraphs": []}
        else:
            current_paragraph += item["text"] + " "
            if len(current_paragraph.split()) > 100:  # Assume a new paragraph after 100 words
                current_section["paragraphs"].append(current_paragraph)
                current_paragraph = ""

    if current_paragraph:
        current_section["paragraphs"].append(current_paragraph)
    if current_section["paragraphs"]:
        sections.append(current_section)

    section_summaries = []
    if sections:  # nothing to score otherwise, so skip loading the model
        # Load the embedding model
        model = SentenceTransformer('hkunlp/instructor-large')

        # Check if CUDA (GPU) is available and set the device
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        # The prompt is the same for every paragraph: embed it once.
        prompt_embedding = model.encode(custom_prompt, convert_to_tensor=True).to(device)

        for section in sections:
            section_summary = [f"## {section['heading']}"]

            for i, paragraph in enumerate(section['paragraphs']):
                cleaned_text = preprocess_text(paragraph)
                sentences = cleaned_text.split('. ')

                if len(sentences) < 2:
                    continue  # Skip very short paragraphs

                sentence_embeddings = model.encode(sentences, convert_to_tensor=True).to(device)
                similarities = util.pytorch_cos_sim(prompt_embedding, sentence_embeddings)[0]

                # Apply positional weighting
                positional_weights = torch.tensor(
                    assign_positional_weights(sentences),
                    dtype=similarities.dtype,
                    device=similarities.device,
                )
                weighted_similarities = similarities * positional_weights

                # Keep at most the requested number of sentences; never ask
                # topk for a negative count or more than the paragraph holds.
                num_sentences = max(0, min(sentences_per_section, len(sentences)))
                top_sentence_indices = weighted_similarities.topk(num_sentences)[1].tolist()
                paragraph_summary = '. '.join(sentences[idx] for idx in top_sentence_indices)

                section_summary.append(f"### Sub-paragraph {i+1}\n{paragraph_summary}")

            section_summaries.append('\n\n'.join(section_summary))

    # Combine section summaries
    final_summary = '\n\n'.join(section_summaries)

    # Explicit encoding: the text may contain non-ASCII word characters,
    # which a platform-default codec might not be able to write.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(final_summary)

    return final_summary

# Example usage
pdf_path = "/content/drive/MyDrive/SPC_Assist/Contracts/New_contract.pdf"
custom_prompt = "Summarize the key elements and critical processes involved in the supply chain, including procurement, production, logistics, inventory management, and distribution. The summary should cover potential challenges, efficiencies, and strategic impacts, while also addressing any penalties, termination clauses, legal considerations, and intricate details that might affect operations. Ensure that the summary captures the essence of all sections, including risk management strategies, compliance with regulations, sustainability practices, and the implications of technological integration. The final summary should distill all pages of content into a comprehensive overview, retaining all critical points and nuances."
# NOTE(review): the file name mentions "mpnet", but summarize_with_structure
# in this cell loads 'hkunlp/instructor-large' — confirm the intended name.
output_file = "/content/new_contract_summary_structured_detailed_mpnet_concise.txt"

# sentences_per_section=3 overrides the function's default of 8.
summary = summarize_with_structure(pdf_path, custom_prompt, sentences_per_section=3, output_file=output_file)
print("Detailed summary saved to:", output_file)

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


Detailed summary saved to: /content/new_contract_summary_structured_detailed_mpnet_concise.txt
