In [6]:
# Mount Google Drive (Colab-specific module) so files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases — consider pinning.
# NOTE(review): `transformers` (imported by the T5 cell) is not listed; presumably it is pulled in
# as a dependency of sentence-transformers — confirm.
%pip install sentence-transformers scikit-learn numpy python-docx rouge_score



In [9]:
from docx import Document

def load_document_from_file(file_path):
    """
    Load the text content of a .txt or .docx file.

    Parameters:
    file_path (str or path-like): Location of the file. The extension is
        matched case-insensitively, so 'NOTES.TXT' and 'Contract.DOCX' work.

    Returns:
    str: The file content. For .docx files the paragraph texts are joined
        with newline characters.

    Raises:
    ValueError: If the extension is neither .txt nor .docx.
    """
    path_text = str(file_path)
    # Compare on a lowercased copy so upper/mixed-case extensions are accepted,
    # but open the file with the path exactly as given.
    extension_probe = path_text.lower()

    if extension_probe.endswith('.txt'):
        with open(path_text, 'r', encoding='utf-8') as file:
            return file.read()

    if extension_probe.endswith('.docx'):
        doc = Document(path_text)
        return '\n'.join(para.text for para in doc.paragraphs)

    raise ValueError("Unsupported file format. Please provide a .txt or .docx file.")

# Example usage: read the contract from the mounted Drive folder into `document`,
# which the later cells use as the full source text.
file_path = '/content/drive/MyDrive/SPC_Assist/Contracts/New_contract.docx'  # Replace with your document path
document = load_document_from_file(file_path)


In [10]:
import re

def preprocess_document(document):
    """
    Split a document into cleaned, lowercased sentences.

    The text is split on '.' (a deliberately simple sentence boundary), then
    every fragment has its punctuation removed, its whitespace normalized and
    its letters lowercased. Fragments that end up empty are dropped.

    Parameters:
    document (str): The raw document text.

    Returns:
    list[str]: Cleaned sentences in their original order.
    """
    processed_sentences = []
    for fragment in document.split('.'):  # Simple sentence splitting by period
        # Drop everything that is neither a word character nor whitespace.
        without_punctuation = re.sub(r'[^\w\s]', '', fragment)
        # split()/join collapses internal runs of spaces, tabs and newlines
        # (e.g. paragraph breaks inside a fragment) and trims both ends.
        normalized = ' '.join(without_punctuation.split()).lower()
        if normalized:
            processed_sentences.append(normalized)

    return processed_sentences

# Preprocess the loaded document; the resulting `sentences` list is reused by the
# encoding, ranking and redundancy-removal cells below.
sentences = preprocess_document(document)


In [None]:
from sentence_transformers import SentenceTransformer

# Define constants
# Name of the SentenceTransformer checkpoint to load; other checkpoints such as
# 'all-mpnet-base-v2' can be substituted here.
MODEL_NAME = 'all-MiniLM-L12-v2'

# Load the SentenceTransformer model
model = SentenceTransformer(MODEL_NAME)

# Custom Prompt Creation
# This text is embedded and used as the query that every document sentence is compared against.
prompt = "Summarize the key elements and critical processes involved in the supply chain, including procurement, production, logistics, inventory management, and distribution. The summary should cover potential challenges, efficiencies, and strategic impacts, while also addressing any penalties, termination clauses, legal considerations, and intricate details that might affect operations. Ensure that the summary captures the essence of all sections, including risk management strategies, compliance with regulations, sustainability practices, and the implications of technological integration. The final summary should distill all pages of content into a comprehensive overview, retaining all critical points and nuances."

# Encode the custom prompt (convert_to_tensor=True requests a tensor rather than a NumPy array)
prompt_embedding = model.encode(prompt, convert_to_tensor=True)

# Encode the preprocessed sentences; row i corresponds to sentences[i]
sentence_embeddings = model.encode(sentences, convert_to_tensor=True)


  from tqdm.autonotebook import tqdm, trange


In [None]:
import numpy as np
from sentence_transformers import util

def compute_similarity_and_rank(prompt_embedding, sentence_embeddings, sentences):
    """
    Rank sentences by cosine similarity to the prompt.

    Parameters:
    prompt_embedding: Embedding of the prompt, as accepted by util.pytorch_cos_sim.
    sentence_embeddings: Embeddings of the sentences; entry i is assumed to
        belong to sentences[i].
    sentences (list): The sentence texts.

    Returns:
    list: (sentence, similarity) tuples, most similar first.
    """
    # Row 0 of the similarity matrix holds the prompt-vs-sentence scores;
    # bring it to the CPU as a NumPy array so it can be sorted with NumPy.
    scores = util.pytorch_cos_sim(prompt_embedding, sentence_embeddings)[0].cpu().numpy()

    # argsort is ascending, so sort the negated scores to get a descending order.
    descending_order = np.argsort(-scores).tolist()

    ranked = []
    for position in descending_order:
        ranked.append((sentences[position], scores[position]))
    return ranked

# Compute similarities and rank sentences (most prompt-relevant first)
ranked_sentences = compute_similarity_and_rank(prompt_embedding, sentence_embeddings, sentences)


In [None]:
# Adjust similarity scores with positional bias

def apply_positional_bias(ranked_sentences, decay_factor=0.9):
    """
    Scale each score by a weight that decays with the entry's list position.

    The entry at index i has its score multiplied by decay_factor ** i, and
    the result is re-sorted by the adjusted score, highest first. Note that
    "position" is the index within `ranked_sentences` as passed in, not the
    sentence's position in the source document.

    The input list is left untouched, so calling this function again (for
    example by re-running the notebook cell) does not compound the decay.

    Parameters:
    ranked_sentences (list): (sentence, score) tuples.
    decay_factor (float): Base of the positional weight.

    Returns:
    list: A new list of (sentence, adjusted_score) tuples sorted by
        adjusted score in descending order.
    """
    adjusted = [
        (sentence, score * decay_factor ** position)
        for position, (sentence, score) in enumerate(ranked_sentences)
    ]
    return sorted(adjusted, key=lambda pair: pair[1], reverse=True)

# Apply positional bias to ranked sentences (default decay_factor of 0.9)
ranked_sentences_with_bias = apply_positional_bias(ranked_sentences)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def remove_redundancy(ranked_sentences, sentence_embeddings, threshold=0.8,
                      max_sentences=100, source_sentences=None):
    """
    Greedily keep ranked sentences that are not too similar to ones already kept.

    Sentences are visited in the given order. A sentence is kept when its
    highest cosine similarity to every previously kept sentence is below
    `threshold`. Selection stops once `max_sentences` sentences are kept.

    Parameters:
    ranked_sentences (list): (sentence, score) tuples in priority order.
    sentence_embeddings: Indexable collection of tensors (each entry must
        support .cpu().numpy()); entry i must belong to source_sentences[i].
    threshold (float): Similarity at or above which a sentence is dropped.
    max_sentences (int): Maximum number of sentences to keep.
    source_sentences (list, optional): The sentence list the embeddings were
        computed from. Defaults to the notebook-level `sentences` variable.

    Returns:
    list: The kept sentence strings, in selection order.

    Raises:
    ValueError: If a ranked sentence does not occur in `source_sentences`.
    """
    if source_sentences is None:
        source_sentences = sentences  # notebook-level list built by preprocess_document

    # Map each sentence to its first index once, instead of a linear
    # list.index() scan per lookup. First occurrence matches list.index().
    first_index = {}
    for idx, text in enumerate(source_sentences):
        first_index.setdefault(text, idx)

    selected_sentences = []
    selected_embeddings = []

    for sentence, _score in ranked_sentences:
        if len(selected_sentences) >= max_sentences:
            break

        if sentence not in first_index:
            raise ValueError(f"{sentence!r} is not in list")

        # Move the tensor to CPU and convert it to a NumPy array for scikit-learn.
        embedding = sentence_embeddings[first_index[sentence]].cpu().numpy()

        if selected_embeddings:
            similarities = cosine_similarity([embedding], selected_embeddings)
            if np.max(similarities) >= threshold:
                continue  # too close to something already selected

        selected_sentences.append(sentence)
        selected_embeddings.append(embedding)

    return selected_sentences

# Remove redundant sentences (default similarity threshold of 0.8)
final_sentences = remove_redundancy(ranked_sentences_with_bias, sentence_embeddings)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def boost_by_keywords(sentences, document, boost_factor=1.2, num_keywords=5, stop_words='english'):
    """
    Pair each sentence with a boost multiplier based on document keywords.

    The keywords are the `num_keywords` highest-weighted TF-IDF terms of
    `document`. Stop words are filtered out first; without that filter the
    top terms of a single document are function words such as "the" and
    "of", which would boost almost every sentence.

    The returned order is the input order, and the second tuple element is
    the multiplier itself (boost_factor or 1.0), not a combined score.

    Parameters:
    sentences (list): Sentence strings to annotate.
    document (str): Full text the keywords are extracted from.
    boost_factor (float): Multiplier for sentences containing a keyword.
    num_keywords (int): How many top terms to treat as keywords.
    stop_words: Passed to TfidfVectorizer(stop_words=...); use None to
        disable stop-word filtering.

    Returns:
    list: (sentence, multiplier) tuples, one per input sentence.
    """
    # An empty document has no vocabulary to fit; nothing can be boosted.
    if not document or not document.strip():
        return [(sentence, 1.0) for sentence in sentences]

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()

    # Sum the weights per term, then take the indices of the largest ones.
    term_weights = np.array(tfidf_matrix.sum(axis=0)).flatten()
    top_indices = np.argsort(term_weights)[::-1][:num_keywords]
    keywords = [feature_names[idx] for idx in top_indices]

    boosted_sentences = []
    for sentence in sentences:
        # Substring match, so a keyword also matches inflected forms that contain it.
        has_keyword = any(keyword in sentence for keyword in keywords)
        boosted_sentences.append((sentence, boost_factor if has_keyword else 1.0))

    return boosted_sentences

# Annotate the de-duplicated sentences with keyword boost multipliers
# (the result is a list of (sentence, multiplier) tuples in the same order).
boosted_sentences = boost_by_keywords(final_sentences, document)


In [None]:
def generate_summary(ranked_sentences, num_sentences=50):
    """
    Build a summary string from the leading entries of a ranked list.

    Parameters:
    ranked_sentences (list): (sentence, score) tuples, best first.
    num_sentences (int): How many leading entries to include.

    Returns:
    str: The chosen sentences joined by single spaces.
    """
    leading_pairs = ranked_sentences[:num_sentences]
    # The score is not used here; only the sentence text goes into the summary.
    return ' '.join(text for text, _ in leading_pairs)

# Build and display the extractive summary from the keyword-annotated sentences.
summary = generate_summary(boosted_sentences)
print("Summary:\n", summary)

In [None]:
def sliding_window_summarization(document, window_size=500, overlap=100,
                                 summarize_window=None, sentences_per_window=3):
    """
    Summarize the document by applying summarization over sliding windows.

    Parameters:
    document (str): The full document as a string.
    window_size (int): The size of each window in characters.
    overlap (int): The overlap between consecutive windows, in characters.
        Must be smaller than window_size so the window always advances.
    summarize_window (callable, optional): Function mapping a window's text
        to its summary string. When omitted, each window is summarized with
        the notebook's own helpers (see _summarize_window_extractively).
    sentences_per_window (int): Number of sentences the default summarizer
        keeps per window. Ignored when summarize_window is given.

    Returns:
    str: The non-empty window summaries joined by single spaces.

    Raises:
    ValueError: If window_size is not positive or overlap is not in
        the range [0, window_size).
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of characters.")
    if not 0 <= overlap < window_size:
        raise ValueError("overlap must satisfy 0 <= overlap < window_size.")

    if summarize_window is None:
        def default_summarizer(window_text):
            return _summarize_window_extractively(window_text, sentences_per_window)
        summarize_window = default_summarizer

    step = window_size - overlap
    window_summaries = []

    # Iterate through document in windows
    for start in range(0, len(document), step):
        end = start + window_size
        window_summary = summarize_window(document[start:end])
        if window_summary:
            window_summaries.append(window_summary)

        # This window already reached the end of the text; a further window
        # would only repeat part of the overlap.
        if end >= len(document):
            break

    return " ".join(window_summaries)


def _summarize_window_extractively(window_text, sentences_per_window):
    """Rank one window's sentences against the notebook prompt and keep the top few."""
    window_sentences = preprocess_document(window_text)
    if not window_sentences:
        return ""

    # Embeddings must be computed for this window's own sentences; the
    # notebook-level sentence_embeddings describe the whole document instead.
    window_embeddings = model.encode(window_sentences, convert_to_tensor=True)
    ranked = compute_similarity_and_rank(prompt_embedding, window_embeddings, window_sentences)
    return generate_summary(ranked, num_sentences=sentences_per_window)

# NOTE(review): this repeats the extractive summary from the previous cell;
# sliding_window_summarization (defined above) is not called here.
summary = generate_summary(boosted_sentences)
print("Summary:\n", summary)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import numpy as np

# Simulate a reference summary for ROUGE evaluation (this would be the ground truth summary).
# Written as adjacent string literals inside parentheses, which Python concatenates into one
# string; a single double-quoted literal cannot span more than one physical line.
reference_summary = (
    "The Master Subcontract Agreement between W.E. O'Neil Construction Co. of Arizona and Blind Ideas, entered into on October 30, 2023, sets forth terms for future projects. "
    "It specifies that the Subcontractor will provide all necessary labor, equipment, materials, and services, and is required to comply with federal, state, and local laws and regulations. "
    "The Subcontractor must ensure a safe working environment and the safety of their employees. "
    "Payment is governed by the Prime Contract, with retention possible for reasons such as defective work or non-payment to sub-subcontractors or suppliers. "
    "Disputes are to be resolved through direct discussions and mediation, and if necessary, arbitration according to Judicial Arbitration and Mediation Services (JAMS) rules. "
    "The prevailing party is entitled to recover legal costs, including attorney and expert fees."
    "The Subcontractor is required to maintain various insurance policies, including Worker's Compensation, Commercial General Liability, and Automobile Liability, and must indemnify the General Contractor against any claims arising from non-compliance with insurance or safety requirements. "
    "Payment is made within seven days of the General Contractor receiving payment from the Owner, contingent on the Subcontractor’s submission of required documentation such as invoices, waivers of lien, certified payroll records, and safety compliance statements. "
    "The Subcontractor’s warranty covers all materials and labor, and any defects must be corrected at their own expense. "
    "Final payment is contingent on fulfilling all Subcontract obligations, including submitting punch lists and warranties."
    "The agreement also prohibits the Subcontractor from assigning or transferring work without written consent. "
    "Modifications must be in writing and signed by both parties, and the Subcontractor must adhere to prevailing wage laws and other employment regulations. \n"
    "The agreement allows for termination by either party for cause or convenience, subject to certain conditions. "
    "Failure to comply with safety regulations or other obligations may result in withheld payments or backcharges. "
    "Additional provisions address indemnification, confidentiality, governing law, and force majeure. "
    "This summary provides a general overview of the agreement’s terms, and the full document should be consulted for detailed legal implications."
)

# Function to compute ROUGE score
def compute_rouge(predicted_summary, reference_summary):
    """
    Score a predicted summary against a reference with ROUGE-1 and ROUGE-L.

    Parameters:
    predicted_summary (str): The generated summary.
    reference_summary (str): The summary to compare against.

    Returns:
    tuple: (ROUGE-1 F-measure, ROUGE-L F-measure).
    """
    metric_names = ['rouge1', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # The reference is passed first and the prediction second.
    result = scorer.score(reference_summary, predicted_summary)
    rouge1_f, rougeL_f = (result[name].fmeasure for name in metric_names)
    return rouge1_f, rougeL_f

# Function to perform the entire summarization process
def summarization_pipeline(sentences, prompt_embedding, sentence_embeddings,
                           positional_decay=0.9, redundancy_threshold=0.8, boost_factor=1.2):
    """
    Pipeline to generate summaries using the given hyperparameters.

    The hyperparameter defaults mirror the defaults of the individual step
    functions, so the pipeline can also be called with just the first three
    arguments.

    Parameters:
    sentences (list): Sentence strings to rank.
    prompt_embedding: Embedding of the prompt.
    sentence_embeddings: Embeddings of the sentences.
    positional_decay (float): decay_factor passed to apply_positional_bias.
    redundancy_threshold (float): threshold passed to remove_redundancy.
    boost_factor (float): boost_factor passed to boost_by_keywords.

    Returns:
    str: The generated summary.

    Notes:
    The keyword step reads the notebook-level `document` variable, which
    must therefore exist before this function is called.
    NOTE(review): remove_redundancy is not handed the `sentences` argument
    given here — verify it resolves embeddings against the same sentence list.
    """
    # Step 1: Compute similarity and rank
    ranked_sentences = compute_similarity_and_rank(prompt_embedding, sentence_embeddings, sentences)

    # Step 2: Apply positional bias
    ranked_sentences_with_bias = apply_positional_bias(ranked_sentences, decay_factor=positional_decay)

    # Step 3: Remove redundancy
    final_sentences = remove_redundancy(ranked_sentences_with_bias, sentence_embeddings, threshold=redundancy_threshold)

    # Step 4: Boost by keywords
    boosted_sentences = boost_by_keywords(final_sentences, document, boost_factor=boost_factor)

    # Step 5: Generate the final summary
    return generate_summary(boosted_sentences)

# Define hyperparameter search space (5 x 6 x 6 = 180 combinations, one pipeline run each).
# NOTE(review): boost_by_keywords only stores the multiplier next to each sentence and
# generate_summary ignores that value, so varying boost_factor does not change the summary text.
positional_decay_values = [0.8, 0.85, 0.9, 0.95, 1.0]
redundancy_threshold_values = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
boost_factor_values = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5]

# Grid search for hyperparameter tuning
def hyperparameter_tuning(sentences, prompt_embedding, sentence_embeddings):
    """
    Exhaustively search the hyperparameter grid for the best average ROUGE.

    Every combination of the notebook-level positional_decay_values,
    redundancy_threshold_values and boost_factor_values is run through
    summarization_pipeline and scored against reference_summary. One line
    is printed per combination.

    Parameters:
    sentences (list): Sentence strings to summarize.
    prompt_embedding: Embedding of the prompt.
    sentence_embeddings: Embeddings of the sentences.

    Returns:
    tuple: (best_hyperparams, best_rouge_score). best_hyperparams is a dict
        of the winning values, or None when no combination scored above 0.
    """
    # Enumerate the grid up front, in the same order as nested loops would.
    combinations = [
        (decay, threshold, boost)
        for decay in positional_decay_values
        for threshold in redundancy_threshold_values
        for boost in boost_factor_values
    ]

    best_hyperparams, best_rouge_score = None, 0

    for positional_decay, redundancy_threshold, boost_factor in combinations:
        candidate_summary = summarization_pipeline(
            sentences, prompt_embedding, sentence_embeddings,
            positional_decay, redundancy_threshold, boost_factor
        )

        rouge1_f1, rougeL_f1 = compute_rouge(candidate_summary, reference_summary)
        mean_rouge = (rouge1_f1 + rougeL_f1) / 2

        # Strictly greater: on ties the earliest combination is kept.
        if mean_rouge > best_rouge_score:
            best_rouge_score = mean_rouge
            best_hyperparams = dict(
                positional_decay=positional_decay,
                redundancy_threshold=redundancy_threshold,
                boost_factor=boost_factor,
            )

        print(f"positional_decay={positional_decay}, redundancy_threshold={redundancy_threshold}, boost_factor={boost_factor} -> ROUGE1: {rouge1_f1}, ROUGE-L: {rougeL_f1}")

    return best_hyperparams, best_rouge_score

# Perform hyperparameter tuning over the full grid defined above
best_hyperparams, best_rouge_score = hyperparameter_tuning(sentences, prompt_embedding, sentence_embeddings)

# Report the winning combination and its averaged ROUGE-1 / ROUGE-L F-measure
print(f"Best Hyperparameters: {best_hyperparams}")
print(f"Best ROUGE Score: {best_rouge_score}")


Idea: use a separate LLM to produce an abstractive summary from the extractive results.


In [None]:
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Summarize using T5 with a custom prompt
def summarize_with_t5(custom_prompt, max_length=3000, output_file="summary.txt", max_input_tokens=512):
    """
    Generate an abstractive summary with T5 and save it to a text file.

    The model input combines the custom prompt, the ranked sentences from the
    notebook-level `boosted_sentences`, and the full notebook-level
    `sentences` list, all rendered as plain text. The input is truncated to
    `max_input_tokens` tokens, so text beyond that limit is not seen by the
    model.

    Parameters:
    custom_prompt (str): Instruction text placed at the start of the input.
    max_length (int): Maximum length of the generated summary, in tokens.
    output_file (str): Path of the text file the summary is written to.
    max_input_tokens (int): Maximum number of input tokens kept after
        truncation.

    Returns:
    str: The generated summary.
    """
    # Load T5 model and tokenizer
    model_name = "t5-small"  # You can also try "t5-base" or "t5-large"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    t5_model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Render the sentence collections as text; interpolating the Python lists
    # directly would feed brackets, quotes and tuple scores to the model.
    ranked_text = ". ".join(
        item[0] if isinstance(item, (tuple, list)) else str(item)
        for item in boosted_sentences
    )
    full_text = ". ".join(sentences)

    # "summarize: " is the task prefix T5 checkpoints expect for summarization.
    input_text = (
        f"summarize: {custom_prompt} The important ranked statements are: {ranked_text}. "
        f"Use this to make a comprehensive summary of: {full_text}"
    )

    # Tokenize the input text; truncation must be enabled for max_length to take effect.
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=max_input_tokens, truncation=True)

    # Generate the summary
    summary_ids = t5_model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=8, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Save the summary to a text file (explicit encoding so non-ASCII output is portable)
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(summary)

    return summary

# Example usage
# NOTE(review): this text appears to duplicate the `prompt` string defined in the embedding
# cell — consider reusing that variable so the two cannot drift apart.
custom_prompt = "Summarize the key elements and critical processes involved in the supply chain, including procurement, production, logistics, inventory management, and distribution. The summary should cover potential challenges, efficiencies, and strategic impacts, while also addressing any penalties, termination clauses, legal considerations, and intricate details that might affect operations. Ensure that the summary captures the essence of all sections, including risk management strategies, compliance with regulations, sustainability practices, and the implications of technological integration. The final summary should distill all pages of content into a comprehensive overview, retaining all critical points and nuances."
# Overwrites the extractive `summary` from the earlier cells with the T5 output.
summary = summarize_with_t5(custom_prompt)
print(summary)
