In [17]:
pip install faiss-cpu language-tool-python





[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
import pandas as pd

def load_data(
    summary_path='../Model-Training/bio_summary_keywords.csv',
    notes_path='../Model-Training/biology_information_retrieval_sample.csv',
    encoding='ISO-8859-1',
):
    """Load the summarization and notes datasets from CSV files.

    Args:
        summary_path: CSV file providing the 'Long Text', 'Summary' and
            'Keywords' columns.
        notes_path: CSV file providing the 'Text Content' column.
        encoding: Text encoding used to read both files.

    Returns:
        A tuple ``(long_texts, summaries, keywords, notes_content)`` where
        each element is a plain Python list taken from the matching column.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    # Summarization dataset: source text, reference summary and keywords.
    summary_df = pd.read_csv(summary_path, encoding=encoding)
    long_texts = summary_df['Long Text'].tolist()
    summaries = summary_df['Summary'].tolist()
    keywords = summary_df['Keywords'].tolist()

    # Notes dataset: only the text content is returned to callers.
    notes_df = pd.read_csv(notes_path, encoding=encoding)
    notes_content = notes_df['Text Content'].tolist()

    return long_texts, summaries, keywords, notes_content

# Load the data once at notebook level; the resulting lists are reused by the
# FAISS index construction and the retrieval cell further down.
long_texts, summaries, keywords, notes_content = load_data()


In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import os

# Path to the fine-tuned model. The FLAN_T5_MODEL_PATH environment variable
# overrides the default so the notebook is not tied to one machine's D: drive.
model_path = os.environ.get(
    'FLAN_T5_MODEL_PATH',
    'D:/Downloads/RP/Summarization/flan_t5_finetuned_model-20241119T102614Z-001/flan_t5_finetuned_model',
)

# Load the tokenizer and model from the same directory so they stay in sync.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


In [20]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

def initialize_faiss(long_texts, notes_content, embedder):
    """Embed both corpora and build a flat L2 FAISS index over them.

    Vectors are added in the order ``long_texts`` then ``notes_content``, so
    a row id below ``len(long_texts)`` refers to a long text and a larger id
    refers to ``notes_content[id - len(long_texts)]``.

    Args:
        long_texts: Sequence of strings from the summarization dataset.
        notes_content: Sequence of strings from the notes dataset.
        embedder: Object exposing ``encode(texts)`` that returns one vector
            per input text.

    Returns:
        A ``faiss.IndexFlatL2`` holding every embedded text.

    Raises:
        ValueError: If both ``long_texts`` and ``notes_content`` are empty.
    """
    embedded_parts = []
    for texts in (long_texts, notes_content):
        # Skip empty collections: there is nothing to embed, and encoding an
        # empty list may not yield a 2-D array that concatenates cleanly.
        if len(texts) == 0:
            continue
        # The query side is cast to float32 before searching; store the
        # indexed vectors in the same dtype.
        embedded_parts.append(np.asarray(embedder.encode(texts), dtype="float32"))

    if not embedded_parts:
        raise ValueError("Cannot build a FAISS index: no texts were provided.")

    # Combine embeddings into one contiguous (n_texts, dim) matrix.
    all_embeddings = np.ascontiguousarray(np.concatenate(embedded_parts, axis=0))

    # Create FAISS index sized to the embedding dimension and fill it.
    index = faiss.IndexFlatL2(all_embeddings.shape[1])
    index.add(all_embeddings)

    return index

# Initialize Sentence Embedder and FAISS Index.
# `embedder` is kept at notebook level because the retrieval cell passes it,
# together with `faiss_index`, to retrieve_relevant_content.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = initialize_faiss(long_texts, notes_content, embedder)


In [21]:
def postprocess_summary(summary):
    """Capitalize the first letter of each sentence and ensure final punctuation.

    Sentences are split after '.', '!' or '?' followed by whitespace. Only the
    first character of each sentence is upper-cased; the rest is left as is,
    so acronyms and proper nouns such as "DNA" keep their casing.

    Args:
        summary: Raw summary text.

    Returns:
        The cleaned summary, or an empty string if the input is blank.
    """
    import re

    summary = summary.strip()

    # Split *after* the terminator so each sentence keeps its own punctuation.
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", summary)]

    # Upper-case only the first character; str.capitalize() would also
    # lower-case everything after it.
    sentences = [s[0].upper() + s[1:] for s in sentences if s]

    # Rejoin sentences with single spaces.
    summary = " ".join(sentences)

    # Ensure final punctuation.
    if summary and summary[-1] not in ".!?":
        summary += "."

    return summary


In [22]:
def retrieve_relevant_content(query, embedder, index, long_texts, summaries, k=3, notes_content=None):
    """Return the texts whose embeddings are closest to ``query``.

    Args:
        query: Search string.
        embedder: Object exposing ``encode(texts)``; the result is cast to
            float32 before searching.
        index: Index exposing ``search(vectors, k=...)`` that returns
            ``(distances, indices)``.
        long_texts: Texts stored first in the index (row ids
            ``0 .. len(long_texts) - 1``).
        summaries: Unused; kept so existing callers keep working.
        k: Number of neighbours to request from the index.
        notes_content: Optional texts stored in the index right after
            ``long_texts``. When omitted, hits beyond ``long_texts`` are
            skipped.

    Returns:
        List of matching texts in the order the index returned them.
    """
    # Query embedding
    query_embedding = embedder.encode([query]).astype("float32")

    # Search the index; distances are not needed here.
    _, indices = index.search(query_embedding, k=k)

    long_count = len(long_texts)
    relevant_texts = []
    for raw_idx in indices[0]:
        idx = int(raw_idx)
        if idx < 0:
            # FAISS pads missing neighbours with -1; as a list index that
            # would silently select the *last* long text.
            continue
        if idx < long_count:
            relevant_texts.append(long_texts[idx])
        elif notes_content is not None and idx - long_count < len(notes_content):
            # Ids past the long texts map onto the notes, offset by long_count.
            relevant_texts.append(notes_content[idx - long_count])

    return relevant_texts


In [23]:
# Ensure the summary fits within the desired word count and ends with a complete sentence.

def truncate_to_word_count(text, max_words):
    """Trim ``text`` to at most ``max_words`` words, ending on a full sentence if possible.

    Args:
        text: Text to shorten.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the first
        ``max_words`` words joined by single spaces, cut back to the last
        '.', '!' or '?' if one is present.
    """
    words = text.split()

    if len(words) <= max_words:
        return text

    # Clamp so a negative limit cannot turn into a "drop from the end" slice.
    truncated_text = " ".join(words[:max(max_words, 0)])

    # Position of the last sentence terminator, or -1 if there is none.
    last_end = max(truncated_text.rfind(mark) for mark in ".!?")
    if last_end != -1:
        return truncated_text[:last_end + 1]

    # No terminator found: keep the plain word-limited text.
    return truncated_text


In [24]:
def generate_summary_for_long_text(long_text, min_words=100, max_words=250):
    """Summarize ``long_text``, splitting it into chunks when it is too long.

    Inputs longer than 390 words are cut into consecutive 390-word chunks.
    Each chunk is summarized on its own, the partial summaries are joined,
    and the result is trimmed with ``truncate_to_word_count``.

    Args:
        long_text: Text to summarize.
        min_words: Passed to the model as ``min_length`` (counted in tokens).
        max_words: Word limit of the returned summary; the model is given
            ``max_words * 2`` as its ``max_length``.

    Returns:
        The post-processed summary, or an empty string for blank input.
    """
    max_input_words = 390  # ~512 tokens
    words = long_text.split()

    # Nothing to summarize: do not prompt the model with empty content.
    if not words:
        return ""

    if len(words) > max_input_words:
        # Chunk the input into consecutive slices of at most max_input_words.
        chunks = [
            ' '.join(words[start:start + max_input_words])
            for start in range(0, len(words), max_input_words)
        ]

        # Every chunk fits the limit, so this recursion is one level deep.
        summaries = [generate_summary_for_long_text(chunk, min_words, max_words) for chunk in chunks]
        combined_summary = " ".join(summaries)

        # Ensure the combined summary fits within the final word range.
        return truncate_to_word_count(combined_summary, max_words)

    # For shorter inputs, generate the summary directly.
    prompt = (
        f"Generate a concise, well-structured, and grammatically correct summary for the following content:\n\n"
        f"{long_text}\n\nSummary:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_words * 2,   # Token budget: two tokens per allowed word
        min_length=min_words,       # Lower bound, counted in tokens
        length_penalty=1.2,
        num_beams=4,
        repetition_penalty=2.0,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summary = postprocess_summary(summary)

    # Trim to the word limit, ending on a complete sentence when possible.
    return truncate_to_word_count(summary, max_words)


In [25]:
# Function to generate a summary
def generate_summary(query, min_words=100, max_words=250):
    """Generate a summary for ``query`` with the notebook-level model.

    Args:
        query: Text inserted into the prompt, both as the topic and as the
            content to summarize.
        min_words: Passed to the model as ``min_length``.
        max_words: Passed to the model as ``max_length``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    prompt = f"Summarize the following content related to '{query}':\n{query}\nSummary:"

    # Encode the prompt, truncated to 512 tokens.
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Beam-search settings for this prompt style.
    generation_options = dict(
        max_length=max_words,
        min_length=min_words,
        length_penalty=1.5,
        num_beams=4,
        repetition_penalty=3.0,
        early_stopping=True,
    )
    generated = model.generate(encoded["input_ids"], **generation_options)

    # Only the first returned sequence is decoded.
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)


In [26]:
# Main function
def main():
    """Load the datasets and build the sentence embedder and FAISS index.

    Returns:
        A tuple ``(long_texts, summaries, keywords, notes_content, embedder,
        faiss_index)`` so callers can use what was built.
    """
    # Load data
    long_texts, summaries, keywords, notes_content = load_data()

    # Load the embedding model and initialize FAISS index
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    faiss_index = initialize_faiss(long_texts, notes_content, embedder)

    # Hand the resources back instead of letting them go out of scope.
    return long_texts, summaries, keywords, notes_content, embedder, faiss_index

# Run the script
if __name__ == "__main__":
    main()

In [27]:
# Look up the passages closest to the query in the FAISS index.
query = "Heart"
relevant_texts = retrieve_relevant_content(query, embedder, faiss_index, long_texts, summaries)

# Summarize the retrieved passages, or report that nothing matched.
if not relevant_texts:
    summary = "No relevant content found."
else:
    # The summarizer takes a single string, so merge the passages first.
    long_text_combined = " ".join(relevant_texts)
    summary = generate_summary_for_long_text(long_text_combined, min_words=100, max_words=250)

# Show the heading and the summary on separate lines.
print("Cleaned Summary:", summary, sep="\n")


Cleaned Summary:
The heart is a roughly cone-shaped hollow muscular organ. It is about 10 cm long and weighs about 225 g in women and 310 g in men. It lies in the thoracic cavity in the mediastinum between the lungs anteriorly the sternum, ribs and intercostal muscles structure the heart is composed of three layers pericardium, myocardium and endocardium. The outer sac consists of fibrous tissue and the inner of a continuous double layer of serous membrane. Cardiac muscle is found only in the heart. It is not under voluntary control but crossstripes are seen on microscopic examination. Each fibre cell has a nucleus and one or more branches. The ends of the cells and their branches are in close contact with the ends and branches of adjacent cells. When an impulse is initiated it spreads from cell to cell via the branches and intercalated discs over the whole sheet of muscle causing contraction. The myocardium is thickest at the apex and thins out towards the base. The atria and ventricl