In [16]:
import pandas as pd

In [17]:
# Load the notes dataset
notes_df = pd.read_csv('../Model-Training/biology_information_retrieval_sample.csv', encoding='ISO-8859-1')  
notes_content = notes_df['Text Content'].tolist()
notes_topics = notes_df['Topic'].tolist()
notes_subtopics = notes_df['Sub-topic'].tolist()

In [19]:
# Load the summarization dataset
summary_df = pd.read_csv('../Model-Training/bio_summary_keywords.csv', encoding='ISO-8859-1')
long_texts = summary_df['Long Text'].tolist()
summaries = summary_df['Summary'].tolist()
keywords = summary_df['Keywords'].tolist()

In [20]:
pip install sentence-transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for summarization dataset (long texts)
summary_embeddings = embedder.encode(long_texts)

# Generate embeddings for notes dataset
notes_embeddings = embedder.encode(notes_content)

# Combine all content and embeddings for FAISS indexing
all_content = long_texts + notes_content
all_embeddings = np.concatenate([summary_embeddings, notes_embeddings], axis=0)

# Convert embeddings to a float32 NumPy array
all_embeddings_array = np.array(all_embeddings).astype("float32")

# Create and populate the FAISS index
common_index = faiss.IndexFlatL2(all_embeddings_array.shape[1])
common_index.add(all_embeddings_array)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Path to the fine-tuned model
model_path = 'D:/Downloads/RP/Summarization/flan_t5_finetuned_model-20241119T102614Z-001/flan_t5_finetuned_model' 

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


In [22]:
def postprocess_summary(summary):
    """Capitalize the first letter of each sentence."""
    summary = summary.strip()

    # Capitalize first letter of each sentence
    sentences = summary.split(". ")  # Split on ". " assuming sentences end with periods
    sentences = [s.strip().capitalize() for s in sentences if s]

    # Rejoin sentences with proper spacing and punctuation
    summary = ". ".join(sentences).strip()

    # Ensure final punctuation
    if summary and summary[-1] not in ".!?":
        summary += "."

    return summary


In [23]:
def generate_summary_for_long_text(long_text, min_words=150, max_words=350):
    from textwrap import wrap

    # function to chunk text
    def chunk_text(text, max_tokens=500):
        words = text.split()
        chunks = [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
        return chunks

    # Check if input exceeds the max token limit
    max_input_words = 390  # ~512 tokens
    if len(long_text.split()) > max_input_words:
        # Chunk the input into smaller parts
        chunks = chunk_text(long_text, max_tokens=max_input_words)

        # Generate a summary for each chunk and combine the results
        summaries = [generate_summary_for_long_text(chunk, min_words, max_words) for chunk in chunks]
        combined_summary = " ".join(summaries)

        # Ensure the combined summary fits within the final word range
        return truncate_to_word_count(combined_summary, max_words)

    # For shorter inputs, generate the summary directly
    prompt = (
        f"Generate a concise, well-structured, and grammatically correct summary for the following content:\n\n"
        f"{long_text}\n\nSummary:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_words * 2,   # Allow for token-to-word conversion (~1.3x)
        min_length=min_words,      # Enforce minimum token count
        length_penalty=1.2,
        num_beams=4,
        repetition_penalty=2.0,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summary = postprocess_summary(summary)

    # Truncate summary to fit exact word count range
    return truncate_to_word_count(summary, max_words)

def truncate_to_word_count(text, max_words):
    """Ensure the summary fits within the desired word count range."""
    words = text.split()
    if len(words) > max_words:
        return " ".join(words[:max_words]) + ('.' if text[-1] not in '.!?' else '')
    return text


In [25]:
# Test with a long text
sample_long_text = """
Human ear is divided into three parts; the outer ear, middle ear and inner ear.
Outer wear consists of the piano and auditory canal. Auditory canal is a slightly “S” shaped tube and
lined by hairy skin with numerous modified sweat glands which secrete ear wax. The auditory canal
extends to the tympanic membrane which is located in between the middle and the outer wear.
Middle ear (tympanic cavity) is an air-filled cavity within the temporal bone. It is lined with simple
epithelium. In the medial wall of the middle ear, there are two openings called oval window and
round window. Oval window is covered by a small bone called stapes. The round window is covered
by a fine fibrous tissue. Three very small bones (ear ossicles) called malleus, incus and stapes
extend across the middle ear from tympanic membrane to the oval window. They form movable
joints with each other and the medial wall of the cavity at the oval window. Malleus 1s in contact
with the tympanic membrane and form a movable joint with the incus. Incus articulates with the
stapes which fit with the oval window. A long tube called Eustachian tube connects the middle
ear to the pharynx. 
Inner ear is formed from a network of channels and cavities in temporal bone which are called
bony labyrinth. Within the bony labyrinth, a network of fluid filled membranes called membranous
labyrinth is present which lines and fills the bony labyrinth. The inner ear is composed of three main
regions: vestibule, three semicircular canals and cochlea. Vestibule is the expanded part near
the middle ear. Oval and round windows are present in its lateral walls. The vestibular contains two
membranous sacs called utricle and saccule. Semicircular canals are three tubes arranged at
right angles to one another so that one is situated in each of the three planes of space. They are
continuous with the vestibule. Cochlea is a coiled structure with a broad base which is continuous
with the vestibule. Cochlea has three compartments: an upper vestibular canal, a lower tympanic
canal and middle cochlear duct which is a small canal that separates the upper and lower canals.
Vestibular canal originates at the oval window and the tympanic canal ends at the round window. The
two canals are continuous with each other and filled with perilymph. The cochlear duct is a part
of the membranous labyrinth and filled with endolymph. The floor of the cochlear duct is called
the basilar membrane which bears the organ of Corti (spiral organ). It contains supporting cells
and specialized cochlear hair cells containing mechanoreceptors (auditory receptors) of the ear.
Hairs of the cochlear hair cells project into the cochlear duct. Many hairs are in contact with the
tectorial membrane that hangs over the organ of Corti. Hair cells synapse with the dendrites of
sensory neurons that combine to form the auditory nerve which transmits auditory information
to the brain. 

Functions of the human ear

Hearing

Vibrating objects produce pressure waves in the surrounding air. In hearing, the ear transduces
these pressure waves (mechanical stimulus) to changing membrane potential leading to generation
of nerve impulses that are transmitted to the brain which perceives as sound.
The outer wear collects and concentrates the sound waves and directs them along the auditory
canal towards the tympanic membrane. This causes the tympanic membrane to vibrate. Tympanic
membrane vibrations are transmitted and amplified through the middle ear by the movement of
three jointed ear ossicles.
The ear ossicles transmit the vibrations to the oval window which is located on the membrane
of the cochlear surface. When the stapes vibrates against the oval window, pressure waves are
created in the perilymph inside the cochlea. Most fluid pressure waves in the vestibular canal
are transmitted to the endolymph of cochlea duct which push down the basilar membrane. As a
result, the basilar membrane and attached hair cells vibrate up and down. This causes bending of
hair projecting from the hair cell against the fixed tectorial membrane which lies above the hair
cells. This results in the stimulation of auditory receptors in the auditory hair cells which lead to
generation of nerve impulses. These nerve impulses are passed through the auditory nerve to the
auditory area of the brain (temporal lobe of the cerebrum) for sound perception.
After the sound perception, the fluid wave is finally dissipated into the middle ear by vibration of
the membrane of the round window. The Eustachian tube maintains the air pressure on both sides of
tympanic membrane at the atmospheric pressure level.
Equilibrium
Semicircular canals and vestibule located in the inner ear provide information about the position
of the head in space and contribute to maintain the posture and balance.
Utricle and saccule of the vestibule perceive position with respect to gravity or linear movements.
Each of these endolymph filled chambers contain hair cells that project into a gelatinous material

in which small calcium carbonate particles (otolith) are embedded. When the head is tilted
otoliths press on the hairs projecting into the gels bending the hair bundle. These deflections in
the hair cells cause membrane potential changes which are transmitted as nerve impulses into the
cerebellum.
The semicircular canals, arranged in three spatial planes detect angular movements of the head.
Within each canal, hair cells form a cluster with the hairs projecting into a gelatinous cap. Changes
in the position of the head causes movements in the perilymph and endolymph. As a result, hair
cells are stimulated and resulting nerve impulses are transmitted to the brain. 
"""

In [26]:
print(generate_summary_for_long_text(sample_long_text, min_words=150, max_words=350))

The human ear is divided into three parts: the outer ear, middle ear and inner ear. Outer wear consists of the piano and auditory canal. Middle ear (tympanic cavity) is an air-filled cavity within the temporal bone. Three very small bones (ear ossicles) called malleus, incus and stapes extend across the middle ear from tympanic membrane to the oval window. Three very small bones (ear ossicles) called malleus, incus and stapes extend across the middle ear from tympanic membrane to the oval window. Inner ear is formed from a network of channels and cavities in temporal bone which are called bony labyrinth. The cochlear duct is a part of the membranous labyrinth and filled with endolymph. It contains supporting cells and specialized cochlear hair cells containing mechanoreceptors (auditory receptors) of the ear. Hearing objects produce pressure waves in the surrounding air. In hearing, the ear transduces these pressure waves to changing membrane potential leading to generation of nerve im