In [23]:
# --- Environment and path setup ---
import sys
from pathlib import Path
import torch, transformers

# Adjust path so we can import from src/
repo_root = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
sys.path.insert(0, str(repo_root))

print("Repo root:", repo_root)
print("src exists:", (repo_root / "src").exists())

# Confirm key libraries
print("Python:", sys.version)
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)


Repo root: C:\Users\csain\Downloads\podifyai_deliverable1 (1)
src exists: True
Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Torch: 2.8.0+cpu
Transformers: 4.51.3


In [25]:
# --- Check data folder and create a sample file if none exists ---
data_dir = repo_root / "data"
data_dir.mkdir(exist_ok=True)

# Show current files
existing_files = [p.name for p in data_dir.glob("*")]
print("Files in data/:", existing_files)

# Create a small text file if the folder is empty
if not existing_files:
    sample_path = data_dir / "sample.txt"
    sample_path.write_text(
        "PodifyAI converts documents into summaries using NLP.\n"
        "This is a small text file for verifying extraction and summarization.",
        encoding="utf-8"
    )
    print("Created sample file:", sample_path)


Files in data/: ['Clustering.pdf', 'Sample.pdf']


In [43]:
# --- Test text extraction from any supported format ---
from src.extractors import detect_and_extract_text, clean_text, truncate_for_demo

# Choose your test file 
TEST_PATH = data_dir / "Clustering.pdf"   # or "sample.pdf", "demo.docx", etc.
print("Using file:", TEST_PATH, "exists:", TEST_PATH.exists())

try:
    raw = detect_and_extract_text(str(TEST_PATH))
    print("Extraction successful. Characters extracted:", len(raw))
except Exception as e:
    print("Error during extraction:", e)
    raw = (
        "PodifyAI demo text. Upload any supported file (PDF, DOCX, PPTX, TXT, MD, HTML, CSV) "
        "to extract text and summarize. " * 5
    )

text = truncate_for_demo(clean_text(raw))
print(text[:800])


Using file: C:\Users\csain\Downloads\podifyai_deliverable1 (1)\data\Clustering.pdf exists: True
Extraction successful. Characters extracted: 23633
Clustering Dr. Christan Grant Dr. Laura Melissa Cruz Castro CAP5771 – Introduction to Data Science University of Florida Outline Basic Concepts of Cluster Analysis Partitioning Methods Hierarchical Methods Density methods Density-based and grid-based methods What is Cluster Analysis? • What is a cluster? • A cluster is a collection of data objects which are • Similar (or related) to one another within the same group (i.e., cluster) • Dissimilar (or unrelated) to the objects in other groups (i.e., clusters) • Cluster analysis (or clustering, data segmentation, …) • Given a set of data points, partition them into a set of groups (i.e., clusters) which are as similar as possible • Cluster analysis is unsupervised learning (i.e., no predefined classes) • This contrasts with classification (i.e


In [45]:
# --- Length-safe, longer summaries with tunable word targets ---
from src.summarizer import get_summarizer
import re

sm = get_summarizer()
tok = sm.tokenizer
max_in = getattr(sm.model.config, "max_position_embeddings", 1024) - 16  # model input cap

# === knobs you can tweak ===
PER_CHUNK_TARGET_WORDS = 180     # make bigger for longer chunk summaries (e.g., 220–300)
FINAL_TARGET_WORDS     = 450     # final merged summary length (e.g., 600–900)
MIN_WORDS_FRACTION     = 0.35    # min_length = fraction of max_length
SENT_JOIN              = " "     # how we join sentences
# ===========================

def words_to_tokens(words: int) -> int:
    # heuristic: ~1 word ≈ 1.3 tokens for BART tokenizer
    return max(32, int(words * 1.3))

def _summarize_block(txt: str, target_words: int) -> str:
    tgt_tok = words_to_tokens(target_words)
    min_tok = max(30, int(tgt_tok * MIN_WORDS_FRACTION))
    out = sm(
        txt,
        max_length=tgt_tok,
        min_length=min_tok,
        do_sample=False,
        truncation=True,
    )
    return out[0]["summary_text"].strip()

def summarize_chunked_long(txt: str) -> str:
    # If short enough, do a single pass with FINAL_TARGET_WORDS
    if len(tok(txt)["input_ids"]) <= max_in:
        return _summarize_block(txt, FINAL_TARGET_WORDS)

    # Sentence-based chunking to keep chunks near the model limit
    sentences = re.split(r'(?<=[.!?])\s+', txt)
    parts, cur = [], ""
    for s in sentences:
        cand = (cur + SENT_JOIN + s).strip() if cur else s
        if len(tok(cand)["input_ids"]) <= max_in:
            cur = cand
        else:
            if cur:
                parts.append(cur)
            cur = s
    if cur:
        parts.append(cur)

    # Summarize each chunk with a reasonably long target
    chunk_summaries = [_summarize_block(p, PER_CHUNK_TARGET_WORDS) for p in parts]

    # Merge the chunk summaries and compress gently to a longer final target
    merged = " ".join(chunk_summaries)
    final = _summarize_block(merged, FINAL_TARGET_WORDS)
    return final

summary = summarize_chunked_long(text)
print("Generated summary (chars):", len(summary))
print()
print(summary)


Your max_length is set to 234, but your input_length is only 230. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=115)
Your max_length is set to 585, but your input_length is only 177. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=88)


Generated summary (chars): 981

Clustering is unsupervised learning (i.e., no predefined classes) A cluster is a collection of data objects which are similar (or related) to one another within the same group . A good clustering method will produce high quality clusters . Quality of clustering depends on the similarity measure used by the method, and the ability to discover some or all of the hidden patterns . Density-based and grid-based methods include high-dimensional clustering . High-dimensional clusters: Partitioning a database D of n objects into a set of k clusters, such that the sum of squared distances is minimized . Probabilistic and generative models: Modeling data from a generative process . Partitionsing method: Partitionsed method . High dimensional clustering: Partionsing method . The answer is typically highly subjective. The answer to clustering is often highly subjective . Clustersing is often used to find hidden patterns in a data set of objects with different types 

In [47]:
from pathlib import Path

results_dir = repo_root / "results"
results_dir.mkdir(parents=True, exist_ok=True)

output_path = results_dir / "sample_summary.txt"
output_path.write_text(summary, encoding="utf-8")

print("Summary saved to:", output_path.resolve())


Summary saved to: C:\Users\csain\Downloads\podifyai_deliverable1 (1)\results\sample_summary.txt
