In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF and join it into one string.

    Each page's text is preceded by a ``--- Page N ---`` marker line, where
    N is the 1-based page number.

    :param file_path: Path to the PDF file
    :return: Full extracted text as a string ("" when the document has no pages)
    :raises Exception: Anything raised by ``fitz.open`` or ``page.get_text``
        propagates unchanged; the document is closed before it does.
    """
    doc = fitz.open(file_path)
    try:
        # Join once instead of growing a string with += on every page.
        return "".join(
            f"\n--- Page {page_num} ---\n{page.get_text()}"
            for page_num, page in enumerate(doc, start=1)
        )
    finally:
        # Release the document even when a page fails to extract.
        doc.close()


In [None]:
import os
print("Current directory:", os.getcwd())

# NOTE(review): this absolute path only exists on the original author's
# machine. Guard the chdir so the cell does not abort with FileNotFoundError
# elsewhere; the PDF paths used later in the notebook are relative to the
# working directory, so point this at your own checkout if needed.
project_dir = r"C:\Users\shivam\Documents\GitHub\Awesome-projects"
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print("Project directory not found, staying in:", os.getcwd())


In [None]:
if __name__ == "__main__":
    # Quick smoke test of the extractor on a sample PDF. The names assigned
    # here (`sample_path`, `text`) are plain globals once the guard passes;
    # `sample_path` is read again by a later cell.
    # sample_path = r"C:\Users\shivam\Downloads\oceans document.pdf"
    sample_path = "oceans document.pdf"  # relative to the current working directory
    text = extract_text_from_pdf(sample_path)
    print(text[:1000])  # Print the first 1000 characters


In [None]:
import re

def clean_text(text):
    """
    Normalise whitespace in extracted text.

    Runs of newlines are collapsed first, then every run of whitespace
    (spaces, tabs, newlines) is replaced by a single space, and leading and
    trailing whitespace is trimmed.

    :param text: Raw extracted text
    :return: Cleaned text
    """
    # Step 1: squeeze consecutive newlines into one.
    newline_squeezed = re.sub(r'\n+', '\n', text)
    # Step 2: squeeze any whitespace run (including the remaining newlines)
    # into a single space.
    single_spaced = re.sub(r'\s+', ' ', newline_squeezed)
    return single_spaced.strip()


In [None]:
# Extract the sample PDF again and normalise its whitespace. `sample_path`
# is assigned in the earlier demo cell, so that cell has to run first.
raw_text = extract_text_from_pdf(sample_path)
cleaned_text = clean_text(raw_text)

In [None]:
# Show a bounded preview rather than dumping the whole document into the
# cell output (same 1000-character window used for the raw-text preview).
cleaned_text[:1000]

In [None]:
import os
import openai
from openai import OpenAI

In [None]:
# Read the API key from the environment so it never has to be written into
# the notebook. Previously the assignment was commented out, which left
# `api_key` undefined and made the client construction below fail with
# NameError on a fresh kernel.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

# Shared client used by the summarisation helpers in this notebook.
client = OpenAI(
  api_key=api_key
)

def get_summary(text, model="gpt-4"):
    """
    Ask the chat model for a five-bullet executive summary of ``text``.

    :param text: Report text; inserted verbatim into the prompt
    :param model: OpenAI chat model name
    :return: The content of the first completion choice
    """
    prompt = f"""
    You are a business analyst assistant. Given the following business report, please provide an executive summary in 5 concise bullet points:

    Report:
    \"\"\"{text}\"\"\"

    """
    # `openai.ChatCompletion` no longer exists in openai>=1.0 (the version
    # that provides the `OpenAI` class this notebook imports); requests go
    # through the module-level `client` instead.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=500,
    )
    # v1 responses are objects, not dicts, so use attribute access.
    return response.choices[0].message.content


In [None]:
import openai

def get_recommendations(text, model="gpt-4"):
    """
    Generates 3-5 actionable next-step recommendations based on the report text.

    :param text: Cleaned business report text; inserted verbatim into the prompt
    :param model: OpenAI model to use
    :return: Recommendations as a string (content of the first completion choice)
    """
    prompt = f"""
You are a senior business strategist.

Based on the following business report, generate 3 to 5 **actionable next-step recommendations** that an executive team should consider. Recommendations should be strategic, concise, and realistic.

Report:
\"\"\"{text}\"\"\"
"""
    # `openai.ChatCompletion` no longer exists in openai>=1.0 (the version
    # that provides the `OpenAI` class this notebook imports); requests go
    # through the module-level `client` instead.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=500
    )
    # v1 responses are objects, not dicts, so use attribute access.
    return response.choices[0].message.content


In [None]:
# text chunking

# 📦 Step 1: Chunk the Text

import tiktoken

def chunk_text_by_words(text, max_words=800, overlap=100):
    """
    Split long text into overlapping chunks based on word count.

    Words are whitespace-separated tokens. Consecutive chunks share
    ``overlap`` words, so each chunk starts ``max_words - overlap`` words
    after the previous one. Chunking stops as soon as a chunk reaches the
    last word, so no trailing chunk consisting only of already-covered
    words is produced.

    :param text: Cleaned input text
    :param max_words: Max words per chunk (must be > 0)
    :param overlap: Number of overlapping words between chunks
        (must satisfy ``0 <= overlap < max_words``)
    :return: List of text chunks (empty list for empty/whitespace-only text)
    :raises ValueError: If ``max_words`` or ``overlap`` is out of range
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    # overlap >= max_words would give a non-positive step and never advance.
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + max_words
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already contains the final word; a further chunk
            # would only repeat the overlap region.
            break

    return chunks



# 💡 Step 2: Summarize Each Chunk

def summarize_chunks(chunks):
    """
    Summarise each chunk in order via ``get_summary``.

    A progress line is printed before every request. Each returned entry
    is the chunk's summary prefixed with a ``Chunk N Summary:`` header
    (N is 1-based).

    :param chunks: Sequence of text chunks
    :return: List of labelled summaries, one per chunk
    """
    labelled = []
    for position, piece in enumerate(chunks, start=1):
        print(f"Summarizing chunk {position} of {len(chunks)}...")
        piece_summary = get_summary(piece)
        labelled.append(f"Chunk {position} Summary:\n{piece_summary}")
    return labelled

# 🔗 Step 3: Combine Summaries Into a Final Summary

def aggregate_summaries(summaries):
    """
    Merge per-section summaries into one five-bullet executive summary.

    The summaries are joined with blank lines and sent to the chat model
    in a single prompt.

    :param summaries: List of section summary strings
    :return: The content of the first completion choice
    """
    combined = "\n\n".join(summaries)

    final_prompt = f"""
You are a strategy consultant. Here are summaries of sections from a long business report.

Please generate a final **executive summary** in 5 bullet points, incorporating the main insights across all sections.

Sections:
\"\"\"{combined}\"\"\"
"""

    # `openai.ChatCompletion` no longer exists in openai>=1.0 (the version
    # that provides the `OpenAI` class this notebook imports); requests go
    # through the module-level `client` instead.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.5,
        max_tokens=500
    )
    # v1 responses are objects, not dicts, so use attribute access.
    return response.choices[0].message.content

# full flow

# End-to-end run: extract -> clean -> chunk -> summarise -> aggregate.
# "your_report.pdf" is a placeholder; replace it with a real file path.
text = extract_text_from_pdf("your_report.pdf")
cleaned_text = clean_text(text)

# The chunker defined in this cell is `chunk_text_by_words`; the previous
# call to `chunk_text` referred to a name that is not defined in this
# notebook and raised NameError.
chunks = chunk_text_by_words(cleaned_text)
summaries = summarize_chunks(chunks)
final_summary = aggregate_summaries(summaries)

print("\n📋 Final Executive Summary:\n", final_summary)


In [None]:
# summarization_pipeline.py

import openai

def get_summary(text, model="gpt-4"):
    """
    Summarise one section of a business report in five bullet points.

    Note: this redefines ``get_summary`` from an earlier cell; once this
    cell runs, this version (with its shorter prompt) is the one in effect.

    :param text: Section text; inserted verbatim into the prompt
    :param model: OpenAI chat model name
    :return: The content of the first completion choice
    """
    prompt = f"""
You are a business analyst assistant. Summarize the following section of a business report in 5 concise bullet points:

\"\"\"{text}\"\"\"
"""
    # `openai.ChatCompletion` no longer exists in openai>=1.0 (the version
    # that provides the `OpenAI` class this notebook imports); requests go
    # through the module-level `client` instead.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=500,
    )
    # v1 responses are objects, not dicts, so use attribute access.
    return response.choices[0].message.content


def summarize_chunks(chunks):
    """
    Summarise each chunk in order via ``get_summary``.

    Prints a progress line before every request and returns the raw
    summaries (no per-chunk header is added).

    :param chunks: Sequence of text chunks
    :return: List of summaries, one per chunk, in input order
    """
    collected = []
    for position, piece in enumerate(chunks, start=1):
        print(f"🔹 Summarizing chunk {position}/{len(chunks)}")
        collected.append(get_summary(piece))
    return collected


def aggregate_summaries(summaries):
    """
    Merge per-section summaries into one five-bullet executive summary.

    The summaries are joined with blank lines and sent to the chat model
    in a single prompt.

    :param summaries: List of section summary strings
    :return: The content of the first completion choice
    """
    combined = "\n\n".join(summaries)
    prompt = f"""
You are a senior strategy consultant. Given the summaries of different sections of a business report below, generate a final executive summary in 5 key bullet points.

Section Summaries:
\"\"\"{combined}\"\"\"
"""
    # `openai.ChatCompletion` no longer exists in openai>=1.0 (the version
    # that provides the `OpenAI` class this notebook imports); requests go
    # through the module-level `client` instead.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=500,
    )
    # v1 responses are objects, not dicts, so use attribute access.
    return response.choices[0].message.content



# ✅ Step 3: Use It in Your Notebook or Script

# NOTE(review): these imports expect `summarization_pipeline.py` and
# `parser.py` to be importable from the working directory; neither module is
# visible here — TODO confirm they exist and export these names. `parser` was
# also the name of a standard-library module in older Python versions, so
# verify which module actually gets imported.
from summarization_pipeline import (
    chunk_text_by_words,
    summarize_chunks,
    aggregate_summaries,
)
from parser import extract_text_from_pdf, clean_text

# Load and clean
# "report.pdf" is resolved relative to the current working directory.
text = extract_text_from_pdf("report.pdf")
cleaned_text = clean_text(text)

# Chunk and summarize
# One `summarize_chunks` entry is produced per chunk.
chunks = chunk_text_by_words(cleaned_text, max_words=800, overlap=100)
chunk_summaries = summarize_chunks(chunks)

# Aggregate into final summary
final_summary = aggregate_summaries(chunk_summaries)

print("\n📋 Final Executive Summary:\n", final_summary)


In [None]:
pip install streamlit


In [None]:
import streamlit as st

# Minimal Streamlit page: a title followed by one line of text.
# NOTE(review): Streamlit apps are normally launched with
# `streamlit run <script>.py`; presumably these calls will not render anything
# inside a notebook cell — verify, and move them to a script if so.
st.title("Hello, Streamlit!")
st.write("This is your first Streamlit app.")