In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Each page's text is preceded by a ``--- Page N ---`` marker line
    (N is 1-based) so page boundaries survive in the concatenated output.

    :param file_path: Path to the PDF file
    :return: Full extracted text as a string (empty if the document has no pages)
    """
    doc = fitz.open(file_path)
    try:
        # Build the per-page pieces and join once instead of growing a
        # string with += inside the loop.
        pieces = [
            f"\n--- Page {page_num} ---\n{page.get_text()}"
            for page_num, page in enumerate(doc, start=1)
        ]
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()
    return "".join(pieces)


In [None]:
import os

print("Current directory:", os.getcwd())

# Project root that the relative paths later in the notebook are resolved against.
# Set the PROJECT_DIR environment variable to override the default on another machine.
PROJECT_DIR = os.environ.get(
    "PROJECT_DIR", r"C:\Users\shivam\Documents\GitHub\Awesome-projects"
)
os.chdir(PROJECT_DIR)


Current directory: C:\Users\shivam


In [13]:
if __name__ == "__main__":
    # Absolute-path variant (disabled):
    # sample_path = r"C:\Users\shivam\Downloads\oceans document.pdf"
    # Relative path, so it is resolved against the current working directory.
    sample_path = "oceans document.pdf"
    text = extract_text_from_pdf(sample_path)
    print(text[:1000])  # Print the first 1000 characters



--- Page 1 ---
SWORDFISH
Market 
Analysis 
Report
BARBADOS
UNCTAD and United Nations Division 
for Ocean Affairs and the Law of the Sea 
Oceans Economy and Trade Strategies Project
 

--- Page 2 ---

--- Page 3 ---
SWORDFISH
Market 
Analysis 
Report
BARBADOS
UNCTAD and United Nations Division 
for Ocean Affairs and the Law of the Sea 
Oceans Economy and Trade Strategies Project
Geneva, 2022

--- Page 4 ---
© 2022, United Nations
This work is available through open access, by complying with the Creative Commons licence created for 
intergovernmental organizations, at http://creativecommons.org/licenses/by/3.0/igo/.
The findings, interpretations and conclusions expressed herein are those of the author(s) and do not 
necessarily reflect the views of the United Nations or its officials or Member States. 
The designations employed and the presentation of material on any map in this work do not imply the 
expression of any opinion whatsoever on the part of the United Nations concerning the 

In [14]:
import re

def clean_text(text):
    """
    Normalise whitespace in extracted text.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes a single
    space and leading/trailing whitespace is removed, so the result is one line.

    :param text: Raw extracted text
    :return: Cleaned, single-line text
    """
    # split() with no separator breaks on any whitespace run and drops empty
    # leading/trailing pieces, so one join replaces both regex passes (the
    # newline-only pass was redundant: the whitespace pass already covers newlines).
    return " ".join(text.split())


In [15]:
# Re-extract the sample PDF (sample_path is set in an earlier cell), then
# collapse its whitespace with clean_text.
raw_text = extract_text_from_pdf(sample_path)
cleaned_text = clean_text(raw_text)

In [16]:
# Show a preview only; displaying the whole document floods the notebook output.
cleaned_text[:1000]

'--- Page 1 --- SWORDFISH Market Analysis Report BARBADOS UNCTAD and United Nations Division for Ocean Affairs and the Law of the Sea Oceans Economy and Trade Strategies Project --- Page 2 --- --- Page 3 --- SWORDFISH Market Analysis Report BARBADOS UNCTAD and United Nations Division for Ocean Affairs and the Law of the Sea Oceans Economy and Trade Strategies Project Geneva, 2022 --- Page 4 --- © 2022, United Nations This work is available through open access, by complying with the Creative Commons licence created for intergovernmental organizations, at http://creativecommons.org/licenses/by/3.0/igo/. The findings, interpretations and conclusions expressed herein are those of the author(s) and do not necessarily reflect the views of the United Nations or its officials or Member States. The designations employed and the presentation of material on any map in this work do not imply the expression of any opinion whatsoever on the part of the United Nations concerning the legal status of a

In [None]:
import os

# Read the key from the environment instead of embedding it in a cell: anything
# typed here is saved with the notebook and ends up in version control.
# Set OPENAI_API_KEY in the shell before starting Jupyter.
# NOTE: the key that was previously hard-coded in this cell has been exposed and
# must be revoked and replaced.
api_key = os.getenv("OPENAI_API_KEY")
print("API key found!" if api_key else "API key NOT found!")

API key found!


In [None]:
import os
import openai
from openai import OpenAI

In [None]:
# Create an OpenAI client, passing the key explicitly (api_key is set in an earlier cell).
client = OpenAI(
  api_key=api_key
)
# Disabled alternative: assigning the key on the openai module instead.
# openai.api_key = api_key

def get_summary(text, model="gpt-4"):
    """
    Ask the chat model for a five-bullet executive summary of a business report.

    :param text: Report text (or one chunk of it) to summarise
    :param model: OpenAI chat model to use
    :return: The model's reply as a string
    """
    prompt = f"""
    You are a business analyst assistant. Given the following business report, please provide an executive summary in 5 concise bullet points:

    Report:
    \"\"\"{text}\"\"\"

    """
    # openai.ChatCompletion was removed in openai>=1.0 (the SDK line that provides
    # the OpenAI client class), so the request goes through the notebook's `client`.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=500,
    )
    # 1.x responses are objects rather than dicts: use attribute access.
    return response.choices[0].message.content


In [None]:
import openai

def get_recommendations(text, model="gpt-4"):
    """
    Generates 3-5 actionable next-step recommendations based on the report text.

    :param text: Cleaned business report text
    :param model: OpenAI model to use
    :return: Recommendations as a string
    """
    prompt = f"""
You are a senior business strategist.

Based on the following business report, generate 3 to 5 **actionable next-step recommendations** that an executive team should consider. Recommendations should be strategic, concise, and realistic.

Report:
\"\"\"{text}\"\"\"
"""
    # openai.ChatCompletion was removed in openai>=1.0; use the notebook's
    # `client` object (created in an earlier cell) instead.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=500,
    )
    # 1.x responses are objects rather than dicts: use attribute access.
    return response.choices[0].message.content


In [None]:
# text chunking

# 📦 Step 1: Chunk the Text

import tiktoken

def chunk_text_by_words(text, max_words=800, overlap=100):
    """
    Split long text into overlapping chunks based on word count.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    chunk reaches the end of the text, so no trailing chunk made up solely of
    already-covered overlap words is produced.

    :param text: Cleaned input text
    :param max_words: Max words per chunk (must be positive)
    :param overlap: Number of overlapping words between chunks
                    (must satisfy 0 <= overlap < max_words)
    :return: List of text chunks (empty list for empty/whitespace-only text)
    :raises ValueError: If max_words or overlap is out of range
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        # overlap >= max_words would give a non-positive step and never advance.
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + max_words
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already contains the final word; a further chunk
            # would only repeat overlap words.
            break

    return chunks



# 💡 Step 2: Summarize Each Chunk

def summarize_chunks(chunks):
    """
    Summarise each chunk in order with get_summary, printing progress.

    :param chunks: List of text chunks
    :return: List with one string per chunk, each starting with a
             "Chunk N Summary:" line followed by that chunk's summary
    """
    total = len(chunks)
    labelled = []

    for position, piece in enumerate(chunks, start=1):
        print(f"Summarizing chunk {position} of {total}...")
        labelled.append(f"Chunk {position} Summary:\n{get_summary(piece)}")

    return labelled

# 🔗 Step 3: Combine Summaries Into a Final Summary

def aggregate_summaries(summaries, model="gpt-4"):
    """
    Combine per-chunk summaries into one five-bullet executive summary.

    :param summaries: List of summary strings (joined with blank lines)
    :param model: OpenAI chat model to use
    :return: The model's reply as a string
    """
    combined = "\n\n".join(summaries)

    final_prompt = f"""
You are a strategy consultant. Here are summaries of sections from a long business report.

Please generate a final **executive summary** in 5 bullet points, incorporating the main insights across all sections.

Sections:
\"\"\"{combined}\"\"\"
"""

    # openai.ChatCompletion was removed in openai>=1.0; use the notebook's
    # `client` object (created in an earlier cell) instead.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.5,
        max_tokens=500,
    )
    # 1.x responses are objects rather than dicts: use attribute access.
    return response.choices[0].message.content

# full flow

# Extract -> clean -> chunk -> summarise each chunk -> aggregate.
# "your_report.pdf" is a placeholder: point it at the report to summarise.
text = extract_text_from_pdf("your_report.pdf")
cleaned_text = clean_text(text)

# chunk_text_by_words is the chunker defined earlier in this cell.
chunks = chunk_text_by_words(cleaned_text)
summaries = summarize_chunks(chunks)
final_summary = aggregate_summaries(summaries)

print("\n📋 Final Executive Summary:\n", final_summary)


In [None]:
# summarization_pipeline.py

import openai

def get_summary(text, model="gpt-4"):
    """
    Summarise one section of a business report in five bullet points.

    :param text: Section text to summarise
    :param model: OpenAI chat model to use
    :return: The model's reply as a string
    """
    prompt = f"""
You are a business analyst assistant. Summarize the following section of a business report in 5 concise bullet points:

\"\"\"{text}\"\"\"
"""
    # openai.ChatCompletion was removed in openai>=1.0. The module-level client
    # (openai.chat.completions) picks the key up from OPENAI_API_KEY, so this
    # listing still needs nothing beyond `import openai`.
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=500,
    )
    # 1.x responses are objects rather than dicts: use attribute access.
    return response.choices[0].message.content


def summarize_chunks(chunks):
    """
    Summarise every chunk in order with get_summary, printing progress.

    :param chunks: List of text chunks
    :return: List of summaries, one per chunk, in the same order
    """
    total = len(chunks)

    def _summarise(position, piece):
        # Progress line first, then the call, for each chunk in turn.
        print(f"🔹 Summarizing chunk {position}/{total}")
        return get_summary(piece)

    return [_summarise(position, piece) for position, piece in enumerate(chunks, start=1)]


def aggregate_summaries(summaries, model="gpt-4"):
    """
    Merge section summaries into a final five-bullet executive summary.

    :param summaries: List of section summary strings (joined with blank lines)
    :param model: OpenAI chat model to use
    :return: The model's reply as a string
    """
    combined = "\n\n".join(summaries)
    prompt = f"""
You are a senior strategy consultant. Given the summaries of different sections of a business report below, generate a final executive summary in 5 key bullet points.

Section Summaries:
\"\"\"{combined}\"\"\"
"""
    # openai.ChatCompletion was removed in openai>=1.0; the module-level client
    # (openai.chat.completions) picks the key up from OPENAI_API_KEY.
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=500,
    )
    # 1.x responses are objects rather than dicts: use attribute access.
    return response.choices[0].message.content



# ✅ Step 3: Use It in Your Notebook or Script

# NOTE(review): the "summarization_pipeline.py" listing earlier in this cell defines
# get_summary, summarize_chunks and aggregate_summaries but not chunk_text_by_words —
# confirm the real module exports it, otherwise this import fails.
from summarization_pipeline import (
    chunk_text_by_words,
    summarize_chunks,
    aggregate_summaries,
)
# NOTE(review): presumably a project-local parser.py holding the extraction and
# cleaning helpers — confirm.
from parser import extract_text_from_pdf, clean_text

# Load the report and collapse its whitespace
text = extract_text_from_pdf("report.pdf")
cleaned_text = clean_text(text)

# Split into chunks of up to 800 words sharing 100 words, then summarise each chunk
chunks = chunk_text_by_words(cleaned_text, max_words=800, overlap=100)
chunk_summaries = summarize_chunks(chunks)

# Aggregate the per-chunk summaries into the final summary
final_summary = aggregate_summaries(chunk_summaries)

print("\n📋 Final Executive Summary:\n", final_summary)


In [4]:
pip install streamlit


^C
Note: you may need to restart the kernel to use updated packages.


Collecting streamlit
  Using cached streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Using cached cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting numpy<3,>=1.23 (from streamlit)
  Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting packaging<25,>=20 (from streamlit)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pandas<3,>=1.4.0 (from streamlit)
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pillow<12,>=7.1.0 (from streamlit)
  Using cached pillow-11.2.1-cp311-cp311-win_amd64.whl.metadata (9.1 kB)
Collecting protobuf<

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\shivam\\pymupdf_env_311\\Lib\\site-packages\\tenacity\\after.py'
Check the permissions.



In [1]:
# Minimal Streamlit page: a title and one line of text.
# NOTE(review): the output recorded for this cell mentions `streamlit run`, so this
# code is presumably meant to be saved as a script and launched that way — confirm.
import streamlit as st

st.title("Hello, Streamlit!")
st.write("This is your first Streamlit app.")

2025-05-30 09:35:35.088 
  command:

    streamlit run C:\Users\shivam\pymupdf_env_311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
