<a href="https://colab.research.google.com/github/Ujjawal1709/PPT-Maker/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers
!pip install python-docx
!pip install python-pptx
!pip install sentencepiece

from google.colab import files
from docx import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pptx import Presentation

# Function to extract text from DOCX file
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file as one newline-separated string.

    Args:
        docx_path: Location of the DOCX file; passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``
        collection, in order, joined with ``'\\n'``.
    """
    document = Document(docx_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

# Function to summarize text using DistilBART
def summarize_text(text, model, tokenizer, max_chunk_length=2048, max_summary_length=300, min_summary_length=80):
    """Summarize *text* chunk by chunk and return the concatenated summaries.

    The text is cut into pieces of at most ``max_chunk_length`` characters,
    breaking at whitespace so that words are not split across chunks. Each
    non-blank piece is summarized independently with ``model.generate`` and
    the partial summaries are joined with single spaces.

    Args:
        text: The full document text.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_chunk_length: Maximum number of characters per chunk. Also used
            as the upper bound for the tokenizer truncation length.
        max_summary_length: ``max_length`` forwarded to ``model.generate``.
        min_summary_length: ``min_length`` forwarded to ``model.generate``.

    Returns:
        The chunk summaries joined with ``' '``; an empty string when *text*
        is empty or contains only whitespace.

    Raises:
        ValueError: If ``max_chunk_length`` is smaller than 1.
    """
    if max_chunk_length < 1:
        raise ValueError("max_chunk_length must be a positive integer")

    # The character budget must not be reused blindly as a token budget:
    # clamp the truncation length to whatever input size the tokenizer or
    # the model configuration advertises, when either is available.
    token_limit = max_chunk_length
    advertised_limits = (
        getattr(tokenizer, "model_max_length", None),
        getattr(getattr(model, "config", None), "max_position_embeddings", None),
    )
    for limit in advertised_limits:
        if isinstance(limit, int) and limit > 0:
            token_limit = min(token_limit, limit)

    # Blank chunks carry nothing to summarize; sending them to the model
    # would only force it to invent min_summary_length tokens of text.
    chunks = [
        piece.strip()
        for piece in _chunk_on_whitespace(text, max_chunk_length)
        if piece.strip()
    ]

    summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        # NOTE(review): the "summarize: " prefix looks like a T5-style task
        # prefix; it is kept unchanged so existing output is not altered.
        inputs = tokenizer.encode("summarize: " + chunk, return_tensors="pt", max_length=token_limit, truncation=True)
        summary_ids = model.generate(inputs, max_length=max_summary_length, min_length=min_summary_length, length_penalty=2.0, num_beams=2, early_stopping=True)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        print(f"Chunk {i+1} processed.")

    return ' '.join(summaries)


def _chunk_on_whitespace(text, limit):
    """Cut *text* into pieces of at most *limit* characters without splitting words."""
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total and not text[end].isspace():
            # The window would end inside a word: back up to the last space
            # or newline so the word moves intact into the next piece. A
            # single word longer than the limit is still cut hard.
            cut = max(text.rfind(' ', start, end), text.rfind('\n', start, end))
            if cut > start:
                end = cut + 1
        pieces.append(text[start:end])
        start = end
    return pieces

# Function to split summarized text into slides
def split_into_slides(summary, max_words_per_slide=50):
    """Group the words of *summary* into consecutive slide-sized strings.

    Args:
        summary: Text to distribute over slides; split on whitespace.
        max_words_per_slide: Maximum number of words placed on one slide.

    Returns:
        A list of strings, each holding up to ``max_words_per_slide`` words
        joined by single spaces. Empty when *summary* contains no words.
    """
    tokens = summary.split()
    grouped = []
    for offset in range(0, len(tokens), max_words_per_slide):
        window = tokens[offset:offset + max_words_per_slide]
        grouped.append(' '.join(window))
    return grouped

# Improved function to generate slide titles based on content analysis
def generate_slide_titles(slides):
    """Derive one title per slide from the slide's longest sentence.

    Each slide is split on ``'. '``; the longest non-empty fragment becomes
    the title. Fragments longer than 50 characters are shortened to their
    first 8 words followed by ``'...'`` (the ellipsis is only added when
    words were actually dropped).

    Args:
        slides: Iterable of slide text strings.

    Returns:
        A list of title strings, one per slide and in the same order. Slides
        with no text get the title ``"Untitled Slide"``.
    """
    max_title_chars = 50
    max_title_words = 8

    titles = []
    for slide in slides:
        # str.split never returns an empty list, so blank fragments have to
        # be filtered explicitly for the fallback title to be reachable.
        sentences = [fragment.strip() for fragment in slide.split('. ')]
        sentences = [fragment for fragment in sentences if fragment]
        if not sentences:
            titles.append("Untitled Slide")
            continue

        longest_sentence = max(sentences, key=len)
        words = longest_sentence.split()
        if len(longest_sentence) > max_title_chars and len(words) > max_title_words:
            titles.append(' '.join(words[:max_title_words]) + '...')
        else:
            titles.append(longest_sentence)
    return titles

# Function to create PowerPoint from summarized text
def create_ppt(slides_data, titles, ppt_path='output_presentation.pptx'):
    """Build a presentation with one titled bullet slide per entry and save it.

    Args:
        slides_data: Sequence of slide text strings. Each string is split on
            ``'. '`` and every non-empty fragment becomes one bullet.
        titles: Sequence of slide titles, matched to ``slides_data`` by
            index. Slides without a matching title use ``"Untitled Slide"``.
        ppt_path: Destination path passed to ``Presentation.save``.
    """
    prs = Presentation()
    slide_layout = prs.slide_layouts[1]  # Title and Content layout

    for i, slide_data in enumerate(slides_data):
        slide = prs.slides.add_slide(slide_layout)

        # Fall back to a generic title instead of raising IndexError when
        # fewer titles than slides were supplied.
        slide.shapes.title.text = titles[i] if i < len(titles) else "Untitled Slide"

        content = slide.placeholders[1].text_frame

        # Drop blank fragments so they do not turn into lone "." bullets.
        points = [fragment.strip() for fragment in slide_data.split('. ')]
        points = [fragment for fragment in points if fragment]

        for index, point in enumerate(points):
            # Only append a period when the sentence has no terminal
            # punctuation of its own (avoids "What?." style endings).
            if not point.endswith(('.', '!', '?')):
                point += '.'
            # A text frame starts out with one empty paragraph; reuse it for
            # the first bullet so the slide does not open with a blank line.
            paragraph = content.paragraphs[0] if index == 0 else content.add_paragraph()
            paragraph.text = point

    prs.save(ppt_path)

# Main function to convert document to summarized PowerPoint presentation
def document_to_ppt(docx_path, ppt_output_path, model_name="sshleifer/distilbart-cnn-12-6"):
    """Convert a DOCX document into a summarized PowerPoint presentation.

    Runs the whole pipeline: text extraction, model loading, summarization,
    slide splitting, title generation and presentation export. Progress is
    reported with ``print``. Errors are printed rather than raised.

    Args:
        docx_path: Path of the DOCX file to read.
        ppt_output_path: Path the generated PPTX file is written to.
        model_name: Checkpoint name handed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        ``True`` when the presentation was written, ``False`` when any step
        failed (the error message is printed in that case).
    """
    try:
        # Step 1: Extract text from DOCX
        print("Extracting text from document...")
        full_text = extract_text_from_docx(docx_path)

        # Without text the pipeline would "succeed" with an empty deck;
        # report that as a failure before loading the model.
        if not full_text.strip():
            raise ValueError("The document contains no text to summarize.")

        # Step 2: Initialize summarization model and tokenizer
        print("Loading summarization model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Step 3: Summarize text
        print("Summarizing the text...")
        summarized_text = summarize_text(full_text, model, tokenizer)

        # Step 4: Split summary into slides
        print("Splitting summary into slide sections...")
        slides_data = split_into_slides(summarized_text)

        # Step 5: Generate titles for slides
        print("Generating titles for slides...")
        titles = generate_slide_titles(slides_data)

        # Step 6: Create PowerPoint
        print(f"Generating PowerPoint presentation at {ppt_output_path}...")
        create_ppt(slides_data, titles, ppt_output_path)

        print("PowerPoint presentation created successfully!")
        return True

    except Exception as e:
        # Top-level boundary of the pipeline: report the problem and let the
        # caller decide what to do via the return value.
        print(f"An error occurred: {e}")
        return False

# Run the converter
if __name__ == "__main__":
    import os

    uploaded = files.upload()

    # The upload result may be empty (e.g. when the dialog is dismissed);
    # fail with a clear message instead of a bare StopIteration.
    if not uploaded:
        raise ValueError("No file was uploaded. Please upload a valid DOCX file.")

    # Use the first uploaded file
    docx_path = next(iter(uploaded))

    # Compare case-insensitively so names such as "REPORT.DOCX" are accepted
    if not docx_path.lower().endswith('.docx'):
        raise ValueError("Uploaded file is not a DOCX file. Please upload a valid DOCX file.")

    # Output PPTX file
    ppt_output_path = 'output_presentation.pptx'

    # Remove any presentation left over from an earlier run so the existence
    # check below reflects this conversion only.
    if os.path.exists(ppt_output_path):
        os.remove(ppt_output_path)

    # Convert document to PowerPoint
    document_to_ppt(docx_path, ppt_output_path)

    # Download the output PowerPoint file, but only if one was produced
    if os.path.exists(ppt_output_path):
        files.download(ppt_output_path)
    else:
        print("No presentation was generated, so there is nothing to download.")
# Install necessary libraries
!pip install transformers
!pip install python-docx
!pip install python-pptx
!pip install sentencepiece

from google.colab import files
from docx import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pptx import Presentation

# Function to extract text from DOCX file
def extract_text_from_docx(docx_path):
    """Return the text of a DOCX file as one newline-separated string.

    Args:
        docx_path: Location of the DOCX file; passed straight to ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``
        collection, in order, joined with ``'\\n'``.
    """
    document = Document(docx_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

# Function to summarize text using DistilBART
def summarize_text(text, model, tokenizer, max_chunk_length=2048, max_summary_length=300, min_summary_length=80):
    """Summarize *text* chunk by chunk and return the concatenated summaries.

    The text is cut into pieces of at most ``max_chunk_length`` characters,
    breaking at whitespace so that words are not split across chunks. Each
    non-blank piece is summarized independently with ``model.generate`` and
    the partial summaries are joined with single spaces.

    Args:
        text: The full document text.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_chunk_length: Maximum number of characters per chunk. Also used
            as the upper bound for the tokenizer truncation length.
        max_summary_length: ``max_length`` forwarded to ``model.generate``.
        min_summary_length: ``min_length`` forwarded to ``model.generate``.

    Returns:
        The chunk summaries joined with ``' '``; an empty string when *text*
        is empty or contains only whitespace.

    Raises:
        ValueError: If ``max_chunk_length`` is smaller than 1.
    """
    if max_chunk_length < 1:
        raise ValueError("max_chunk_length must be a positive integer")

    # The character budget must not be reused blindly as a token budget:
    # clamp the truncation length to whatever input size the tokenizer or
    # the model configuration advertises, when either is available.
    token_limit = max_chunk_length
    advertised_limits = (
        getattr(tokenizer, "model_max_length", None),
        getattr(getattr(model, "config", None), "max_position_embeddings", None),
    )
    for limit in advertised_limits:
        if isinstance(limit, int) and limit > 0:
            token_limit = min(token_limit, limit)

    # Blank chunks carry nothing to summarize; sending them to the model
    # would only force it to invent min_summary_length tokens of text.
    chunks = [
        piece.strip()
        for piece in _chunk_on_whitespace(text, max_chunk_length)
        if piece.strip()
    ]

    summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        # NOTE(review): the "summarize: " prefix looks like a T5-style task
        # prefix; it is kept unchanged so existing output is not altered.
        inputs = tokenizer.encode("summarize: " + chunk, return_tensors="pt", max_length=token_limit, truncation=True)
        summary_ids = model.generate(inputs, max_length=max_summary_length, min_length=min_summary_length, length_penalty=2.0, num_beams=2, early_stopping=True)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        print(f"Chunk {i+1} processed.")

    return ' '.join(summaries)


def _chunk_on_whitespace(text, limit):
    """Cut *text* into pieces of at most *limit* characters without splitting words."""
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total and not text[end].isspace():
            # The window would end inside a word: back up to the last space
            # or newline so the word moves intact into the next piece. A
            # single word longer than the limit is still cut hard.
            cut = max(text.rfind(' ', start, end), text.rfind('\n', start, end))
            if cut > start:
                end = cut + 1
        pieces.append(text[start:end])
        start = end
    return pieces

# Function to split summarized text into slides
def split_into_slides(summary, max_words_per_slide=50):
    """Group the words of *summary* into consecutive slide-sized strings.

    Args:
        summary: Text to distribute over slides; split on whitespace.
        max_words_per_slide: Maximum number of words placed on one slide.

    Returns:
        A list of strings, each holding up to ``max_words_per_slide`` words
        joined by single spaces. Empty when *summary* contains no words.
    """
    tokens = summary.split()
    grouped = []
    for offset in range(0, len(tokens), max_words_per_slide):
        window = tokens[offset:offset + max_words_per_slide]
        grouped.append(' '.join(window))
    return grouped

# Improved function to generate slide titles based on content analysis
def generate_slide_titles(slides):
    """Derive one title per slide from the slide's longest sentence.

    Each slide is split on ``'. '``; the longest non-empty fragment becomes
    the title. Fragments longer than 50 characters are shortened to their
    first 8 words followed by ``'...'`` (the ellipsis is only added when
    words were actually dropped).

    Args:
        slides: Iterable of slide text strings.

    Returns:
        A list of title strings, one per slide and in the same order. Slides
        with no text get the title ``"Untitled Slide"``.
    """
    max_title_chars = 50
    max_title_words = 8

    titles = []
    for slide in slides:
        # str.split never returns an empty list, so blank fragments have to
        # be filtered explicitly for the fallback title to be reachable.
        sentences = [fragment.strip() for fragment in slide.split('. ')]
        sentences = [fragment for fragment in sentences if fragment]
        if not sentences:
            titles.append("Untitled Slide")
            continue

        longest_sentence = max(sentences, key=len)
        words = longest_sentence.split()
        if len(longest_sentence) > max_title_chars and len(words) > max_title_words:
            titles.append(' '.join(words[:max_title_words]) + '...')
        else:
            titles.append(longest_sentence)
    return titles

# Function to create PowerPoint from summarized text
def create_ppt(slides_data, titles, ppt_path='output_presentation.pptx'):
    """Build a presentation with one titled bullet slide per entry and save it.

    Args:
        slides_data: Sequence of slide text strings. Each string is split on
            ``'. '`` and every non-empty fragment becomes one bullet.
        titles: Sequence of slide titles, matched to ``slides_data`` by
            index. Slides without a matching title use ``"Untitled Slide"``.
        ppt_path: Destination path passed to ``Presentation.save``.
    """
    prs = Presentation()
    slide_layout = prs.slide_layouts[1]  # Title and Content layout

    for i, slide_data in enumerate(slides_data):
        slide = prs.slides.add_slide(slide_layout)

        # Fall back to a generic title instead of raising IndexError when
        # fewer titles than slides were supplied.
        slide.shapes.title.text = titles[i] if i < len(titles) else "Untitled Slide"

        content = slide.placeholders[1].text_frame

        # Drop blank fragments so they do not turn into lone "." bullets.
        points = [fragment.strip() for fragment in slide_data.split('. ')]
        points = [fragment for fragment in points if fragment]

        for index, point in enumerate(points):
            # Only append a period when the sentence has no terminal
            # punctuation of its own (avoids "What?." style endings).
            if not point.endswith(('.', '!', '?')):
                point += '.'
            # A text frame starts out with one empty paragraph; reuse it for
            # the first bullet so the slide does not open with a blank line.
            paragraph = content.paragraphs[0] if index == 0 else content.add_paragraph()
            paragraph.text = point

    prs.save(ppt_path)

# Main function to convert document to summarized PowerPoint presentation
def document_to_ppt(docx_path, ppt_output_path, model_name="sshleifer/distilbart-cnn-12-6"):
    """Convert a DOCX document into a summarized PowerPoint presentation.

    Runs the whole pipeline: text extraction, model loading, summarization,
    slide splitting, title generation and presentation export. Progress is
    reported with ``print``. Errors are printed rather than raised.

    Args:
        docx_path: Path of the DOCX file to read.
        ppt_output_path: Path the generated PPTX file is written to.
        model_name: Checkpoint name handed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        ``True`` when the presentation was written, ``False`` when any step
        failed (the error message is printed in that case).
    """
    try:
        # Step 1: Extract text from DOCX
        print("Extracting text from document...")
        full_text = extract_text_from_docx(docx_path)

        # Without text the pipeline would "succeed" with an empty deck;
        # report that as a failure before loading the model.
        if not full_text.strip():
            raise ValueError("The document contains no text to summarize.")

        # Step 2: Initialize summarization model and tokenizer
        print("Loading summarization model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Step 3: Summarize text
        print("Summarizing the text...")
        summarized_text = summarize_text(full_text, model, tokenizer)

        # Step 4: Split summary into slides
        print("Splitting summary into slide sections...")
        slides_data = split_into_slides(summarized_text)

        # Step 5: Generate titles for slides
        print("Generating titles for slides...")
        titles = generate_slide_titles(slides_data)

        # Step 6: Create PowerPoint
        print(f"Generating PowerPoint presentation at {ppt_output_path}...")
        create_ppt(slides_data, titles, ppt_output_path)

        print("PowerPoint presentation created successfully!")
        return True

    except Exception as e:
        # Top-level boundary of the pipeline: report the problem and let the
        # caller decide what to do via the return value.
        print(f"An error occurred: {e}")
        return False

# Run the converter
if __name__ == "__main__":
    import os

    uploaded = files.upload()

    # The upload result may be empty (e.g. when the dialog is dismissed);
    # fail with a clear message instead of a bare StopIteration.
    if not uploaded:
        raise ValueError("No file was uploaded. Please upload a valid DOCX file.")

    # Use the first uploaded file
    docx_path = next(iter(uploaded))

    # Compare case-insensitively so names such as "REPORT.DOCX" are accepted
    if not docx_path.lower().endswith('.docx'):
        raise ValueError("Uploaded file is not a DOCX file. Please upload a valid DOCX file.")

    # Output PPTX file
    ppt_output_path = 'output_presentation.pptx'

    # Remove any presentation left over from an earlier run so the existence
    # check below reflects this conversion only.
    if os.path.exists(ppt_output_path):
        os.remove(ppt_output_path)

    # Convert document to PowerPoint
    document_to_ppt(docx_path, ppt_output_path)

    # Download the output PowerPoint file, but only if one was produced
    if os.path.exists(ppt_output_path):
        files.download(ppt_output_path)
    else:
        print("No presentation was generated, so there is nothing to download.")
