PPT Text Extractor

In [None]:
import gradio as gr
from pptx import Presentation


#Function to Extract Text from PPT
def extract_text_from_ppt(file):
    """Extract the text of every slide in an uploaded .pptx file.

    Args:
        file: The upload handed over by the Gradio ``File`` input. Either an
            object exposing the path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.

    Returns:
        str: One "Slide N:" section per slide that contains text, separated
        by blank lines; "Please upload a PPT file." when ``file`` is ``None``;
        or "No text found in the PPT." when no shape carries text.
    """
    if file is None:
        return "Please upload a PPT file."

    # Accept both upload objects (path in .name) and plain path strings.
    ppt_path = getattr(file, "name", file)
    prs = Presentation(ppt_path)  # load
    text_data = []  # one formatted section per slide that has text

    for slide_number, slide in enumerate(prs.slides, start=1):
        slide_text = []
        for shape in slide.shapes:
            # Only some shape types expose .text; skip empty/whitespace-only text.
            if hasattr(shape, "text") and shape.text.strip():
                slide_text.append(shape.text.strip())

        if slide_text:
            text_data.append(f"Slide {slide_number}:\n" + "\n".join(slide_text))

    return "\n\n".join(text_data) if text_data else "No text found in the PPT."

# Create Gradio Interface
# One file-upload input wired to extract_text_from_ppt; the returned string is
# shown in a plain text output.
iface = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(label="Upload PPT File"),
    outputs="text",
    title="PPT Text Extractor",
    description="Upload a .pptx file to extract text from slides."
)

# Start the web UI for this interface.
iface.launch()


In [2]:
from transformers import pipeline

# Hugging Face summarization pipeline using the "t5-small" checkpoint; built
# once here and reused by summarize_presentation().
summarizer = pipeline("summarization", model="t5-small")

def summarize_presentation(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Returns whatever the pipeline call returns, unmodified.
    """
    return summarizer(text)





Device set to use cpu


PPT Image OCR Extractor

In [3]:
import os
import pytesseract
from pptx import Presentation
from PIL import Image
import cv2
import numpy as np
import io
import gradio as gr
import re

# Set the Tesseract OCR path
# Resolution order:
#   1. the TESSERACT_CMD environment variable, when set;
#   2. the default Windows install location, when that file exists;
#   3. otherwise pytesseract's own default is left untouched, so the binary is
#      looked up on PATH.
_DEFAULT_WINDOWS_TESSERACT = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if not _tesseract_cmd and os.path.isfile(_DEFAULT_WINDOWS_TESSERACT):
    _tesseract_cmd = _DEFAULT_WINDOWS_TESSERACT
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

def preprocess_image(image):
    """Denoise and binarise ``image`` so OCR has cleaner input.

    Pipeline: grayscale -> 5x5 Gaussian blur -> adaptive Gaussian threshold
    (block size 11, constant 2) -> morphological opening with a 2x2 kernel.

    Args:
        image: An image object exposing ``convert("L")`` (a PIL image in this
            notebook).

    Returns:
        A new PIL image built from the cleaned pixel array.
    """
    gray = np.array(image.convert("L"))

    # Smooth out pixel noise before thresholding.
    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)

    # Threshold locally so uneven lighting does not wash out the text.
    binary = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # Opening removes isolated specks left by the threshold.
    opening_kernel = np.ones((2, 2), np.uint8)
    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)

    return Image.fromarray(opened)

def clean_text(text):
    """Strip typical OCR noise from ``text``.

    Steps: trim the ends, delete runs of artifact symbols, drop every line in
    which alphanumeric characters are not the majority, then collapse runs of
    two or more whitespace characters into one space.

    Returns:
        str: The cleaned text, or "" when the result is 3 characters or fewer.
    """
    def _mostly_alnum(line):
        # A line survives only if more than half of its characters are alphanumeric.
        alnum_count = sum(1 for ch in line if ch.isalnum())
        return alnum_count > len(line) * 0.5

    stripped = text.strip()
    without_artifacts = re.sub(r"[|*_~—@\[\]]+", "", stripped)

    kept_lines = [ln for ln in without_artifacts.split("\n") if _mostly_alnum(ln)]
    collapsed = re.sub(r"\s{2,}", " ", "\n".join(kept_lines))

    # Anything this short is treated as garbage.
    if len(collapsed) > 3:
        return collapsed
    return ""

def extract_text_from_images(ppt_file):
    """Extract text from the images embedded in PPT slides using OCR.

    Args:
        ppt_file: Path (or file-like object) passed straight to ``Presentation``.

    Returns:
        str: One "Slide N:" section per slide on which OCR produced text, each
        image contributing an "[Image Text]: ..." entry; or
        "No readable text found in images." when no slide yielded any text.
    """
    prs = Presentation(ppt_file)
    extracted_text = []

    for slide_number, slide in enumerate(prs.slides, start=1):
        image_entries = []

        for shape in slide.shapes:
            try:
                blob = shape.image.blob
            except (AttributeError, ValueError):
                # Shape has no usable embedded image (hasattr() would only have
                # masked AttributeError) - nothing to OCR here.
                continue

            try:
                image = Image.open(io.BytesIO(blob))
                image = preprocess_image(image)  # Apply enhanced image processing
            except OSError:
                # Image data Pillow cannot decode: skip this picture instead of
                # aborting the whole presentation.
                continue

            # Extract text using pytesseract with better settings
            text = pytesseract.image_to_string(image, lang="eng", config="--psm 6 --oem 3")
            text = clean_text(text)  # Clean OCR output

            if text:  # Only add meaningful text
                image_entries.append(f"\n[Image Text]: {text}\n")

        # Only report slides that actually produced OCR text; otherwise the
        # "no readable text" fallback below could never be reached.
        if image_entries:
            extracted_text.append(f"Slide {slide_number}:\n" + "".join(image_entries))

    return "\n".join(extracted_text) if extracted_text else "No readable text found in images."

def gradio_interface(file):
    """Gradio callback: OCR the images of the uploaded presentation.

    Returns a prompt string when no file was uploaded, otherwise the result of
    ``extract_text_from_images`` for the upload's ``.name`` path.
    """
    if file is not None:
        return extract_text_from_images(file.name)
    return "Please upload a PPT file."

# File-upload UI around gradio_interface; the OCR result is shown as plain text.
iface = gr.Interface(
    fn=gradio_interface, 
    inputs=gr.File(label="Upload PowerPoint File (.pptx)"),
    outputs="text",
    title="PPT Image OCR Extractor",
    description="Upload a PowerPoint file to extract text from images using OCR."
)

# Launch only when this code runs as the main module.
if __name__ == "__main__":
    iface.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


Direct summarization

In [None]:
import gradio as gr
from pptx import Presentation
from transformers import pipeline

# Load the summarization model: a Hugging Face pipeline using the
# "facebook/bart-large-cnn" checkpoint. Built once at module level and reused
# by summarize_ppt() for every chunk.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to extract text from PPT
def extract_text_from_ppt(file):
    """Collect the text of each slide of the presentation at ``file.name``.

    Returns:
        str: "Slide N:" sections (only for slides that have text) separated by
        blank lines, or "No text found in the PPT." when there is none.
    """
    deck = Presentation(file.name)
    sections = []

    for index, slide in enumerate(deck.slides, start=1):
        # Keep the stripped text of every shape that has non-blank text.
        lines = [
            shape.text.strip()
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        if not lines:
            continue
        sections.append(f"Slide {index}:\n" + "\n".join(lines))

    if not sections:
        return "No text found in the PPT."
    return "\n\n".join(sections)

# Function to chunk long text
def chunk_text(text, max_length=1024):
    """Split ``text`` into word-aligned chunks of at most ``max_length`` characters.

    Words are never split: a single word longer than ``max_length`` becomes a
    chunk of its own (the only case in which a chunk exceeds the limit).
    Whitespace between words is normalised to single spaces.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters (not model tokens).

    Returns:
        list[str]: The chunks in order; an empty list for empty/blank text.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # == len(" ".join(current_chunk)), maintained incrementally

    for word in text.split():
        # Length the chunk would have with this word (plus a separating space).
        projected = current_length + len(word) + (1 if current_chunk else 0)
        if current_chunk and projected > max_length:
            # Flush BEFORE adding the word so the emitted chunk stays within the limit.
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = projected

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize extracted text
def summarize_ppt(file):
    """Gradio callback: extract the slide text of ``file`` and summarize it.

    The extracted text is split with ``chunk_text`` and each chunk is
    summarized separately by the module-level ``summarizer``.

    Args:
        file: The uploaded file from the Gradio ``File`` input, or ``None``
            when nothing was uploaded.

    Returns:
        str: The chunk summaries joined by spaces; a prompt when no file was
        uploaded; or "No text found in the PPT." when the deck has no text.
    """
    if file is None:
        return "Please upload a PPT file."

    extracted_text = extract_text_from_ppt(file)  # Extract text
    # extract_text_from_ppt signals "nothing found" with this exact sentinel string.
    if extracted_text == "No text found in the PPT.":
        return extracted_text

    summaries = [
        summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in chunk_text(extracted_text)  # Chunk text if needed
    ]

    return " ".join(summaries)

# Gradio interface
# File upload in, summarize_ppt's string shown in a textbox labelled "Summarized Text".
iface = gr.Interface(
    fn=summarize_ppt,
    inputs=gr.File(label="Upload PPT File"),
    outputs=gr.Textbox(label="Summarized Text"),
    title="AI-Powered PPT Summarizer",
    description="Upload a .pptx file, and this tool will extract and summarize its content using AI.",
)

# Start the web UI for this interface.
iface.launch()


In [None]:
import gradio as gr
import os
from langdetect import detect
import language_tool_python
import speech_recognition as sr
import re
from pydub import AudioSegment

# Initialize the recognizer
# Shared speech_recognition Recognizer, used by process_audio() for both
# reading the audio file and the Google transcription call.
recognizer = sr.Recognizer()

# Function to convert audio to WAV if it's not already in that format
def convert_to_wav(audio_file):
    """Convert a compressed audio file to WAV; pass other files through.

    Args:
        audio_file: Path of the audio file.

    Returns:
        str: "converted_audio.wav" (written to the current working directory
        and overwritten on every conversion) when the extension is one of the
        compressed formats below; otherwise ``audio_file`` unchanged.
    """
    # '.aac' is the real AAC extension; '.acc' is kept so paths the previous
    # version converted are still converted.
    convertible_extensions = ('.mp3', '.ogg', '.m4a', '.aac', '.acc')
    if audio_file.lower().endswith(convertible_extensions):
        audio = AudioSegment.from_file(audio_file)
        wav_file = "converted_audio.wav"
        audio.export(wav_file, format="wav")
        return wav_file
    return audio_file

# Function to process audio with Google STT
def process_audio(file):
    """Transcribe an audio file and split the transcript into English/Hindi words.

    Steps: convert to WAV if needed, transcribe with Google Speech Recognition,
    classify each word with ``langdetect``, write the two word lists to
    "english_words.txt" / "hindi_words.txt" in the working directory, and
    grammar-check the English words with LanguageTool.

    Args:
        file: Path of the uploaded audio file, or ``None``/"" when nothing was
            uploaded.

    Returns:
        dict: On success the keys "Transcribed Text", "English Sentence",
        "Is Sentence Meaningful?", "English Words File" and "Hindi Words File";
        on failure a single "Error" key with a message.
    """
    if not file:
        return {"Error": "Please upload an audio file."}

    # Convert audio to WAV if necessary
    file = convert_to_wav(file)

    # Step 1: Use Google Speech-to-Text to transcribe
    with sr.AudioFile(file) as source:
        audio = recognizer.record(source)

    try:
        # NOTE(review): "en-IN" is a single language tag; confirm it yields the
        # mixed English/Hindi output the later steps expect.
        transcribed_text = recognizer.recognize_google(audio, language="en-IN")
    except Exception as e:
        return {"Error": f"Error transcribing audio: {str(e)}"}

    # Step 2: Separate English and Hindi words
    words = re.findall(r'\b\w+\b', transcribed_text)
    english_words = []
    hindi_words = []

    for word in words:
        try:
            lang = detect(word)
        except Exception:
            # Detection failed for this token: it goes in neither list.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            continue
        if lang == 'en':
            english_words.append(word)
        elif lang == 'hi':
            hindi_words.append(word)

    # Save to files
    with open("english_words.txt", "w", encoding="utf-8") as f_en:
        f_en.write(" ".join(english_words))

    with open("hindi_words.txt", "w", encoding="utf-8") as f_hi:
        f_hi.write(" ".join(hindi_words))

    # Step 3: Check if English sentence is meaningful
    english_sentence = " ".join(english_words)
    tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = tool.check(english_sentence)
    finally:
        # Release the LanguageTool backend instead of leaking one per request.
        tool.close()
    # An empty sentence has no grammar issues but is not meaningful either.
    is_meaningful = bool(english_words) and len(matches) == 0

    return {
        "Transcribed Text": transcribed_text,
        "English Sentence": english_sentence,
        "Is Sentence Meaningful?": is_meaningful,
        "English Words File": "english_words.txt",
        "Hindi Words File": "hindi_words.txt"
    }

# Gradio UI
# The audio input hands process_audio a file path (type="filepath"); the
# returned dict is rendered as JSON.
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs="json",
    title="Multilingual Audio Processor with Google STT"
)

# Start the web UI for this interface.
interface.launch()


In [None]:
pip install translator

In [None]:
import speech_recognition as sr
import os
import subprocess
import tempfile
import re
from translate import Translator
import langdetect

def translate_hinglish(text, to_lang="en"):
    """Translate ``text`` word by word, leaving unrecognised words untouched.

    Each whitespace-separated word is language-detected. Words detected as
    English or Hindi are translated into ``to_lang`` using a translator whose
    source language is the detected one; words already in ``to_lang``, words in
    any other detected language, and words whose language cannot be detected
    are kept as they are.

    Args:
        text: The text to translate.
        to_lang: Target language code.

    Returns:
        str: The processed words joined by single spaces.
    """
    # One Translator per source language. The source must be passed explicitly:
    # a Translator built with only to_lang does not translate from Hindi.
    translators = {}
    translated_words = []

    for word in text.split():
        try:
            lang = langdetect.detect(word)
        except langdetect.LangDetectException:
            translated_words.append(word)  # Keep unknown words as is.
            continue

        if lang not in ("en", "hi") or lang == to_lang:
            # Other languages (e.g. Hinglish guesses) and words already in the
            # target language need no translation request.
            translated_words.append(word)
            continue

        if lang not in translators:
            translators[lang] = Translator(from_lang=lang, to_lang=to_lang)
        translated_words.append(translators[lang].translate(word))

    return " ".join(translated_words)

def transcribe_and_translate_audio(audio_file_path, output_file_path="transcription.txt"):
    """Transcribe audio, translate it (handling Hinglish), and segregate words.

    Files that are not WAV/AIFF/FLAC are first converted to a temporary WAV
    with ffmpeg. The transcript, its translation and the per-language word
    lists are written to ``output_file_path``. If recognition fails, only the
    failure message is written there.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        output_file_path: Text file that receives the report.

    Returns:
        None. Problems are reported with ``print`` rather than raised.
    """
    recognizer = sr.Recognizer()
    converted_path = None  # temporary WAV produced by ffmpeg; removed in `finally`
    try:
        _, file_extension = os.path.splitext(audio_file_path)
        source_path = audio_file_path

        if file_extension.lower() not in [".wav", ".aiff", ".aiff-c", ".flac"]:
            # The converted file must outlive the conversion step, so reserve a
            # path with mkstemp and delete it ourselves once it has been read.
            fd, converted_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                subprocess.run(
                    # -y: overwrite the (empty) file mkstemp just created.
                    ["ffmpeg", "-y", "-i", audio_file_path, converted_path],
                    check=True,
                    capture_output=True,
                    text=True,
                )
            except FileNotFoundError:
                # Raised when the ffmpeg executable itself cannot be started.
                print("Error during audio conversion: ffmpeg executable not found")
                return
            except subprocess.CalledProcessError as e:
                print(f"Error during audio conversion: {e.stderr}")
                return
            source_path = converted_path

        with sr.AudioFile(source_path) as source:
            audio_data = recognizer.record(source)

        failure_message = None
        try:
            # NOTE(review): "en-IN,hi-IN" is passed as a single language value -
            # confirm the recognizer accepts a comma-separated list.
            transcription = recognizer.recognize_google(audio_data, language="en-IN,hi-IN")
        except sr.UnknownValueError:
            failure_message = "Could not understand audio"
        except sr.RequestError as e:
            failure_message = f"Could not request results from Google Speech Recognition service; {e}"

        if failure_message is not None:
            with open(output_file_path, "w", encoding="utf-8") as file:
                file.write(failure_message)
            return

        language_words = {"en": [], "hi": [], "hinglish": [], "unknown": []}

        for word in transcription.split():
            word = word.lower()
            # Mixed Latin/Devanagari, or letters followed by digits -> "hinglish".
            if (
                re.search(r'[a-zA-Z][\u0900-\u097F]', word)
                or re.search(r'[\u0900-\u097F][a-zA-Z]', word)
                or re.search(r'[a-zA-Z]+\d+[a-zA-Z]*', word)
            ):
                language_words["hinglish"].append(word)
            elif re.search(r'[\u0900-\u097F]+', word):
                language_words["hi"].append(word)
            elif re.search(r'[a-zA-Z]+', word):
                language_words["en"].append(word)
            else:
                language_words["unknown"].append(word)

        try:
            translated_text = translate_hinglish(transcription)
        except Exception as translation_error:
            translated_text = f"Translation failed: {translation_error}"

        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write("Raw Transcription:\n" + transcription + "\n\n")
            file.write("Translated Transcription(English):\n" + translated_text + "\n\n")
            file.write("English: " + " ".join(language_words["en"]) + "\n")
            file.write("Hindi: " + " ".join(language_words["hi"]) + "\n")
            file.write("Hinglish: " + " ".join(language_words["hinglish"]) + "\n")
            file.write("Unknown: " + " ".join(language_words["unknown"]))

        print(f"Transcription, translation, and language segregation saved to {output_file_path}")

    except FileNotFoundError:
        print(f"Error: Audio file not found at {audio_file_path}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        # Always remove the temporary WAV, including on the early returns above.
        if converted_path is not None and os.path.exists(converted_path):
            os.remove(converted_path)

# Example usage:
# Transcribe the sample recording if it is present in the working directory;
# the report goes to the function's default output file, "transcription.txt".
audio_file = "phase 1 ppt.wav"
if os.path.exists(audio_file):
    transcribe_and_translate_audio(audio_file)
else:
    print(f"Audio file '{audio_file}' does not exist.")

In [None]:
import os
from pptx import Presentation
from translate import Translator
import langdetect

def translate_hinglish(text, to_lang="en"):
    """Translate ``text`` word by word, leaving unrecognised words untouched.

    Each whitespace-separated word is language-detected. Words detected as
    English or Hindi are translated into ``to_lang`` using a translator whose
    source language is the detected one; words already in ``to_lang``, words in
    any other detected language, and words whose language cannot be detected
    are kept as they are.

    Args:
        text: The text to translate.
        to_lang: Target language code.

    Returns:
        str: The processed words joined by single spaces.
    """
    # One Translator per source language. The source must be passed explicitly:
    # a Translator built with only to_lang does not translate from Hindi.
    translators = {}
    translated_words = []

    for word in text.split():
        try:
            lang = langdetect.detect(word)
        except langdetect.LangDetectException:
            translated_words.append(word)  # Keep unknown words as is.
            continue

        if lang not in ("en", "hi") or lang == to_lang:
            # Other languages (e.g. Hinglish guesses) and words already in the
            # target language need no translation request.
            translated_words.append(word)
            continue

        if lang not in translators:
            translators[lang] = Translator(from_lang=lang, to_lang=to_lang)
        translated_words.append(translators[lang].translate(word))

    return " ".join(translated_words)

def extract_text_from_ppt(ppt_file_path, output_file_path="extracted_text.txt"):
    """Dump a presentation's text, plus its translation, into a text file.

    The text of every shape that has a ``text`` attribute is stripped and
    joined with spaces per slide; slides are joined with blank lines. The
    combined text is passed to ``translate_hinglish`` and both versions are
    written to ``output_file_path``.

    Args:
        ppt_file_path: Path of the .pptx file.
        output_file_path: Destination text file.

    Returns:
        None. Errors are printed, not raised.
    """
    try:
        # Nothing to do when the presentation is missing.
        if not os.path.exists(ppt_file_path):
            print(f"Error: PPT file not found at {ppt_file_path}")
            return

        deck = Presentation(ppt_file_path)

        # One space-joined string per slide (empty when the slide has no text shapes).
        per_slide = [
            " ".join(
                shape.text.strip()
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
            for slide in deck.slides
        ]
        combined_text = "\n\n".join(per_slide)

        # A translation failure is recorded in the output instead of aborting.
        try:
            translated_text = translate_hinglish(combined_text)
        except Exception as translation_error:
            translated_text = f"Translation failed: {translation_error}"

        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write("Extracted Text from PPT:\n" + combined_text + "\n\n")
            file.write("Translated Text (English):\n" + translated_text)

        print(f"Extracted text and translation saved to {output_file_path}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Example usage:
# Extract the sample deck if it is present in the working directory; the
# result goes to the function's default output file, "extracted_text.txt".
ppt_file = "phase 1 ppt.pptx"
if os.path.exists(ppt_file):
    extract_text_from_ppt(ppt_file)
else:
    print(f"PPT file '{ppt_file}' does not exist.")


In [None]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from collections import Counter
import textwrap  # Importing textwrap module

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')  # Download POS Tagger

# Initialize lemmatizer
# Shared WordNetLemmatizer instance used by extract_keywords().
lemmatizer = WordNetLemmatizer()

def load_file(filepath):
    """Read the UTF-8 text file at ``filepath`` and return it lower-cased."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.lower()

def extract_keywords(text, max_keywords=None):
    """Return the most frequent lemmatized keywords of ``text`` as a set.

    Tokens that are not alphanumeric or that are English stop words are
    dropped before lemmatization.

    Args:
        text: Text to analyse.
        max_keywords: Keep only this many most-frequent keywords; ``None``
            keeps all of them.

    Returns:
        set[str]: The selected keywords (frequency order is not preserved).
    """
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Count each surviving token under its lemma.
    frequency = Counter()
    for token in tokens:
        if token.isalnum() and token not in stop_words:
            frequency[lemmatizer.lemmatize(token)] += 1

    return {keyword for keyword, _count in frequency.most_common(max_keywords)}

def get_synonyms(word):
    """Return the names of all lemmas in every WordNet synset of ``word``."""
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

def compare_keywords(keywords1, keywords2):
    """Match reference keywords against candidate keywords, exactly or by synonym.

    A reference keyword counts as matched when the same word, or one of its
    ``get_synonyms`` results, appears among the candidates. Candidate words
    that served as such a match are not reported as extra.

    Args:
        keywords1: Reference keywords (e.g. from the presentation).
        keywords2: Candidate keywords (e.g. from the speech).

    Returns:
        tuple: ``(matches, only_in_1, only_in_2, accuracy)`` where ``matches``
        and ``only_in_1`` are subsets of ``keywords1``, ``only_in_2`` is the
        subset of ``keywords2`` that matched nothing, and ``accuracy`` is the
        percentage of distinct reference keywords matched (0 when there are
        none).
    """
    reference = set(keywords1)
    candidates = set(keywords2)

    # Exact matches first.
    matches = reference & candidates
    matched_candidates = set(matches)

    # Match words semantically using synonyms. The lookup is done once per
    # reference word rather than once per (reference, candidate) pair.
    for word1 in reference:
        synonym_hits = candidates.intersection(get_synonyms(word1)) - {word1}
        if synonym_hits:
            matches.add(word1)
            # Remember the candidate-side words too, so they are not "extra".
            matched_candidates |= synonym_hits

    only_in_1 = reference - matches
    only_in_2 = candidates - matched_candidates

    accuracy = (len(matches) / len(reference)) * 100 if reference else 0
    return matches, only_in_1, only_in_2, accuracy

def plot_graph(match_count, mismatch_count):
    """Show a two-bar chart of matching (green) vs. missing (red) keyword counts."""
    bar_specs = [
        ('Matching Keywords', match_count, 'green'),
        ('Missing Keywords', mismatch_count, 'red'),
    ]
    labels = [label for label, _, _ in bar_specs]
    values = [value for _, value, _ in bar_specs]
    colors = [color for _, _, color in bar_specs]

    plt.figure(figsize=(7,5))
    plt.bar(labels, values, color=colors)
    plt.title("Keyword Match:presentation vs audio")
    plt.ylabel("Number of Keywords")
    plt.show()

# Inputs: "extracted_text.txt" and "transcription.txt" are the default output
# files of the PPT-extraction and audio-transcription cells above.
# NOTE(review): those files also contain section headers and translated
# copies of the text, all of which is fed to keyword extraction here.
ppt_text = load_file("extracted_text.txt")
speech_text = load_file("transcription.txt")
# Cap on the number of most-frequent keywords kept per source.
max_keywords = 50
ppt_keywords = extract_keywords(ppt_text, max_keywords=max_keywords)
speech_keywords = extract_keywords(speech_text, max_keywords=max_keywords)

def generate_feedback(matches, missing_in_speech, extra_in_speech, accuracy):
    """Print a short textual assessment of how well the speech covered the PPT.

    Args:
        matches: Matched keywords (not used by the report itself).
        missing_in_speech: Presentation keywords absent from the speech.
        extra_in_speech: Speech keywords absent from the presentation.
        accuracy: Coverage percentage; selects the headline message.

    Returns:
        None. The feedback is printed under a "FEEDBACK:" heading.
    """
    def _preview(keywords):
        # First ten keywords alphabetically, with an ellipsis when more exist.
        shown = sorted(keywords)[:10]
        suffix = "..." if len(keywords) > 10 else ""
        return ", ".join(shown) + suffix

    # Headline by accuracy band, highest band first.
    bands = (
        (90, "Excellent! Your speech closely matched the presentation content."),
        (75, "Good job! Most of the key points from the presentation were covered."),
        (50, "Fair attempt. You covered some key topics, but quite a few were missed."),
    )
    headline = "Needs Improvement. A large portion of the presentation content was not covered in the speech."
    for threshold, message in bands:
        if accuracy >= threshold:
            headline = message
            break

    report = [headline]

    if missing_in_speech:
        report.append(f"You missed {len(missing_in_speech)} important keyword(s) from the presentation.")
        report.append("Missed keywords: " + _preview(missing_in_speech))

    if extra_in_speech:
        report.append(f"You included {len(extra_in_speech)} extra keyword(s) that were not part of the presentation.")
        report.append("Extra keywords: " + _preview(extra_in_speech))

    print("\nFEEDBACK:")
    for entry in report:
        print(entry)
# Compare
# Must run before anything below: the feedback, the printed report and the
# graph all read the four names bound here.
matches, missing_in_speech, extra_in_speech, accuracy = compare_keywords(ppt_keywords, speech_keywords)

# Show feedback
generate_feedback(matches, missing_in_speech, extra_in_speech, accuracy)

# Output results with textwrap for better formatting
print("\n Keywords from PPT:", len(ppt_keywords))
print("Keywords from Transcription:", len(speech_keywords))

# Wrap the keywords and print them
print("\nKeywords from PPT:")
print("\n".join(textwrap.wrap(", ".join(sorted(ppt_keywords)), width=80)))

print("\nKeywords from Transcription:")
print("\n".join(textwrap.wrap(", ".join(sorted(speech_keywords)), width=80)))

print(f"\n✅ Matching Keywords ({len(matches)}):")
print("\n".join(textwrap.wrap(", ".join(sorted(matches)), width=80)))

print(f"❌ Missing from Speech ({len(missing_in_speech)}):")
print("\n".join(textwrap.wrap(", ".join(sorted(missing_in_speech)), width=80)))

print(f"⚠️ Extra in Speech ({len(extra_in_speech)}):")
print("\n".join(textwrap.wrap(", ".join(sorted(extra_in_speech)), width=80)))

print(f"\n Accuracy of Coverage: {accuracy:.2f}%")

# Show graph
plot_graph(len(matches), len(missing_in_speech))



In [None]:
import os
import nltk
import string
from pptx import Presentation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt

# Download NLTK resources
nltk.download("punkt")
# Recent NLTK releases load the tokenizer data from "punkt_tab" instead of
# "punkt"; fetch both so word_tokenize works regardless of the installed version.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Shared normalisers used by preprocess_text().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# ✅ Step 1: Extract full text from PowerPoint
def extract_text_from_ppt(ppt_path):
    """Return all shape text and table-cell text of a presentation as one string.

    Args:
        ppt_path: Path of the .pptx file.

    Returns:
        str: The text pieces in slide/shape order, separated by single spaces,
        with leading and trailing whitespace removed.
    """
    prs = Presentation(ppt_path)
    parts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                parts.append(shape.text)
            # Check has_table rather than probing .table with hasattr(): hasattr
            # only masks AttributeError, so any other exception raised by the
            # property (e.g. for a frame holding no table) would abort extraction.
            if getattr(shape, "has_table", False):
                for row in shape.table.rows:
                    for cell in row.cells:
                        parts.append(cell.text)
    return " ".join(parts).strip()

# ✅ Step 2: Preprocess text into keywords (clean, lemmatize, stem)
def preprocess_text(text):
    """Turn ``text`` into a set of normalised keywords.

    The text is lower-cased and tokenized; stop words, punctuation and
    non-alphanumeric tokens are dropped; each remaining token is lemmatized
    and then stemmed.

    Returns:
        set[str]: The distinct stemmed keywords.
    """
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text.lower())

    def _is_keyword(token):
        return (
            token not in stop_words
            and token not in string.punctuation
            and token.isalnum()
        )

    return {
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if _is_keyword(token)
    }

# ✅ Step 3: Fuzzy match keywords with score ≥ 75
def fuzzy_match(ppt_keywords, speech_keywords, threshold=75):
    """Fuzzy-match presentation keywords against speech keywords.

    A presentation keyword is matched when ``fuzz.ratio`` with at least one
    speech keyword reaches ``threshold``. Every speech keyword that reaches
    the threshold for some presentation keyword is removed from ``extra``, so
    the result does not depend on set iteration order.

    Args:
        ppt_keywords: Keywords from the presentation.
        speech_keywords: Keywords from the speech.
        threshold: Minimum ``fuzz.ratio`` score (0-100) that counts as a match.

    Returns:
        tuple[set, set, set]: ``(matched, missed, extra)`` - matched and missed
        presentation keywords, and speech keywords that matched nothing.
    """
    speech = set(speech_keywords)
    matched = set()
    missed = set(ppt_keywords)
    extra = set(speech)

    for ppt_word in ppt_keywords:
        # Collect ALL speech words close enough to this keyword; stopping at
        # the first hit could leave an exact match stranded in `extra`.
        hits = {word for word in speech if fuzz.ratio(ppt_word, word) >= threshold}
        if hits:
            matched.add(ppt_word)
            missed.discard(ppt_word)
            extra -= hits

    return matched, missed, extra

# ✅ Step 4: Accuracy calculation
def calculate_accuracy(matched, total_keywords):
    """Return the percentage of ``total_keywords`` present in ``matched``.

    Returns 0 when ``total_keywords`` is empty.
    """
    total = len(total_keywords)
    return (len(matched) / total) * 100 if total else 0

# ✅ Step 5: Plot result
def plot_results(matched, missed, extra):
    """Show a bar chart of matched / missed / extra keyword counts.

    Each bar is annotated with its count just above the bar.
    """
    groups = (
        ('Matched', matched, 'green'),
        ('Missed', missed, 'red'),
        ('Extra', extra, 'blue'),
    )
    categories = [label for label, _, _ in groups]
    values = [len(members) for _, members, _ in groups]
    colors = [color for _, _, color in groups]

    plt.figure(figsize=(8, 5))
    bars = plt.bar(categories, values, color=colors)
    plt.title("PPT vs. Speech Keyword Comparison")
    plt.ylabel("Keyword Count")

    # Write each count centred above its bar.
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2
        plt.text(x_center, height + 1, int(height), ha='center', fontsize=12)

    plt.tight_layout()
    plt.show()

# ✅ Step 6: Full run function
def run_keyword_match(ppt_path, speech_path):
    """Compare a presentation's keywords with a transcript's and report the result.

    Prints the keyword counts and coverage accuracy, then shows the bar chart.
    If either file is missing, a message is printed and nothing else happens.

    Args:
        ppt_path: Path of the .pptx file.
        speech_path: Path of the UTF-8 transcript text file.
    """
    if not (os.path.exists(ppt_path) and os.path.exists(speech_path)):
        print("❌ PPT or transcription file not found.")
        return

    ppt_text = extract_text_from_ppt(ppt_path)
    with open(speech_path, "r", encoding="utf-8") as transcript:
        speech_text = transcript.read()

    ppt_keywords = preprocess_text(ppt_text)
    speech_keywords = preprocess_text(speech_text)

    matched, missed, extra = fuzzy_match(ppt_keywords, speech_keywords)
    accuracy = calculate_accuracy(matched, ppt_keywords)

    summary_lines = (
        f"\n✅ Total PPT Keywords: {len(ppt_keywords)}",
        f"✅ Matched Keywords: {len(matched)}",
        f"❌ Missed from PPT: {len(missed)}",
        f"🗣️ Extra from Speech: {len(extra)}",
        f"📈 Accuracy: {accuracy:.2f}%",
    )
    for line in summary_lines:
        print(line)

    plot_results(matched, missed, extra)

# 📁 File paths
# Both paths are relative to the working directory; run_keyword_match prints a
# message and returns if either file is missing.
ppt_file = "phase 1 ppt.pptx"
speech_file = "transcription.txt"

run_keyword_match(ppt_file, speech_file)
