In [None]:
# Step 0: Install required packages if you run this in Colab or your local environment
# !apt-get install -y poppler-utils
# !pip install pdf2image google-cloud-vision

from google.colab import files
import os
from pdf2image import convert_from_path
from google.cloud import vision

# Step 1: Upload your PDF file
print("Upload your PDF file:")
uploaded_pdf = files.upload()
if not uploaded_pdf:
    # Fail with a clear message instead of an IndexError on the lookup below
    # (assumes an empty mapping is what a cancelled upload yields -- TODO confirm).
    raise ValueError("No PDF file was uploaded.")
pdf_path = list(uploaded_pdf.keys())[0]  # only the first uploaded file is used

# Step 2: Upload your Google Cloud JSON key
print("Upload your Google Cloud JSON key file:")
uploaded_json = files.upload()
if not uploaded_json:
    raise ValueError("No Google Cloud JSON key file was uploaded.")
json_path = list(uploaded_json.keys())[0]

# Step 3: Set environment variable for authentication
# An absolute path keeps the key resolvable even if the working directory
# changes before a Google Cloud client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(json_path)

# Step 4: Convert PDF pages to images and extract text using Vision API
def extract_text_from_pdf(pdf_path, dpi=300):
    """Run Vision document-text detection on every page of a PDF.

    Each page is rendered to an image with ``convert_from_path``, encoded as
    JPEG in memory, and sent to ``document_text_detection``.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Rendering resolution passed to ``convert_from_path``. Defaults
            to 300, the value that used to be hard-coded.

    Returns:
        A list with one string per page, in page order, holding the text of
        the response's ``full_text_annotation``.

    Raises:
        RuntimeError: If a Vision response carries an error message.
    """
    import io  # local import: nothing else in this cell needs it

    # Convert PDF to images
    pages = convert_from_path(pdf_path, dpi=dpi)

    # Initialize Google Vision client
    client = vision.ImageAnnotatorClient()

    slide_texts = []
    for page in pages:
        # Encode in memory so no slide_N.jpg files are left behind in the
        # working directory after the run.
        with io.BytesIO() as buffer:
            page.save(buffer, 'JPEG')
            content = buffer.getvalue()

        image = vision.Image(content=content)
        response = client.document_text_detection(image=image)

        if response.error.message:
            # RuntimeError is still caught by callers using `except Exception`.
            raise RuntimeError(f"API Error: {response.error.message}")

        slide_texts.append(response.full_text_annotation.text)

    return slide_texts

# Step 5: Run extraction and print results
# Extract the text of every page, then print each page under its own banner.
slides_text = extract_text_from_pdf(pdf_path)

separator = "\n" + "="*60 + "\n"
for slide_number, slide_text in enumerate(slides_text, start=1):
    print(f"--- Slide {slide_number} Text ---")
    print(slide_text)
    print(separator)


In [None]:
from google.colab import files
import os
from google.cloud import speech_v1p1beta1 as speech
from pydub import AudioSegment

# Step 1: Upload your MP3 file
print("Upload your audio file (MP3 format):")
uploaded_audio = files.upload()
if not uploaded_audio:
    # Fail with a clear message instead of an IndexError on the lookup below
    # (assumes an empty mapping is what a cancelled upload yields -- TODO confirm).
    raise ValueError("No audio file was uploaded.")
mp3_path = list(uploaded_audio.keys())[0]  # only the first uploaded file is used

# Step 2: Upload your Google Cloud JSON key
print("Upload your Google Cloud JSON key file:")
uploaded_json = files.upload()
if not uploaded_json:
    raise ValueError("No Google Cloud JSON key file was uploaded.")
json_path = list(uploaded_json.keys())[0]

# Step 3: Set environment variable
# An absolute path keeps the key resolvable even if the working directory
# changes before a Google Cloud client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(json_path)

# Step 4: Convert MP3 to WAV (mono, 16kHz)
from pydub import AudioSegment

def convert_mp3_to_wav(mp3_path, wav_path="converted_audio.wav"):
    """Re-encode an MP3 file as a mono, 16 kHz WAV file.

    Args:
        mp3_path: Path of the MP3 file to load.
        wav_path: Destination path for the exported WAV file.

    Returns:
        ``wav_path``, unchanged, once the export call has returned.
    """
    mono_16k = (
        AudioSegment.from_mp3(mp3_path)
        .set_channels(1)        # Force mono
        .set_frame_rate(16000)  # Force 16kHz sample rate
    )
    mono_16k.export(wav_path, format="wav")
    return wav_path

# Convert the uploaded MP3 once; the transcription step reads the resulting WAV.
wav_path = convert_mp3_to_wav(mp3_path=mp3_path)
print(f"Audio converted to WAV: {wav_path}")

# Step 5: Transcribe with speaker diarization
def transcribe_audio_with_speaker_diarization(audio_path, speaker_count=2, language_code="en-US"):
    """Transcribe a 16 kHz LINEAR16 audio file and label each speaker turn.

    The file is read fully into memory and sent inline through the
    synchronous ``recognize`` call.
    NOTE(review): the synchronous API is documented as being for short audio
    only -- confirm the clip length is within its limits.

    Args:
        audio_path: Path of the audio file to send (the config below declares
            LINEAR16 at 16000 Hz).
        speaker_count: Value passed as ``diarization_speaker_count``.
            Defaults to 2, the value that used to be hard-coded.
        language_code: Language passed to the recogniser. Defaults to
            "en-US", the value that used to be hard-coded.

    Returns:
        A string with one ``[Speaker N]: ...`` paragraph per consecutive run
        of words from the same speaker, in spoken order, or
        "No transcription result found." when the response has no usable
        result.
    """
    from itertools import groupby  # local import: only needed for turn grouping

    client = speech.SpeechClient()

    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    # NOTE(review): newer client versions may prefer a dedicated diarization
    # config object over these two flags -- confirm against the installed SDK.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=speaker_count,
        enable_automatic_punctuation=True,
        model="video"
    )

    print("Transcribing audio... This may take a moment.")
    response = client.recognize(config=config, audio=audio)

    if not response.results:
        return "No transcription result found."

    # The speaker-tagged word list is read from the final result only.
    final_result = response.results[-1]
    if not final_result.alternatives:
        return "No transcription result found."
    words_info = final_result.alternatives[0].words

    # Group *consecutive* words by speaker so the conversation keeps its order.
    # Collecting all words per speaker would merge every turn of a speaker
    # into a single paragraph and lose who answered whom.
    turns = []
    for speaker_tag, turn_words in groupby(words_info, key=lambda info: info.speaker_tag):
        speaker_text = " ".join(info.word for info in turn_words)
        turns.append(f"[Speaker {speaker_tag}]: {speaker_text}\n\n")

    return "".join(turns)

# Transcribe the converted WAV and print the result under a banner line.
transcript = transcribe_audio_with_speaker_diarization(audio_path=wav_path)
print("\n--- Transcription Result ---\n", transcript, sep="\n")


In [18]:
import gradio as gr
import os
import tempfile
import subprocess
from google.cloud import storage, speech
from pydub import AudioSegment
import fitz  # PyMuPDF for PDF text extraction

# Google Cloud setup
# NOTE(review): this assignment unconditionally replaces any credentials path
# already present in the environment, and the path is relative -- confirm that
# "service_account.json" exists in the working directory before running.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service_account.json"
# Bucket in which the transcription functions below stage audio under temp_audio/.
GCS_BUCKET = "my-meeting"

# Module-level clients shared by the functions defined later in this cell.
# They are created after the environment variable above has been set.
storage_client = storage.Client()
speech_client = speech.SpeechClient()

# ======================== PDF Processing ========================
def process_pdf(file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        file: Either a path string or an object whose ``name`` attribute is
            the path of the PDF. ``None`` (no file selected) is accepted.

    Returns:
        The page texts joined with newlines; "No text detected in PDF." when
        the joined text is blank; "No PDF file provided." when ``file`` is
        ``None``; or a "Failed to process PDF: ..." message if anything
        raises while reading. The function itself never raises.
    """
    if file is None:
        # Report a missing upload plainly instead of surfacing an
        # AttributeError message about NoneType.
        return "No PDF file provided."
    try:
        # Accept both upload objects (path in .name) and plain path strings.
        pdf_path = getattr(file, "name", file)
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        return text if text.strip() else "No text detected in PDF."
    except Exception as e:
        return f"Failed to process PDF: {str(e)}"

# ======================== Audio Processing ========================
def preprocess_audio_to_wav(audio_file):
    """Convert an uploaded audio file to a temporary mono, 16 kHz WAV file.

    Args:
        audio_file: Either a path string or an object whose ``name``
            attribute is the path of the audio file.

    Returns:
        Path of the newly written temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        Whatever loading or exporting the audio raises; the temporary file is
        removed first so a failed conversion leaves nothing behind.
    """
    # Accept both upload objects (path in .name) and plain path strings.
    audio_path = getattr(audio_file, "name", audio_file)

    # mkstemp creates the file atomically; tempfile.mktemp only returns a
    # name, which is deprecated because another process can claim it first.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # the exporter reopens the path itself

    try:
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        audio.export(wav_path, format="wav")
    except Exception:
        os.remove(wav_path)
        raise

    return wav_path

def _remove_quietly(path):
    """Delete a local temporary file; a missing or unset path is not an error."""
    if path:
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask the
            # transcription result or the original error.
            pass


def _transcribe_wav_via_gcs(wav_path, language_code="en-GB", timeout=600):
    """Stage a 16 kHz LINEAR16 WAV in GCS, transcribe it, and delete the blob.

    Args:
        wav_path: Local path of the WAV file to upload.
        language_code: Language passed to the recogniser.
        timeout: Seconds to wait for the long-running operation.

    Returns:
        The first alternative of every result, each followed by a newline.

    Raises:
        Whatever the upload, the recognise call, or waiting on the operation
        raises. The staged blob is deleted in every case once uploaded.
    """
    bucket = storage_client.bucket(GCS_BUCKET)
    blob_name = f"temp_audio/{os.path.basename(wav_path)}"
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(wav_path)
    try:
        gcs_uri = f"gs://{GCS_BUCKET}/{blob_name}"

        audio = speech.RecognitionAudio(uri=gcs_uri)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code=language_code,
            enable_automatic_punctuation=True
        )

        operation = speech_client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=timeout)
    finally:
        # Always remove the staged audio, including when recognition fails or
        # times out; otherwise uploads pile up in the bucket.
        try:
            blob.delete()
        except Exception as cleanup_error:
            # Do not let a cleanup failure hide the transcript or the real error.
            print(f"Warning: could not delete gs://{GCS_BUCKET}/{blob_name}: {cleanup_error}")

    # Skip results without alternatives instead of raising IndexError.
    return "".join(
        result.alternatives[0].transcript + "\n"
        for result in response.results
        if result.alternatives
    )


def transcribe_audio_file(audio_file):
    """Transcribe an uploaded audio file with the long-running Speech API.

    Args:
        audio_file: Upload handed over by the UI; passed unchanged to
            ``preprocess_audio_to_wav``. ``None`` (no file selected) is accepted.

    Returns:
        The transcript; "No speech detected." when it is blank;
        "No audio file provided." when ``audio_file`` is ``None``; or a
        "Failed to transcribe audio: ..." message if anything raises.
        The function itself never raises.
    """
    if audio_file is None:
        return "No audio file provided."

    wav_path = None
    try:
        wav_path = preprocess_audio_to_wav(audio_file)
        transcript = _transcribe_wav_via_gcs(wav_path)
        return transcript if transcript.strip() else "No speech detected."
    except Exception as e:
        return f"Failed to transcribe audio: {str(e)}"
    finally:
        # The converted WAV is only an intermediate; remove it in every case.
        _remove_quietly(wav_path)

# ======================== Video Processing ========================
def extract_audio_with_ffmpeg(video_path):
    """Extract the audio track of a video into a temporary mono, 16 kHz WAV.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly written temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # mkstemp creates the file atomically; ffmpeg's -y flag overwrites it.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    # Pass arguments as a list, without a shell: the video name comes from a
    # user upload, and quoting it into a shell string would let a name that
    # contains a quote break or extend the command.
    cmd = ["ffmpeg", "-i", video_path, "-ac", "1", "-ar", "16000", "-vn", "-y", wav_path]
    try:
        subprocess.run(cmd, check=True)
    except Exception:
        _remove_quietly(wav_path)
        raise
    return wav_path

def transcribe_video_file(video_file):
    """Transcribe the audio track of an uploaded video file.

    Args:
        video_file: Either a path string or an object whose ``name``
            attribute is the path of the video. ``None`` is accepted.

    Returns:
        The transcript; "No speech detected." when it is blank;
        "No video file provided." when ``video_file`` is ``None``; or a
        "Video transcription failed: ..." message if anything raises.
        The function itself never raises.
    """
    if video_file is None:
        return "No video file provided."

    wav_path = None
    try:
        # Accept both upload objects (path in .name) and plain path strings.
        tmp_video_path = getattr(video_file, "name", video_file)

        wav_path = extract_audio_with_ffmpeg(tmp_video_path)
        transcript = _transcribe_wav_via_gcs(wav_path)
        return transcript if transcript.strip() else "No speech detected."
    except Exception as e:
        return f"Video transcription failed: {str(e)}"
    finally:
        # The extracted WAV is only an intermediate; remove it in every case.
        _remove_quietly(wav_path)


In [None]:
from vertexai.generative_models import GenerativeModel
from vertexai import init

# Initialise the Vertex AI SDK with the project and region used by this notebook.
init(
    location="us-central1",
    project="ai-memory-rebuilder",
)

def summarise_text(text, model_name="gemini-1.0-pro"):
    """Ask a Gemini model for a British-English summary of ``text``.

    Args:
        text: The text to summarise. ``None``, non-string values, and blank
            strings are treated as "nothing to summarise".
        model_name: Name passed to ``GenerativeModel``. Defaults to
            "gemini-1.0-pro", the value that used to be hard-coded.
            NOTE(review): confirm this model is still served in the
            configured project and region.

    Returns:
        The stripped response text; "No text provided for summarisation."
        when there is nothing to summarise; or a "Summarisation failed: ..."
        message if the model call raises. The function itself never raises.
    """
    # Check the input before touching the SDK, so an empty box never costs an
    # API call and None is not reported as a summarisation failure.
    if not isinstance(text, str) or not text.strip():
        return "No text provided for summarisation."

    try:
        model = GenerativeModel(model_name)

        # Request summary
        response = model.generate_content(
            f"Please summarise the following text in clear British English:\n\n{text}"
        )

        return response.text.strip()
    except Exception as e:
        return f"Summarisation failed: {str(e)}"





In [None]:
import gradio as gr

# Build the UI: three tabs (PDF, audio, video) with the same structure.
# Each tab declares, in this order, a file input, a textbox for the extracted
# or transcribed text, a textbox for the summary, and two buttons. The first
# button runs the tab's processing function on the uploaded file; the second
# feeds the text box's current content to summarise_text.
with gr.Blocks() as demo:
    gr.Markdown("## Multimodal Transcriber with Gemini Summarisation")

    # --- PDF tab: process_pdf fills the "Extracted Text" box ---
    with gr.Tab("PDF Transcription"):
        pdf_input = gr.File(file_types=[".pdf"])
        pdf_output = gr.Textbox(label="Extracted Text", lines=20)
        pdf_summary = gr.Textbox(label="Summary", lines=10)
        pdf_button = gr.Button("Transcribe PDF")
        pdf_summarise_button = gr.Button("Summarise")

        pdf_button.click(fn=process_pdf, inputs=pdf_input, outputs=pdf_output)
        # Summarises whatever is currently in the output box.
        pdf_summarise_button.click(fn=summarise_text, inputs=pdf_output, outputs=pdf_summary)

    # --- Audio tab: transcribe_audio_file fills the "Transcribed Text" box ---
    with gr.Tab("Audio Transcription"):
        audio_input = gr.File(file_types=[".mp3", ".wav"])
        audio_output = gr.Textbox(label="Transcribed Text", lines=20)
        audio_summary = gr.Textbox(label="Summary", lines=10)
        audio_button = gr.Button("Transcribe Audio")
        audio_summarise_button = gr.Button("Summarise")

        audio_button.click(fn=transcribe_audio_file, inputs=audio_input, outputs=audio_output)
        audio_summarise_button.click(fn=summarise_text, inputs=audio_output, outputs=audio_summary)

    # --- Video tab: transcribe_video_file fills the "Transcribed Text" box ---
    with gr.Tab("Video Transcription"):
        video_input = gr.File(file_types=[".mp4", ".mov"])
        video_output = gr.Textbox(label="Transcribed Text", lines=20)
        video_summary = gr.Textbox(label="Summary", lines=10)
        video_button = gr.Button("Transcribe Video")
        video_summarise_button = gr.Button("Summarise")

        video_button.click(fn=transcribe_video_file, inputs=video_input, outputs=video_output)
        video_summarise_button.click(fn=summarise_text, inputs=video_output, outputs=video_summary)

# Start the app only when this code runs as the main module.
if __name__ == "__main__":
    demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2a4fc6a729ea522bce.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
