In [None]:
# Step 1: Install dependencies
!pip install gradio openai-whisper torch pydub
!apt-get install ffmpeg -y

In [None]:
# Final version
import json
import os
import tempfile
from dataclasses import asdict
from difflib import SequenceMatcher

import gradio as gr
import torch
import whisper
from pydub import AudioSegment

# Step 2: Load the Whisper model
# Bind `model` before attempting the load so that a failed load leaves it as
# None instead of leaving the name undefined for the code that references it.
model = None
try:
    model = whisper.load_model("base")
    print("Whisper model loaded successfully.")
except Exception as e:
    # Best-effort: report the problem and keep the notebook cell running.
    print(f"Error loading Whisper model: {e}")

# Step 3: Audio transcription function
def transcribe_audio(file_path):
    """
    Transcribe an audio file with the module-level Whisper ``model``.

    The input is decoded with pydub, resampled to 16 kHz and exported as a
    WAV file inside a temporary directory; that WAV file is what is passed to
    ``model.transcribe``. The temporary directory is removed afterwards, so no
    converted copy is left behind next to the input file.

    Args:
        file_path: Path of the audio file to transcribe. ``None`` or an empty
            string (nothing was provided) is reported as missing input.

    Returns:
        The transcribed text on success. Failures are never raised; a
        human-readable message string is returned instead
        ("No audio file provided.", "File not found: ...", or
        "Error processing audio: ...").
    """
    try:
        if not file_path:
            return "No audio file provided."

        # Ensure file exists
        if not os.path.exists(file_path):
            return f"File not found: {file_path}"

        # Decode whatever format was given and resample to 16 kHz
        audio = AudioSegment.from_file(file_path).set_frame_rate(16000)

        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_file_path = os.path.join(tmp_dir, "converted.wav")
            # export() hands back the open output file; close it right away so
            # the temporary directory can be cleaned up on every platform.
            audio.export(wav_file_path, format="wav").close()

            # Transcribe while the temporary WAV file still exists
            result = model.transcribe(wav_file_path)
        return result["text"]
    except Exception as e:
        return f"Error processing audio: {e}"

# Step 4: Feedback collection function
def collect_feedback(transcription, human_transcription):
    """
    Describe the edits that turn the Whisper transcription into the human one.

    The two strings are compared character by character with
    ``difflib.SequenceMatcher``. Every non-matching opcode (replace, delete,
    insert) becomes one line of text; matching stretches are skipped.

    Returns:
        The description lines joined with newlines, or an empty string when
        the two inputs are identical.
    """
    # One formatter per reported opcode; `old` is the slice of the Whisper
    # text and `new` is the slice of the human text for that opcode.
    describe = {
        "replace": lambda old, new: f"Replaced '{old}' with '{new}'",
        "delete": lambda old, new: f"Deleted '{old}'",
        "insert": lambda old, new: f"Inserted '{new}'",
    }
    opcodes = SequenceMatcher(None, transcription, human_transcription).get_opcodes()
    lines = [
        describe[op](transcription[a_lo:a_hi], human_transcription[b_lo:b_hi])
        for op, a_lo, a_hi, b_lo, b_hi in opcodes
        if op in describe
    ]
    return "\n".join(lines)

# Step 5: Reward calculation function
def calculate_reward(transcription, human_transcription):
    """
    Score the Whisper transcription by word overlap with the human reference.

    Both texts are split on whitespace and reduced to their distinct words.
    The score is the number of words present in both, divided by the number
    of distinct reference words; the divisor falls back to 1 when the
    reference contains no words, which yields 0.
    """
    reference_vocab = set(human_transcription.split())
    overlap = reference_vocab.intersection(transcription.split())
    denominator = len(reference_vocab) or 1
    return len(overlap) / denominator

# Step 6: Model saving function
def save_model(output_dir):
    """
    Save the module-level Whisper ``model`` to ``output_dir``.

    Two files are written: ``pytorch_model.bin`` holding the model's
    ``state_dict`` and ``config.json`` holding the model dimensions as JSON.
    No fine-tuning takes place here; this only persists the currently loaded
    model.

    Args:
        output_dir: Directory to write into; created if it does not exist.

    Errors raised while writing the files are printed, not re-raised.
    Failure to create the directory itself still propagates to the caller.
    """
    os.makedirs(output_dir, exist_ok=True)
    try:
        torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
        # Models returned by whisper.load_model keep their hyperparameters in
        # the `dims` dataclass; they have no `config` attribute.
        with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
            json.dump(asdict(model.dims), f, indent=2)
        print(f"Model state and configuration saved to {output_dir}")
    except Exception as e:
        print(f"Error saving model: {e}")

# Step 7: Gradio app
def main(output_dir="/kaggle/working/updated_whisper"):
    """
    Build and launch the Gradio UI for the transcribe-and-feedback workflow.

    The UI has two actions:
      * "Transcribe Audio" runs ``transcribe_audio`` on the uploaded file,
        shows the result and keeps it in a ``gr.State`` for the next step.
      * "Submit Feedback" compares the stored transcription with the human
        one (``collect_feedback``, ``calculate_reward``) and then calls
        ``save_model``.

    Args:
        output_dir: Directory handed to ``save_model`` on every feedback
            submission. Defaults to the Kaggle working directory.
    """
    with gr.Blocks() as app:
        gr.Markdown("### Whisper RLHF Workflow")

        # Input fields
        audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        transcription_output = gr.Textbox(label="Whisper Transcription", interactive=False)
        human_transcription = gr.Textbox(label="Human Transcription")
        feedback_output = gr.Textbox(label="Feedback", interactive=False)
        reward_output = gr.Number(label="Reward", interactive=False)
        status_output = gr.Textbox(label="Status", interactive=False)

        # State variables: holds the latest transcription (None until the
        # first transcription has run)
        current_transcription = gr.State()

        # Step 1: Transcribe audio
        def transcribe(file_path):
            transcription = transcribe_audio(file_path)
            # Same value twice: once for the visible textbox, once for the state
            return transcription, transcription

        # Step 2: Process feedback and calculate reward
        def process_feedback(transcription, human_transcription):
            # Nothing has been transcribed yet, so there is nothing to compare
            # against; the comparison helpers need strings.
            if transcription is None:
                return "", None, "Please transcribe an audio file before submitting feedback."

            # Treat a missing human transcription as empty text
            human_transcription = human_transcription or ""

            feedback = collect_feedback(transcription, human_transcription)
            reward = calculate_reward(transcription, human_transcription)
            save_model(output_dir)  # Save updated state
            # NOTE: save_model reports write errors only via print, so this
            # status text does not reflect whether the save succeeded.
            return feedback, reward, "Model state saved successfully."

        # Define button interactions
        transcribe_button = gr.Button("Transcribe Audio")
        transcribe_button.click(
            transcribe,
            inputs=[audio_input],
            outputs=[transcription_output, current_transcription],
        )

        feedback_button = gr.Button("Submit Feedback")
        feedback_button.click(
            process_feedback,
            inputs=[current_transcription, human_transcription],
            outputs=[feedback_output, reward_output, status_output],
        )

    app.launch()

# Build the UI and launch the app as soon as this cell/script is executed.
main()
