In [None]:
pip install azure-cognitiveservices-speech


# =============================================================================
# PROJECT #1: SPEECH-TO-TEXT TRANSCRIBER
# Complete Follow-Along Tutorial with Checkpoints
# =============================================================================

PREREQUISITES:
1. Install: pip install azure-cognitiveservices-speech
2. Get Azure Speech Service credentials:
   - Go to: https://portal.azure.com
   - Create a "Speech Services" resource
   - Get your KEY and REGION from the resource
3. Have a working microphone set as your default input device

In [None]:

import os

# Credentials are taken from the SPEECH_KEY / SPEECH_REGION environment
# variables when they are set (and non-empty), so the real key never has to be
# typed into - or saved with - the notebook. The placeholders remain as the
# fallback; replace them by hand only if you cannot use environment variables.
SPEECH_KEY = os.environ.get("SPEECH_KEY") or "YOUR_SPEECH_KEY_HERE"
SPEECH_REGION = os.environ.get("SPEECH_REGION") or "YOUR_REGION_HERE"  # e.g., "eastus", "westus", etc.

In [None]:
# =============================================================================
# CHECKPOINT 1: Basic Setup - Authenticate and Verify Connection
# =============================================================================

import azure.cognitiveservices.speech as speechsdk

def checkpoint_1_test_connection():
    """
    Check that the SDK is importable and that credentials have been filled in.

    Builds a ``SpeechConfig`` from the notebook-level ``SPEECH_KEY`` and
    ``SPEECH_REGION`` and prints a short summary.

    SUCCESS: Program runs without errors and prints configuration info.
    FAILURE: "NOT SET" is reported - fill in your KEY and REGION and re-run.

    NOTE: No recognition request is made here, so this checkpoint cannot tell
    you whether the key/region pair is actually accepted; that shows up at
    Checkpoint 2 (a rejected key is reported there as a canceled result).

    Returns:
        None. All feedback is printed.
    """
    print("=" * 60)
    print("CHECKPOINT 1: Testing Connection")
    print("=" * 60)

    # Treat empty values and the untouched tutorial placeholders as "not set".
    key_is_set = bool(SPEECH_KEY) and SPEECH_KEY != "YOUR_SPEECH_KEY_HERE"
    region_is_set = bool(SPEECH_REGION) and SPEECH_REGION != "YOUR_REGION_HERE"

    if not (key_is_set and region_is_set):
        # Do not claim success while the credentials are still placeholders.
        print("✗ Credentials are not configured")
        print(f"  Region: {SPEECH_REGION if region_is_set else 'NOT SET'}")
        print(f"  Key: {'set' if key_is_set else 'NOT SET'}")
        print("\nSet SPEECH_KEY and SPEECH_REGION, then run this checkpoint again.")
        print("\n")
        return

    # Constructing the config verifies the SDK import and the argument shapes.
    speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION
    )

    print("✓ Speech Config Created")
    print(f"  Region: {SPEECH_REGION}")
    # Only the length is shown so no part of the secret lands in saved output.
    print(f"  Key: set ({len(SPEECH_KEY)} characters)")
    print("\nIf you see this message, your setup is correct!")
    print("\n")

In [None]:

# =============================================================================
# CHECKPOINT 2: Single-Phrase Recognition - Speak Once, See Text
# =============================================================================

def checkpoint_2_single_phrase():
    """
    Capture one utterance from the default microphone and print what was heard.

    TEST:
    1. Call this function
    2. Wait for the "Speak now..." prompt
    3. Say a short sentence (e.g., "Hello, this is a test")
    4. Pause briefly once you have finished speaking

    SUCCESS: The sentence you spoke is printed back as text.
    FAILURE:
    - Confirm the microphone has permission to record
    - Speak louder or more clearly
    - Make sure the microphone is the default input device

    Returns:
        None. The outcome (text, no-match, or cancellation) is printed.
    """
    banner = "=" * 60
    print(banner)
    print("CHECKPOINT 2: Single-Phrase Recognition")
    print(banner)

    # Recognizer bound to the notebook credentials and the default microphone.
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speechsdk.SpeechConfig(
            subscription=SPEECH_KEY,
            region=SPEECH_REGION,
        ),
        audio_config=speechsdk.audio.AudioConfig(use_default_microphone=True),
    )

    print("Speak now... (will recognize one phrase)")

    # Blocks until a single phrase has been processed.
    outcome = recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"\n✓ RECOGNIZED: {outcome.text}")
    elif reason == speechsdk.ResultReason.NoMatch:
        print(f"\n✗ No speech could be recognized: {outcome.no_match_details}")
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"\n✗ Speech Recognition canceled: {details.reason}")
        # Error details are only printed for error-type cancellations.
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"  Error details: {details.error_details}")

    print("\n")


In [None]:
# =============================================================================
# CHECKPOINT 3: Continuous Recognition - Speak Naturally, See Real-Time Text
# =============================================================================

def checkpoint_3_continuous_recognition():
    """
    Continuously recognize speech until you press Enter.

    TEST:
    1. Run this function
    2. Start speaking naturally (multiple sentences)
    3. See text appear in real-time
    4. Press Enter when done

    SUCCESS: Each phrase appears as you speak.
    FAILURE: If recognition stops too early, try speaking more continuously.
             If a "CANCELED" line with error details appears, check your
             KEY and REGION.

    Returns:
        None. Partial and final results are printed from the SDK callbacks.
    """
    print("=" * 60)
    print("CHECKPOINT 3: Continuous Recognition")
    print("=" * 60)

    speech_config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION
    )
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    # Event handlers
    def recognized_handler(evt):
        """Print each finalized phrase."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print(f"RECOGNIZED: {evt.result.text}")

    def recognizing_handler(evt):
        """Print the partial hypothesis, overwriting the same console line."""
        print(f"RECOGNIZING: {evt.result.text}", end='\r')

    def canceled_handler(evt):
        """Report a cancellation, including error details when it is an error."""
        details = evt.result.cancellation_details
        print(f"\nCANCELED: {details.reason}")
        # Same diagnostics as the single-phrase checkpoint, so a bad key or
        # region is explained instead of just being reported as "Error".
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"  Error details: {details.error_details}")

    # Connect event handlers
    speech_recognizer.recognized.connect(recognized_handler)
    speech_recognizer.recognizing.connect(recognizing_handler)
    speech_recognizer.canceled.connect(canceled_handler)

    # Start continuous recognition
    print("Starting continuous recognition...")
    print("Speak naturally. Press Enter to stop.\n")

    speech_recognizer.start_continuous_recognition()

    try:
        # Wait for user to press Enter
        input()
    finally:
        # Always stop, even if input() is interrupted (e.g. the kernel is
        # interrupted), so the recognition session is not left running.
        speech_recognizer.stop_continuous_recognition()

    print("\nRecognition stopped.")
    print("\n")

In [None]:
# =============================================================================
# CHECKPOINT 4: Enhanced Features - Save to File, Handle Audio Sources
# =============================================================================

import datetime
import os

def checkpoint_4_save_to_file():
    """
    Continuous recognition with transcript saved to file.

    TEST:
    1. Run this function
    2. Speak several sentences
    3. Press Enter to stop
    4. Check the generated transcript file

    SUCCESS: A .txt file is created with your transcription.
    FAILURE: If a "CANCELED" line with error details appears, check your
             KEY and REGION; the transcript file will then be empty.

    The file is named ``transcript_<YYYYMMDD_HHMMSS>.txt`` and is written to
    the current working directory, one recognized phrase per line.

    Returns:
        None. Progress and the output filename are printed.
    """
    print("=" * 60)
    print("CHECKPOINT 4: Save Transcription to File")
    print("=" * 60)

    speech_config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION
    )
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    # Create transcript storage
    transcript = []
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"transcript_{timestamp}.txt"

    def recognized_handler(evt):
        """Collect and echo each finalized phrase."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            text = evt.result.text
            transcript.append(text)
            print(f"RECOGNIZED: {text}")

    def recognizing_handler(evt):
        """Print the partial hypothesis, overwriting the same console line."""
        print(f"RECOGNIZING: {evt.result.text}", end='\r')

    def canceled_handler(evt):
        """Report a cancellation so an empty transcript is not a silent failure."""
        details = evt.result.cancellation_details
        print(f"\nCANCELED: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"  Error details: {details.error_details}")

    speech_recognizer.recognized.connect(recognized_handler)
    speech_recognizer.recognizing.connect(recognizing_handler)
    speech_recognizer.canceled.connect(canceled_handler)

    print(f"Transcription will be saved to: {filename}")
    print("Speak naturally. Press Enter to stop.\n")

    speech_recognizer.start_continuous_recognition()
    try:
        input()
    except (KeyboardInterrupt, EOFError):
        # An interrupted kernel or a closed stdin is treated as "stop" so the
        # phrases captured so far still get written to disk below.
        print("\nInput interrupted - stopping recognition.")
    finally:
        speech_recognizer.stop_continuous_recognition()

    # Save to file
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Transcription - {datetime.datetime.now()}\n")
        f.write("=" * 60 + "\n\n")
        for line in transcript:
            f.write(line + "\n")

    print(f"\n✓ Transcription saved to: {filename}")
    print(f"  Total phrases: {len(transcript)}")
    print("\n")


def checkpoint_4_from_audio_file(audio_file_path):
    """
    Transcribe an existing audio file (WAV format recommended).

    TEST:
    1. Prepare a WAV audio file
    2. Run: checkpoint_4_from_audio_file("path/to/your/file.wav")
    3. See transcription

    SUCCESS: Audio file content is transcribed.
    NOTE: Works best with WAV files, 16kHz, 16-bit, mono.

    The function waits until the recognizer raises its ``session_stopped`` or
    ``canceled`` event (the SDK is expected to raise these once the file has
    been consumed or an error occurs), or until you press Ctrl+C.

    Args:
        audio_file_path: Path to the audio file to transcribe.

    Returns:
        None. Recognized phrases are printed and written, one per line, to
        ``<audio file name without extension>_transcript.txt`` next to the
        input file. If the input file does not exist, a message is printed
        and nothing else happens.
    """
    # Local import keeps this function self-contained within its notebook cell.
    import threading

    print("=" * 60)
    print("CHECKPOINT 4: Transcribe from Audio File")
    print("=" * 60)

    # isfile (rather than exists) also rejects directories up front.
    if not os.path.isfile(audio_file_path):
        print(f"✗ File not found: {audio_file_path}")
        return

    speech_config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION
    )
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )

    print(f"Transcribing: {audio_file_path}\n")

    transcript = []
    # Set from the SDK callbacks to tell the waiting loop below to finish.
    finished = threading.Event()

    def recognized_handler(evt):
        """Collect and echo each finalized phrase."""
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            transcript.append(evt.result.text)
            print(f"RECOGNIZED: {evt.result.text}")

    def canceled_handler(evt):
        """End the wait on cancellation; explain it when it is an error."""
        details = evt.result.cancellation_details
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"\nCANCELED: {details.reason}")
            print(f"  Error details: {details.error_details}")
        finished.set()

    def session_stopped_handler(evt):
        """End the wait once the recognition session has stopped."""
        finished.set()

    speech_recognizer.recognized.connect(recognized_handler)
    speech_recognizer.canceled.connect(canceled_handler)
    speech_recognizer.session_stopped.connect(session_stopped_handler)

    # Start and wait for completion
    speech_recognizer.start_continuous_recognition()

    print("Transcribing... (will auto-stop at end of file)")
    print("Press Ctrl+C if you need to stop early.\n")

    try:
        # Poll with a short timeout so Ctrl+C stays responsive while waiting.
        while not finished.wait(timeout=0.5):
            pass
    except KeyboardInterrupt:
        print("\nStopped early by user.")
    finally:
        speech_recognizer.stop_continuous_recognition()

    print(f"\n✓ Transcription complete!")
    print(f"  Total phrases: {len(transcript)}\n")

    # Derive the output name from the path's extension only. A plain
    # str.replace('.wav', ...) would return the input path unchanged for
    # names like "clip.WAV" or "clip.mp3", and the transcript would then
    # overwrite the audio file itself.
    base_path, _extension = os.path.splitext(audio_file_path)
    output_file = base_path + "_transcript.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in transcript:
            f.write(line + "\n")
    print(f"✓ Saved to: {output_file}\n")


In [None]:
# =============================================================================
# BONUS: Additional Configuration Options
# =============================================================================

def bonus_advanced_configuration():
    """
    Build a SpeechConfig with several non-default options and return it.

    These settings can improve recognition quality for specific use cases.
    The options applied are: recognition language, profanity masking,
    detailed output format, and dictation mode.

    Returns:
        The configured ``speechsdk.SpeechConfig``, ready to be passed as
        ``speech_config=`` when creating a recognizer. (Previously nothing was
        returned, so the configured object was lost when the function ended.)
    """
    print("=" * 60)
    print("BONUS: Advanced Configuration")
    print("=" * 60)

    speech_config = speechsdk.SpeechConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION
    )

    # Set language (default is en-US)
    speech_config.speech_recognition_language = "en-US"
    # Other options: "en-GB", "es-ES", "fr-FR", "de-DE", etc.

    # Enable profanity filter
    speech_config.set_profanity(speechsdk.ProfanityOption.Masked)
    # Options: Masked, Removed, Raw

    # Request detailed results
    speech_config.output_format = speechsdk.OutputFormat.Detailed

    # Switch the recognizer to dictation mode.
    # NOTE(review): per the SDK docs this is the mode in which spoken
    # punctuation ("period", "comma") is interpreted - confirm it is wanted.
    speech_config.enable_dictation()

    print("Configuration options set:")
    print(f"  Language: {speech_config.speech_recognition_language}")
    print(f"  Output Format: {speech_config.output_format}")
    print("\nYou can now use this speech_config with any recognizer.")
    print("\n")

    # Hand the object back so the message above is actually true for callers.
    return speech_config
