In [37]:
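# One-time environment setup: uncomment the lines you need (inside Jupyter, %pip targets the kernel's environment).
#!pip install flask whisper
# NOTE: the code below uses OpenAI Whisper, which the next line installs; confirm the PyPI package named "whisper" above is really wanted.
#! pip install git+https://github.com/openai/whisper.git -q
#!pip install ffmpeg
# NOTE: Whisper decodes audio by invoking the ffmpeg executable, so ffmpeg must also be installed and on the system PATH.
#!pip install openai
#!pip install transformers
#!pip install librosa pyaudio
#!pip install --force-reinstall scipy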

In [38]:
import logging
import os
import shutil
import tempfile
import time

import librosa
import numpy as np
import pyaudio
import whisper
from flask import Flask, request, jsonify
# Create the Flask application object that the route decorators below attach to.
app = Flask(__name__)
# NOTE(review): a comment here used to mention allowing cross-origin requests, but no CORS setup is performed in this file.

# Configure root logging at INFO level; each record shows timestamp, level name and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the Whisper "base" model once at import time so every request reuses the same instance.
model = whisper.load_model("base")

@app.route('/')
def index():
    """Serve the single-page "Skype Call" demo UI.

    The returned HTML embeds its own CSS and JavaScript. In the browser the
    script:

    * asks for microphone access and records it with ``MediaRecorder``;
    * measures the RMS level of each audio buffer and shows it in ``#vol``;
    * once sound has been heard and is followed by more than
      ``silenceThreshold`` milliseconds of quiet, stops the recorder, POSTs
      the recorded chunks to ``/transcribe`` as form field ``audio`` and
      writes the returned ``transcription`` into ``#transcription``;
    * restarts recording after every stop and ticks the call-duration
      display once per second.

    Returns:
        str: the complete HTML document.
    """
    return """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Real-Time Speech to Text</title>
    <style>
    body {
        font-family: 'Segoe UI', Arial, sans-serif; /* Consistent with Microsoft styling */
        background-color: #f4f6f8; /* Soothing light background */
        margin: 0;
        padding: 0;
        color: #444; /* Subtle text color for readability */
    }

    /* Container for all elements */
    .container {
        max-width: 640px;
        margin: 50px auto;
        background-color: #fff;
        padding: 20px;
        border-radius: 10px;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2); /* More pronounced shadow for depth */
        overflow: hidden; /* Ensures no child element leaks out */
    }

    /* Header Styles */
    .header {
        background-image: linear-gradient(135deg, #00aff0 0%, #0061a8 100%);
        color: #ffffff;
        padding: 20px;
        text-align: center;
        font-size: 26px;
        font-weight: bold;
        border-radius: 10px 10px 0 0;
    }

    /* Call Info Section */
    .call-info {
        padding: 20px;
        display: grid;
        grid-template-columns: repeat(2, 1fr);
        gap: 10px;
        font-size: 16px;
        border-bottom: 1px solid #d0e1f9; /* Light blue border */
    }

    /* Video Call Section */
    .video-call {
        display: flex;
        justify-content: space-around;
        margin-top: 20px;
        padding: 10px;
    }

    .video-call div {
        flex: 1;
        height: 200px;
        margin: 10px;
        background: rgba(0,0,0,0.1);
        border-radius: 10px;
        background-size: cover;
        background-position: center;
        transition: transform 0.3s ease-in-out;
    }

    .video-call div:hover {
        transform: scale(1.05); /* Slight zoom on hover */
    }

    /* Transcription Box */
    .transcription {
        background-color: #e8f0fe;
        border: 1px solid #c6d8f0;
        margin-top: 20px;
        padding: 20px;
        border-radius: 8px;
        font-size: 14px;
        box-shadow: inset 0 2px 4px rgba(0,0,0,0.1);
    }

    /* Button Styles */
    .button-container {
        margin-top: 20px;
        text-align: center;
    }

    button {
        background-color: #00aff0;
        color: white;
        border: none;
        border-radius: 5px;
        padding: 10px 20px;
        font-size: 16px;
        cursor: pointer;
        transition: background-color 0.3s, box-shadow 0.3s;
    }

    button:hover {
        background-color: #0077cc;
        box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
    }
    /* Add Skype-like video call elements */
    .video-call {
        display: flex;
        flex-wrap: wrap;
        justify-content: center; /* Centering the video elements */
        margin-top: 20px;
    }
    .video-call div {
        width: 50%; /* Each video takes half the width of the container */
        height: 200px; /* Fixed height for video elements */
        background-color: #333; /* Dark background for video elements */
        border-radius: 10px; /* Rounded corners for video elements */
        margin: 10px; /* Margin around video elements for spacing */
        background-size: cover;
        background-position: center; /* Center the background images */
    }
    .video-call div:first-child {
        background-image: url('/static/Images/man1_livecall.jfif'); /* Assuming the path is correct */
    }
    .video-call div:last-child {
        background-image: url('/static/Images/man2_livecall.jfif'); /* Assuming the path is correct */
    }
</style>

</head>
<body>
    <div class="container">
        <div class="header">
            <h1>Skype Call</h1>
        </div>
        <div class="call-info">
            <div><strong>Call ID:</strong> 1234567890</div>
            <div><strong>Caller:</strong> John Doe</div>
            <div><strong>Duration:</strong> <span id="callDuration">00:00:00</span></div>
            <div><strong>Call Volume (RMS):</strong> <span id="vol">0.00</span></div>
        </div>
        <div class="video-call">
            <div></div>
            <div></div>
        </div>
        <div class="transcription">
            <p id="transcription">Transcription appears here...</p>
        </div>
        <div class="button-container">
            <button id="endCall">End Call</button>
        </div>
    </div>


    <script>
    let audioContext, microphone, mediaRecorder, audioChunks = [];
    let lastSoundTimestamp = Date.now();
    let chunking = 4000;
    const silenceThreshold = 500; // Time in milliseconds to define silence duration
    let hasSoundBeenDetected = false;  // Indicates if sound has been detected in the current chunk

    navigator.mediaDevices.getUserMedia({ audio: true })
        .then(stream => {
            audioContext = new AudioContext();
            microphone = audioContext.createMediaStreamSource(stream);
            mediaRecorder = new MediaRecorder(stream);

            mediaRecorder.ondataavailable = event => {
                if (event.data.size > 0) {
                    // Parenthesised so the count is added numerically instead of being concatenated as text
                    console.log("Audio incoming. Audio incoming. Chunk pushed. No of Chunks = " + (audioChunks.length + 1));
                    audioChunks.push(event.data);
                }
            };

            mediaRecorder.start(chunking);

            const processor = audioContext.createScriptProcessor(2048, 1, 1);
            microphone.connect(processor);
            processor.connect(audioContext.destination);

            processor.onaudioprocess = function(event) {
                var input = event.inputBuffer.getChannelData(0);
                var sum = 0;
                for (var i = 0; i < input.length; ++i) {
                    sum += input[i] * input[i];
                }
                var rms = Math.sqrt(sum / input.length);

                document.getElementById('vol').textContent = rms;
                if (rms >= 0.01) {
                    lastSoundTimestamp = Date.now();
                    hasSoundBeenDetected = true;
                } else if ((Date.now() - lastSoundTimestamp) > silenceThreshold && hasSoundBeenDetected && mediaRecorder.state === 'recording') {
                    // The state check stops this branch from calling stop() again on an
                    // already-stopped recorder while the onstop handler is still pending.
                    console.log("Detected silence after sound, stop the recorder and prepare to send data");
                    mediaRecorder.stop();
                }
            };

            mediaRecorder.onstop = () => {
                if (hasSoundBeenDetected) {
                    sendAudioToServer();
                    hasSoundBeenDetected = false;  // Reset for the next chunk
                }
                audioChunks = []; // Clear the buffer after sending
                mediaRecorder.start(chunking); // Restart recording after processing
            };
        })
        .catch(error => console.error('Error accessing media devices.', error));

    function sendAudioToServer() {
        const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
        const formData = new FormData();
        formData.append('audio', audioBlob, 'file.wav');

        fetch('/transcribe', {
            method: 'POST',
            body: formData
        })
        .then(response => response.json())
        .then(data => {
            document.getElementById('transcription').textContent = data.transcription;
            console.log("Transcription:", data.transcription);
        })
        .catch(console.error);
    }

    document.getElementById('endCall').addEventListener('click', function() {
        // Simulate ending the call
        alert('Call ended');
        window.location.reload(); // Reload the page to reset the call
    });

    // Example of updating call duration and volume dynamically
    setInterval(function() {
        let durationElement = document.getElementById('callDuration');
        let currentTime = durationElement.textContent;
        let newTime = new Date(new Date('1970/01/01 ' + currentTime) .getTime() + 1000);
        durationElement.textContent = newTime.toTimeString().substr(0, 8);
    }, 1000);



</script>


</body>
</html>


            """





def format_list(lst: list, per_line: int = 8) -> str:
    """Join text segments into one string, breaking the line at a fixed width.

    Every segment is followed by a single space, and a newline is appended
    after each ``per_line``-th segment. All rows therefore have the same
    number of segments; previously the first row held eight segments and
    every later row only seven because the counter was reset and then
    immediately incremented.

    Args:
        lst: Text segments (strings) in display order.
        per_line: Number of segments per row; must be at least 1.

    Returns:
        The formatted string, or ``''`` when ``lst`` is empty.

    Raises:
        ValueError: If ``per_line`` is smaller than 1.
    """
    if per_line < 1:
        raise ValueError("per_line must be at least 1")

    pieces = []
    for position, segment in enumerate(lst, start=1):
        pieces.append(segment + " ")
        if position % per_line == 0:
            pieces.append("\n")
    # A single join avoids repeatedly re-copying a growing string.
    return "".join(pieces)

# Transcribed segments, accumulated across requests for the lifetime of the process.
conversation_history = []

# Preferred parent folder for per-request scratch directories. When it does not
# exist on this machine the system default temporary location is used instead.
_UPLOAD_PARENT_DIR = "D:\\DEV\\WebdevFolder\\RealEstateAI"


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe an uploaded audio clip and return the running transcript.

    Expects a multipart POST with the recording in form field ``audio``. The
    upload is written into a fresh scratch directory, passed to the shared
    Whisper ``model``, and the recognised text is appended to
    ``conversation_history``.

    Returns:
        A JSON response whose ``transcription`` key holds the whole
        conversation so far, formatted by ``format_list``. On failure the
        response also carries an ``error`` key, with status 400 when no file
        was uploaded and 500 when saving or transcription failed.
        ``transcription`` is still included in error responses so a client
        that only reads that key keeps showing the existing text.
    """
    audio_file = request.files.get('audio')
    if audio_file is None:
        return jsonify({
            "error": "No audio file was uploaded.",
            "transcription": format_list(conversation_history),
        }), 400

    parent_dir = _UPLOAD_PARENT_DIR if os.path.isdir(_UPLOAD_PARENT_DIR) else None
    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp(dir=parent_dir)

        # The client-supplied filename is untrusted, so it is never used as a
        # path component; only a purely alphanumeric extension is kept.
        extension = os.path.splitext(audio_file.filename or "")[1]
        if not extension[1:].isalnum():
            extension = ""
        temp_audio_path = os.path.join(temp_dir, "upload" + extension)
        audio_file.save(temp_audio_path)

        result = model.transcribe(temp_audio_path)

        conversation_history.append(result['text'])
        return jsonify({"transcription": format_list(conversation_history)})
    except Exception:
        logging.exception("An error occurred during transcription")
        # Always hand Flask a valid response; returning None from a view is an error.
        return jsonify({
            "error": "Transcription failed.",
            "transcription": format_list(conversation_history),
        }), 500
    finally:
        # Remove the scratch directory and anything in it. This is safe even
        # when the failure happened before the upload was written to disk.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
    

# Start Flask's built-in server on port 5000 with debug mode off, but only when
# this code runs as the main module (which includes running the cell in a notebook).
if __name__ == '__main__':
    app.run(debug=False, port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
2024-05-03 22:37:41,864 - INFO - [33mPress CTRL+C to quit[0m
2024-05-03 22:37:48,985 - INFO - 127.0.0.1 - - [03/May/2024 22:37:48] "GET / HTTP/1.1" 200 -
2024-05-03 22:37:49,016 - INFO - 127.0.0.1 - - [03/May/2024 22:37:49] "[36mGET /static/Images/man1_livecall.jfif HTTP/1.1[0m" 304 -
2024-05-03 22:37:49,021 - INFO - 127.0.0.1 - - [03/May/2024 22:37:49] "[36mGET /static/Images/man2_livecall.jfif HTTP/1.1[0m" 304 -
2024-05-03 22:38:02,634 - INFO - 127.0.0.1 - - [03/May/2024 22:38:02] "POST /transcribe HTTP/1.1" 200 -
2024-05-03 22:38:22,279 - INFO - 127.0.0.1 - - [03/May/2024 22:38:22] "POST /transcribe HTTP/1.1" 200 -
2024-05-03 22:38:24,776 - INFO - 127.0.0.1 - - [03/May/2024 22:38:24] "POST /transcribe HTTP/1.1" 200 -
2024-05-03 22:38:32,246 - INFO - 127.0.0.1 - - [03/May/2024 22:38:32] "POST /transcribe HTTP/1.1" 200 -
2024-05-03 22:38:41,884 - INFO - 127.0.0.1 - - [03/May/2024 22:38:41] "POST /transcribe HTTP/1.1" 200 -
2024-05-03 22:38:47,