In [15]:
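#!pip install flask openai-whisper  # NOTE: the PyPI package named "whisper" is not OpenAI Whisper
#! pip install git+https://github.com/openai/whisper.git -q
#!pip install ffmpeg  # NOTE: this does not provide the ffmpeg executable Whisper needs; install ffmpeg with the OS package manager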
#!pip install openai
#!pip install transformers
#!pip install librosa pyaudio
#!pip install --force-reinstall scipy

In [None]:
import logging
import time
from flask import Flask, request, jsonify
import tempfile
import whisper
import numpy as np
import os
import librosa 
import pyaudio
# Flask application object; it serves both the HTML page and the
# transcription endpoint, so the page talks to its own origin.
app = Flask(__name__)

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The Whisper "base" model is loaded a single time at start-up and reused by
# every request instead of being reloaded per transcription.
model = whisper.load_model("base")

@app.route('/')
def index():
    """Serve the single-page call UI.

    The returned HTML contains the styling, the mock call layout and the
    browser-side script. That script records microphone audio with
    ``MediaRecorder``, watches the input level through a script processor
    node, and after ``silenceThreshold`` milliseconds below the RMS cut-off
    stops the recorder and POSTs the buffered chunks to ``/transcribe``.
    The JSON reply's ``transcription`` field is written into the
    ``#transcription`` paragraph.

    Returns:
        str: the complete HTML document.
    """
    return """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Real-Time Speech to Text</title>
    <style>
        /* Add Skype-like styling */
        body {
            font-family: Arial, sans-serif;
            background-color: #fff;
        }
        .container {
            max-width: 400px;
            margin: 40px auto;
            background-color: #f7f7f7;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
        }
        .header {
            background-color: #333;
            color: #fff;
            padding: 10px;
            border-bottom: 1px solid #333;
            border-radius: 10px 10px 0 0;
        }
        .header span {
            font-weight: bold;
            font-size: 18px;
        }
        .call-info {
            margin-top: 20px;
            display: flex;
            flex-wrap: wrap;
            justify-content: space-between;
        }
        .call-info span {
            font-weight: bold;
            margin-right: 10px;
            font-size: 16px;
        }
        .transcription {
            margin-top: 20px;
            padding: 20px;
            border: 1px solid #ddd;
            border-radius: 10px;
            background-color: #f9f9f9;
        }
        .button-container {
            margin-top: 20px;
            text-align: center;
        }
        .button-container button {
            padding: 10px 20px;
            border: none;
            border-radius: 10px;
            background-color: #333;
            color: #fff;
            cursor: pointer;
        }
        .button-container button:hover {
            background-color: #444;
        }
        /* Add Skype-like video call elements */
        .video-call {
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            margin-top: 20px;
        }
        .video-call div {
            width: 50%;
            height: 200px;
            background-color: #333;
            border-radius: 10px;
            margin: 10px;
        }
        .video-call div:first-child {
            background-image: url('/static/Images/man1_livecall.jfif');
            background-size: cover;
            background-position: center;
        }
        .video-call div:last-child {
            background-image: url('/static/Images/man2_livecall.jfif');
            background-size: cover;
            background-position: center;
        }

    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <span>Skype Call</span>
        </div>
        <div class="call-info">
            <span>Call ID:</span> <span>1234567890</span>
            <span>Caller:</span> <span>John Doe</span>
            <span>Duration:</span> <span>00:00:00</span>
        </div>
        <div class="video-call">
            <div></div>
            <div></div>
        </div>
        <div class="transcription">
            <p id="transcription"></p>
        </div>
        <div class="button-container">
            <button id="endCall">End Call</button>
        </div>
    </div>

    <script>
    let audioContext, microphone, mediaRecorder, audioChunks = [];
let lastSoundTimestamp = Date.now();
let chunking=4000;
const silenceThreshold = 2000; // Time in milliseconds to define silence duration

navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => {
        audioContext = new AudioContext();
        microphone = audioContext.createMediaStreamSource(stream);
        mediaRecorder = new MediaRecorder(stream);

        // Event fired when audio data is available
        mediaRecorder.ondataavailable = event => {
            if (event.data.size > 0) {
                console.log("Audio incoming. Audio incoming. Chunk pushed");
                audioChunks.push(event.data);
            }
        };

        mediaRecorder.start(chunking);

        // Setup a ScriptProcessorNode to detect silence
        const processor = audioContext.createScriptProcessor(2048, 1, 1);
        microphone.connect(processor);
        processor.connect(audioContext.destination);

        processor.onaudioprocess = function(event) {
            var input = event.inputBuffer.getChannelData(0);
            var sum = 0;
            for (var i = 0; i < input.length; ++i) {
                sum += input[i] * input[i];
            }
            var rms = Math.sqrt(sum / input.length);
            console.log("RMS = " + rms);
            if (rms >= 0.008) {
                lastSoundTimestamp = Date.now();
            } else if ((Date.now() - lastSoundTimestamp) > silenceThreshold
                       && audioChunks.length
                       && mediaRecorder.state === 'recording') {
                // Detected silence, stop the recorder and send data.
                // The state check prevents calling stop() a second time while
                // the previous stop is still being processed, which would
                // throw InvalidStateError on an inactive recorder.
                console.log("Detected silence, stop the recorder and send data & audioChunks list length " + audioChunks.length);
                mediaRecorder.stop();
            }
        };

        // Restart recorder after sending data
        mediaRecorder.onstop = () => {
            sendAudioToServer();
            audioChunks = []; // Clear the buffer after sending
            mediaRecorder.start(chunking); // Restart recording after processing
        };
    })
    .catch(error => console.error('Error accessing media devices.', error));

function sendAudioToServer() {
    const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
    const formData = new FormData();
    formData.append('audio', audioBlob, 'file.wav');

    fetch('/transcribe', {
        method: 'POST',
        body: formData
    })
    .then(response => response.json())
    .then(data => {
        document.getElementById('transcription').textContent = data.transcription;
        console.log("Transcription:", data.transcription);
    })
    .catch(console.error);
}


    document.getElementById('endCall').addEventListener('click', () => {
        // reload() lives on window.location; window.reload is undefined.
        window.location.reload();
    });
</script>


</body>
</html>


            """





def format_list(lst: list):
    """Join the strings in *lst* into one space-separated, line-wrapped string.

    Every item is followed by a single space. A newline is inserted after
    the eighth item and then after every seventh item that follows, which
    mirrors the counter logic callers already rely on.

    Args:
        lst: strings to join, in display order.

    Returns:
        The wrapped text; an empty string when *lst* is empty.
    """
    pieces = []
    since_break = 0
    for item in lst:
        pieces.append(item + " ")
        # The counter is compared before it is incremented, so the first
        # line ends up one item longer than the following ones.
        if since_break > 6:
            pieces.append("\n")
            since_break = 0
        since_break += 1
    return "".join(pieces)

# Transcripts of every audio chunk received so far, oldest first.
# NOTE: this is process-wide state; it is shared by all clients of this
# server and is never trimmed.
conversation_history = []

# Preferred parent folder for per-request scratch directories. When it does
# not exist on this machine the system default temp location is used instead.
_TEMP_ROOT = "D:\\DEV\\WebdevFolder\\RealEstateAI"


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe an uploaded audio chunk and return the running transcript.

    Expects a multipart form upload with the recording in the ``audio``
    field. The file is written to a per-request scratch directory, passed
    to the Whisper model, and the recognised text is appended to
    ``conversation_history``.

    Returns:
        * 200 with ``{"transcription": <formatted history>}`` on success.
        * 400 with ``{"error": ...}`` when the ``audio`` part is missing.
        * 500 with ``{"error": ..., "transcription": <formatted history>}``
          when saving or transcription fails; the transcript so far is
          included so the page can keep showing it.
    """
    import shutil  # local import: only needed to clean up the scratch directory

    audio_file = request.files.get('audio')
    if audio_file is None:
        return jsonify({"error": "No 'audio' file part in the request"}), 400

    # The client-supplied filename is untrusted, so it is never joined into
    # the path. Only a plain alphanumeric extension is kept.
    extension = os.path.splitext(audio_file.filename or "")[1]
    if not extension[1:].isalnum():
        extension = ""

    parent_dir = _TEMP_ROOT if os.path.isdir(_TEMP_ROOT) else None
    temp_dir = tempfile.mkdtemp(dir=parent_dir)
    try:
        temp_audio_path = os.path.join(temp_dir, "upload" + extension)
        audio_file.save(temp_audio_path)

        result = model.transcribe(temp_audio_path)

        conversation_history.append(result['text'])
        return jsonify({"transcription": format_list(conversation_history)})
    except Exception:
        # Request boundary: log the traceback and still give the browser a
        # valid JSON answer instead of returning None.
        logging.exception("An error occurred during transcription")
        return jsonify({
            "error": "Transcription failed",
            "transcription": format_list(conversation_history),
        }), 500
    finally:
        # Remove the scratch directory and whatever was written into it.
        # This must not raise, otherwise it would replace the real response
        # or mask the original error.
        shutil.rmtree(temp_dir, ignore_errors=True)
    

# Start Flask's built-in server on port 5000, with debug mode off, when this
# code is executed directly rather than imported.
if __name__ == "__main__":
    app.run(port=5000, debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
2024-05-03 21:59:51,258 - INFO - [33mPress CTRL+C to quit[0m


2024-05-03 22:00:05,720 - ERROR - An error occurred during transcription
Traceback (most recent call last):
  File "C:\Users\PC-User\AppData\Local\Temp\ipykernel_23388\3213918725.py", line 250, in transcribe_audio
    result = model.transcribe(temp_audio_path)
  File "d:\DEV\WebdevFolder\RealEstateAI\.venv\lib\site-packages\whisper\transcribe.py", line 279, in transcribe
    result: DecodingResult = decode_with_fallback(mel_segment)
  File "d:\DEV\WebdevFolder\RealEstateAI\.venv\lib\site-packages\whisper\transcribe.py", line 195, in decode_with_fallback
    decode_result = model.decode(segment, options)
  File "d:\DEV\WebdevFolder\RealEstateAI\.venv\lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "d:\DEV\WebdevFolder\RealEstateAI\.venv\lib\site-packages\whisper\decoding.py", line 824, in decode
    result = DecodingTask(model, options).run(mel)
  File "d:\DEV\WebdevFolder\RealEstateAI\.venv\lib\site-packages\torch\uti