# In [2]:  (Jupyter notebook cell prompt; the code below is the cell's contents)
import os
import speech_recognition as sr
import pyttsx3
import torch
import transformers
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import sounddevice as sd
import soundfile as sf
import numpy as np
from scipy.io import wavfile
import librosa
import tensorflow as tf

class AIInterviewVoiceSystem:
    """Voice front-end for a simulated interview.

    Bundles four capabilities behind one object:

    * text-to-speech through ``pyttsx3`` (``speak_question``),
    * microphone capture through ``sounddevice``/``soundfile``
      (``record_answer``),
    * transcription through the ``speech_recognition`` engines with a
      Wav2Vec2 fallback (``transcribe_answer``),
    * a placeholder emotion classifier over MFCC features
      (``detect_emotion``).
    """

    # Number of MFCC coefficients extracted per recording.  The emotion
    # model's input layer is sized from this value so that the feature
    # vector and the network can never disagree on their width.
    N_MFCC = 13

    # Class labels, in the order of the emotion model's softmax outputs.
    EMOTION_LABELS = ('Neutral', 'Happy', 'Sad', 'Angry', 'Surprised')

    # Sampling rate the audio is resampled to before Wav2Vec2 inference.
    ASR_SAMPLE_RATE = 16000

    def __init__(self):
        """Create the TTS engine, the recognizer and both models."""
        # Text-to-Speech Engine (Free, Local)
        self.tts_engine = pyttsx3.init()

        # Speech Recognition (Free, Local/Cloud)
        self.recognizer = sr.Recognizer()

        # Advanced Speech Recognition Model (Free, Transformer-based)
        self.speech_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        self.speech_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

        # Emotion Detection (untrained placeholder network, see below)
        self.emotion_model = self._load_emotion_model()

    def _load_emotion_model(self):
        """
        Build the emotion classifier.

        The network is a small dense model whose input width is
        ``N_MFCC`` and whose softmax output has one unit per entry in
        ``EMOTION_LABELS``.

        Returns:
            A ``tf.keras.Sequential`` model.  No weights are loaded, so
            its predictions are not meaningful until trained weights are
            supplied.
        """
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.N_MFCC,)),  # Feature vector input
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(len(self.EMOTION_LABELS), activation='softmax'),
        ])
        # Note: You'd typically load pre-trained weights here
        return model

    def speak_question(self, question):
        """
        Convert a text question to speech using pyttsx3.

        Args:
            question: Text to be spoken.

        Returns:
            True when the utterance was played, False when the TTS
            engine raised (the error is printed).
        """
        try:
            # Configure voice properties
            self.tts_engine.setProperty('rate', 150)  # Speaking rate
            self.tts_engine.setProperty('volume', 0.9)  # Volume level

            # Queue the utterance and block until it has been spoken
            self.tts_engine.say(question)
            self.tts_engine.runAndWait()
            return True
        except Exception as e:
            print(f"Error in text-to-speech: {e}")
            return False

    def record_answer(self, duration=10, sample_rate=44100,
                      output_filename="candidate_answer.wav"):
        """
        Record the candidate's answer from the default input device.

        Args:
            duration: Recording length in seconds.
            sample_rate: Capture rate in Hz.
            output_filename: Path of the WAV file to write.  An existing
                file at this path is overwritten.

        Returns:
            The path the recording was written to.
        """
        print("Please speak your answer...")

        # Record a single (mono) channel and block until capture ends
        recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
        sd.wait()

        # Save the recording
        sf.write(output_filename, recording, sample_rate)

        return output_filename

    def transcribe_answer(self, audio_file):
        """
        Transcribe a recorded answer.

        The ``speech_recognition`` engines (Google, then Sphinx) are
        tried first, each independently, so one engine failing does not
        discard the other's result.  Only if neither produced a
        transcript is the Wav2Vec2 model used.

        Args:
            audio_file: Path to the audio file to transcribe.

        Returns:
            A dict holding whichever of ``'google_transcript'`` and
            ``'sphinx_transcript'`` succeeded; otherwise a dict with
            ``'transformer_transcript'``; or None when every method
            failed.
        """
        transcripts = self._transcribe_with_speech_recognition(audio_file)
        if transcripts:
            return transcripts
        return self._transcribe_with_wav2vec2(audio_file)

    def _transcribe_with_speech_recognition(self, audio_file):
        """Return the Google/Sphinx transcripts that could be obtained (possibly empty)."""
        with sr.AudioFile(audio_file) as source:
            audio = self.recognizer.record(source)

        engines = (
            ('google_transcript', self.recognizer.recognize_google),  # requires internet
            ('sphinx_transcript', self.recognizer.recognize_sphinx),  # offline recognition
        )
        transcripts = {}
        for key, recognize in engines:
            # Each engine gets its own try so a failure in one does not
            # throw away a transcript already obtained from the other.
            try:
                transcripts[key] = recognize(audio)
            except sr.UnknownValueError:
                print("Could not understand audio")
            except sr.RequestError:
                print("Could not request results")
        return transcripts

    def _transcribe_with_wav2vec2(self, audio_file):
        """Return ``{'transformer_transcript': text}`` from Wav2Vec2, or None on error."""
        try:
            # Load audio file, resampled to the rate passed to the processor
            input_audio, sample_rate = librosa.load(audio_file, sr=self.ASR_SAMPLE_RATE)

            # Process audio
            input_values = self.speech_processor(
                input_audio, sampling_rate=sample_rate, return_tensors="pt"
            ).input_values

            # Inference only: skip autograd bookkeeping
            with torch.no_grad():
                logits = self.speech_model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = self.speech_processor.batch_decode(predicted_ids)[0]

            return {'transformer_transcript': transcription}
        except Exception as e:
            print(f"Transformer transcription error: {e}")
            return None

    def detect_emotion(self, audio_file):
        """
        Basic emotion detection from audio.

        Note: This is a placeholder; the underlying model has no trained
        weights, so the label is not meaningful yet.

        Args:
            audio_file: Path to the audio file to classify.

        Returns:
            One of ``EMOTION_LABELS``, or None when feature extraction
            or prediction raised (the error is printed).
        """
        try:
            # Extract audio features (placeholder)
            features = self._extract_audio_features(audio_file)

            # Predict emotion
            emotion_prediction = self.emotion_model.predict(features)

            # Map the highest-scoring output unit to its label
            return self.EMOTION_LABELS[int(np.argmax(emotion_prediction))]
        except Exception as e:
            print(f"Emotion detection error: {e}")
            return None

    def _extract_audio_features(self, audio_file):
        """
        Extract a fixed-length feature vector from an audio file.

        Computes ``N_MFCC`` MFCCs with librosa and averages each
        coefficient over time.  This is a simplified placeholder.

        Args:
            audio_file: Path to the audio file.

        Returns:
            A NumPy array of shape ``(1, N_MFCC)``.
        """
        # ``sample_rate`` rather than ``sr`` so the module-level
        # ``speech_recognition`` alias is not shadowed.
        samples, sample_rate = librosa.load(audio_file)

        # Extract some basic features: one row per coefficient
        mfccs = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=self.N_MFCC)

        # Aggregate features: mean of every coefficient across frames
        features = np.mean(mfccs, axis=1)

        return features.reshape(1, -1)  # Reshape for model input

# Example Usage
def interview_simulation():
    """Run a short demo interview.

    For each of three fixed questions: speak it, record a 10-second
    answer, transcribe the recording, classify its emotion, and print
    the results followed by a separator line.
    """
    voice_system = AIInterviewVoiceSystem()

    prompts = [
        "Tell me about your professional experience.",
        "What motivates you to apply for this position?",
        "Describe a challenging project you've worked on.",
    ]

    for prompt in prompts:
        # Ask, then capture the spoken reply to a file
        voice_system.speak_question(prompt)
        recorded_path = voice_system.record_answer(duration=10)

        # Analyse the same recording two ways
        transcript_result = voice_system.transcribe_answer(recorded_path)
        mood = voice_system.detect_emotion(recorded_path)

        # Report
        print("\nQuestion:", prompt)
        print("Transcripts:", transcript_result)
        print("Detected Emotion:", mood)
        print("-" * 50)

# Run the simulation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    interview_simulation()

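# Notebook output: ModuleNotFoundError: No module named 'speech_recognition'
# (the SpeechRecognition package is not installed; install it with `pip install SpeechRecognition`)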