In [1]:
import cv2
import mediapipe as mp
import sounddevice as sd
import whisper
import numpy as np
import scipy.io.wavfile as wav
import time
import re

In [2]:
# --- Recording settings (read as globals by the functions below) ---
audio_filename = "speech.wav"  # file the captured microphone audio is written to
audio_fs = 16000               # sample rate passed to the audio recorder
duration = 30                  # length of one recording session, in seconds

# --- Shared models, created once for the whole notebook ---
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()               # pose estimator applied to each webcam frame
model = whisper.load_model("base")  # speech-to-text model used for transcription

In [3]:
def record_video_audio(duration=30):
    """Record webcam frames with a pose overlay while capturing microphone audio.

    A preview window shows each frame (with pose landmarks drawn on it when a
    pose is detected). Recording ends when ``duration`` seconds have passed or
    when the user presses ``q`` in the preview window. The captured audio is
    written to the notebook-level ``audio_filename`` at ``audio_fs`` samples
    per second.

    Args:
        duration: Maximum recording length in seconds.

    Returns:
        list: The captured frames, in capture order, including any drawn
        landmarks. The frame on which ``q`` was pressed is not included.

    Notes:
        * If the user quits early, the audio recording is stopped right away
          and the saved clip is trimmed to the elapsed time, so the WAV length
          matches what was actually spoken.
        * If the camera stops delivering frames, video capture ends but audio
          keeps recording for the full ``duration``.
        * The camera and preview window are released even if an error or a
          kernel interrupt occurs mid-recording.
    """
    cap = cv2.VideoCapture(0)
    print("Recording started... Speak now.")

    frames = []
    # Non-blocking: the recorder fills `audio` in the background while we loop.
    audio = sd.rec(int(duration * audio_fs), samplerate=audio_fs, channels=1)
    # Start the clock after the recorder is running so that the elapsed time
    # never overstates how much audio has been captured.
    start_time = time.time()
    quit_elapsed = None  # seconds recorded when the user pressed 'q', if ever

    try:
        while time.time() - start_time < duration:
            ret, frame = cap.read()
            if not ret:
                print("Camera frame unavailable; stopping video capture "
                      "(audio recording continues).")
                break

            # The pose estimator is fed RGB; OpenCV delivers BGR frames.
            image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = pose.process(image_rgb)

            if results.pose_landmarks:
                mp.solutions.drawing_utils.draw_landmarks(
                    frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

            cv2.imshow("Recording - Press 'q' to quit early", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                quit_elapsed = time.time() - start_time
                break

            frames.append(frame)
    except BaseException:
        # Do not leave the background recorder running after a failure or a
        # kernel interrupt.
        sd.stop()
        raise
    finally:
        cap.release()
        cv2.destroyAllWindows()

    if quit_elapsed is None:
        # Normal end (or camera failure): let the recorder fill the buffer.
        sd.wait()
    else:
        # Early quit: stop now and drop the part of the buffer that was never
        # filled with captured audio.
        sd.stop()
        audio = audio[:int(quit_elapsed * audio_fs)]

    wav.write(audio_filename, audio_fs, audio)
    print("Recording complete.")
    return frames



In [4]:
def analyze_audio(audio_filename, audio_duration=None):
    """Transcribe a recording and compute word-count, pace and filler statistics.

    Args:
        audio_filename: Path of the audio file passed to the speech model.
        audio_duration: Length of the recording in seconds, used to compute
            words per minute. When omitted, the length is read from the WAV
            file itself; if the file cannot be parsed as WAV, the
            notebook-level ``duration`` setting is used instead.

    Returns:
        dict with the keys:
            ``text``          - the transcript returned by the model,
            ``word_count``    - number of whitespace-separated words,
            ``filler_count``  - number of filler-word matches,
            ``wpm``           - words per minute, rounded to one decimal
                                (0.0 when the duration is not positive),
            ``filler_words``  - the matched filler words, lower-cased.
    """
    result = model.transcribe(audio_filename)
    text = result['text']
    word_count = len(text.split())
    filler_words = re.findall(r'\b(uh+|um+|like|you know)\b', text.lower())

    if audio_duration is None:
        try:
            # Measure the clip that was actually saved rather than trusting the
            # configured session length (the two differ for shortened clips).
            sample_rate, samples = wav.read(audio_filename)
            audio_duration = len(samples) / sample_rate
        except ValueError:
            # Not a WAV file scipy can parse: fall back to the configured length.
            audio_duration = duration

    # Guard against an empty clip instead of dividing by zero.
    wpm = word_count / (audio_duration / 60) if audio_duration > 0 else 0.0
    return {
        "text": text,
        "word_count": word_count,
        "filler_count": len(filler_words),
        "wpm": round(wpm, 1),
        "filler_words": filler_words
    }



In [5]:
def generate_feedback(audio_stats):
    """Print a short feedback report for one analysed recording.

    Args:
        audio_stats: Mapping with the keys ``text``, ``word_count``, ``wpm``,
            ``filler_count`` and ``filler_words``.

    The report lists the first 100 characters of the transcript, the word
    count, the speech rate and the filler words, followed by pacing advice
    (above 140 WPM or below 90 WPM) and a filler-word hint (more than 3).
    Nothing is returned.
    """
    print("\n--- FEEDBACK REPORT ---")

    preview = audio_stats['text'][:100]
    print(f"Transcript: {preview}...")

    total_words = audio_stats['word_count']
    print(f"Total words: {total_words}")

    rate = audio_stats['wpm']
    print(f"Speech Rate: {rate} WPM")

    filler_total = audio_stats['filler_count']
    filler_list = ', '.join(audio_stats['filler_words'])
    print(f"Filler words detected: {filler_total} ({filler_list})")

    # Pacing advice: at most one of the two messages is shown.
    too_fast = rate > 140
    too_slow = (not too_fast) and rate < 90
    if too_fast:
        print("⚠️ You are speaking a bit fast. Try slowing down.")
    if too_slow:
        print("⚠️ You are speaking quite slowly. Aim for 110–130 WPM.")

    if filler_total > 3:
        print("⚠️ Consider reducing filler words for a smoother delivery.")


In [6]:
if __name__ == "__main__":
    # Run one full session: record, transcribe, then print the report.
    # `frames` and `audio_stats` stay available as notebook-level variables.
    frames = record_video_audio(duration=duration)
    # Analyse the audio file that the recording step wrote.
    audio_stats = analyze_audio(audio_filename)
    generate_feedback(audio_stats)


Recording started... Speak now.
Recording complete.

--- FEEDBACK REPORT ---
Transcript:  táis...
Total words: 1
Speech Rate: 2.0 WPM
Filler words detected: 0 ()
⚠️ You are speaking quite slowly. Aim for 110–130 WPM.
