## Installing required libraries

In [None]:
!pip install git+https://github.com/openai/whisper.git

In [None]:
!pip install spacy

In [None]:
!pip install praat-parselmouth

In [None]:
!pip uninstall -y numpy
!pip install numpy librosa TTS transformers mediapipe opencv-python-headless

In [None]:
!pip install ffmpeg-python

In [None]:
!pip install opencv-python protobuf

In [None]:
!pip uninstall mediapipe -y
!pip install mediapipe --no-cache-dir

In [None]:
!pip install moviepy

In [None]:
!pip install edge-tts

In [None]:
!pip install --upgrade mediapipe opencv-python-headless

In [None]:
!pip install requests

## Importing required Libraries

In [None]:
import whisper
import parselmouth
from parselmouth.praat import call
import subprocess
import re
import ffmpeg
import spacy
import requests
import librosa
import numpy as np
from scipy.spatial.distance import cdist
import cv2
import mediapipe as mp
import numpy as np
import time
import os
from moviepy.editor import VideoFileClip
from collections import Counter

### Whisper ships models ranging from tiny to large. Larger models transcribe more accurately but take longer to run, so we load the small model as a good trade-off between speed and accuracy.

In [None]:
# Load the "small" Whisper checkpoint once; `model` is reused by the transcription cell below.
model = whisper.load_model("small")

### This is the path of the input video used here. Change it to point to your own input video.

In [None]:
# Video to analyse; read by the audio-extraction cell and the body-language cell.
input_video = "/content/WIN_20241211_21_41_16_Pro.mp4"

In [None]:
def convert_video_to_audio(video_path, audio_output_path):
    """Extract the audio track of a video and write it to ``audio_output_path``.

    The track is written with the ``pcm_s16le`` codec. Failures are reported
    on stdout instead of being raised, so the caller is never interrupted.

    Args:
        video_path: Path of the source video file.
        audio_output_path: Destination path of the audio file.

    Returns:
        None.
    """
    try:
        video = VideoFileClip(video_path)
        audio = None
        try:
            audio = video.audio
            if audio is None:
                # Without this check the failure shows up as an opaque
                # "'NoneType' object has no attribute ..." message.
                print(f"An error occurred: no audio track found in {video_path}")
                return
            audio.write_audiofile(audio_output_path, codec='pcm_s16le')

            print(f"Audio file has been saved to: {audio_output_path}")
        finally:
            # Close the clips even when extraction fails, so readers and file
            # handles are not leaked.
            video.close()
            if audio is not None:
                audio.close()
    except Exception as e:
        print(f"An error occurred: {e}")
# Extract the audio of the input video into "audio.wav"; later cells open
# that file by its literal name.
video_path = input_video
audio_output_path = "audio.wav"
convert_video_to_audio(video_path, audio_output_path)


### Using the model to convert speech to text

In [None]:
# Transcribe the extracted audio; the recognised text is read from the 'text'
# entry of the returned mapping.
# NOTE: the name `result` is rebound later by the grammar-check cell.
result = model.transcribe("audio.wav")
transcription = result['text']

print(f"Transcription: {transcription}")

### Checking for grammatical errors. Because the tool also flags proper nouns as grammatical errors, we exclude the errors that involve proper nouns from the total.

In [None]:
# spaCy English pipeline; passed to contains_proper_noun() below for
# part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

# LanguageTool HTTP endpoint that performs the grammar check.
url = "https://api.languagetool.org/v2/check"

text = transcription

params = {
    'text': text,
    'language': 'en-GB',
}

# The timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() reports HTTP errors here instead of letting them surface
# later as a confusing JSON decoding error or KeyError.
response = requests.post(url, data=params, timeout=30)
response.raise_for_status()

# NOTE: this rebinds `result`, which previously held the Whisper output.
result = response.json()

def contains_proper_noun(match, nlp):
    """Tell whether the text flagged by a grammar-check match is a proper noun.

    Only tokens that overlap the flagged span (``context['offset']`` /
    ``context['length']``) are inspected, so a name that merely appears
    somewhere else in the context sentence does not hide a genuine error.
    When the span information is absent, the whole context text is inspected.

    Args:
        match: One entry of the ``matches`` list; must provide
            ``match['context']['text']``.
        nlp: Callable turning text into tokens that expose ``pos_``, ``idx``
            (character offset) and ``text``.

    Returns:
        True if a token tagged ``PROPN`` overlaps the flagged span, else False.
    """
    context = match['context']
    context_text = context['text']
    span_start = context.get('offset', 0)
    span_length = context.get('length')
    if span_length is None:
        span_end = len(context_text)
    else:
        span_end = span_start + span_length

    for token in nlp(context_text):
        if token.pos_ != 'PROPN':
            continue
        token_start = token.idx
        token_end = token_start + len(token.text)
        # Half-open interval overlap test between token and flagged span.
        if token_start < span_end and token_end > span_start:
            return True
    return False

# Show the checked text, then every issue the grammar checker reported.
print("Original Text:", text)
print("\nErrors and Suggestions:")


for match in result['matches']:
    print(f"Error: {match['message']}")
    print(f"Suggested Correction(s): {match['replacements']}")
    print(f"Context: {match['context']['text']}")

# Tally the reported issues that contains_proper_noun() does not attribute
# to a proper noun.
error_count = sum(
    1 for match in result['matches'] if not contains_proper_noun(match, nlp)
)

print(f"\nTotal Errors (excluding proper nouns): {error_count}")

### Checking for pauses and speaking rate


In [None]:
# Open the extracted audio with parselmouth so Praat commands can be run on it.
sound = parselmouth.Sound("audio.wav")

duration = call(sound, "Get total duration")

# Label stretches of the recording as "silent" or "sounding".
# NOTE(review): the numeric arguments are passed positionally to Praat's
# "To TextGrid (silences)" command; presumably (minimum pitch, time step,
# silence threshold in dB, minimum silent interval, minimum sounding
# interval) - confirm against the Praat manual.
silences = call(sound, "To TextGrid (silences)", 100, 0.1, -25, 0.1, 0.05, "silent", "sounding")

# Number of intervals in tier 1 labelled "silent"; reported as the pause count.
n_pauses = call(silences, "Count intervals where", 1, "is equal to", "silent")

# Words are whitespace-separated tokens of the transcription; the rate is
# words divided by the total duration.
n_words = len(transcription.split())
speaking_rate = n_words / duration

print(f"Total Duration: {duration} seconds")
print(f"Number of Pauses: {n_pauses}")
print(f"Speaking Rate: {speaking_rate} words/second")


### Checking for filler words

In [None]:
# Words and phrases counted as fillers. Ordinary words such as "like" and
# "so" are counted even when they are used legitimately.
filler_words = ["uh", "um", "like", "you know", "er", "ah", "so", "actually", "basically"]

# Escape every entry so a filler containing regex metacharacters stays literal.
pattern = r'\b(' + '|'.join(re.escape(word) for word in filler_words) + r')\b'

# Lower-case the hits: with IGNORECASE the regex returns the text as it was
# written, so "Um" and "um" would otherwise be tallied as different fillers.
matches = [hit.lower() for hit in re.findall(pattern, transcription, flags=re.IGNORECASE)]

print(f"Filler words found: {matches}")

word_counts = Counter(matches)
print(f"Filler word counts: {word_counts}")
filler_word_count = sum(word_counts.values())
print(f"total number of filler words: {filler_word_count}")

### Checking the pronunciation by comparing the original audio with synthesised audio. We use the Edge text-to-speech model because it has a more human-like tone and is free of charge.

In [None]:
def synthesize_audio_from_text(text_input, output_file, gender):
    """Synthesize speech for ``text_input`` with the ``edge-tts`` command-line tool.

    Args:
        text_input: Text to be spoken.
        output_file: Path the synthesized audio is written to.
        gender: ``"female"`` selects the ``en-IN-NeerjaNeural`` voice; any
            other value selects ``en-IN-PrabhatNeural``.

    Returns:
        ``output_file`` on success, or ``None`` when the tool cannot be
        started or exits with an error (the error is printed).
    """
    voice = "en-IN-NeerjaNeural" if gender == "female" else "en-IN-PrabhatNeural"
    # Run without a shell and with an argument list: the transcription is
    # arbitrary text, and quotes, `$` or backticks in it would otherwise break
    # the command line or be executed by the shell. The `--option=value` form
    # keeps text that starts with "-" from being read as an option.
    command = [
        "edge-tts",
        f"--text={text_input}",
        f"--write-media={output_file}",
        f"--voice={voice}",
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable edge-tts binary, which
        # the shell-based call used to report as a non-zero exit status.
        print(f"Error synthesizing audio: {e}")
        return None
    return output_file

def compare_audio_files(original_audio_file, synthesized_audio_file):
    """Compare a recording with a synthesized rendition of the same text.

    Both files are loaded at their native sample rate. Three things are
    measured: the mean cosine distance between their 13-coefficient MFCC
    frames, the length of each signal, and the standard deviation of each
    signal's YIN pitch track.

    Args:
        original_audio_file: Path of the speaker's recording.
        synthesized_audio_file: Path of the synthesized reference audio.

    Returns:
        A dict with the keys ``avg_distance``, ``speech_rate_original``,
        ``speech_rate_synthesized``, ``pitch_variation_original`` and
        ``pitch_variation_synthesized``; ``None`` if anything fails (the
        error is printed).
    """
    try:
        original_audio, sr_original = librosa.load(original_audio_file, sr=None)
        synthesized_audio, sr_synthesized = librosa.load(synthesized_audio_file, sr=None)

        mfcc_original = librosa.feature.mfcc(y=original_audio, sr=sr_original, n_mfcc=13)
        mfcc_synthesized = librosa.feature.mfcc(y=synthesized_audio, sr=sr_synthesized, n_mfcc=13)

        # Mean over every (original frame, synthesized frame) pair.
        # NOTE: the pairwise matrix grows with the product of both frame
        # counts, so very long recordings need a lot of memory here.
        distance = cdist(mfcc_original.T, mfcc_synthesized.T, 'cosine')
        avg_distance = np.mean(distance)

        # Despite the key names, these are signal lengths in seconds
        # (sample count divided by sample rate).
        speech_rate_original = len(original_audio) / sr_original
        speech_rate_synthesized = len(synthesized_audio) / sr_synthesized

        fmin = librosa.note_to_hz('C1')
        fmax = librosa.note_to_hz('C8')
        # Pass the real sample rate: the audio was loaded with sr=None, and
        # yin() would otherwise assume its default rate and scale every pitch
        # estimate by the wrong factor.
        pitch_original = librosa.yin(original_audio, fmin=fmin, fmax=fmax, sr=sr_original)
        pitch_synthesized = librosa.yin(synthesized_audio, fmin=fmin, fmax=fmax, sr=sr_synthesized)
        pitch_variation_original = np.std(pitch_original)
        pitch_variation_synthesized = np.std(pitch_synthesized)

        result = {
            "avg_distance": avg_distance,
            "speech_rate_original": speech_rate_original,
            "speech_rate_synthesized": speech_rate_synthesized,
            "pitch_variation_original": pitch_variation_original,
            "pitch_variation_synthesized": pitch_variation_synthesized,
        }
        return result
    except Exception as e:
        print(f"Error comparing audio files: {e}")
        return None

def comp_pronun(transcription, gender="female",
                original_audio_input="audio.wav",
                synthesized_audio_output="synthesized_audio.wav"):
    """Synthesize the transcription and compare it with the original recording.

    Args:
        transcription: Text to synthesize as the reference pronunciation.
        gender: Voice selector forwarded to ``synthesize_audio_from_text``.
        original_audio_input: Path of the speaker's recording. Defaults to
            the file name used by the rest of the notebook.
        synthesized_audio_output: Path the synthesized audio is written to.

    Returns:
        The metrics dict produced by ``compare_audio_files``, or ``None`` if
        synthesis or comparison failed.
    """
    synthesized = synthesize_audio_from_text(transcription, synthesized_audio_output, gender)
    if not synthesized:
        return None

    return compare_audio_files(original_audio_input, synthesized_audio_output)

# Run the pronunciation comparison; result_dict is None when it failed, in
# which case nothing is printed here.
gender = "female"
result_dict = comp_pronun(transcription, gender)
if result_dict:
    print(result_dict)

### Checking posture, gestures and eye contact using OpenCV and MediaPipe

In [None]:
# Short aliases for the MediaPipe solution modules used by the scorer and the
# video-processing function below.
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_face_mesh = mp.solutions.face_mesh

class BodyLanguageScorer:
    """Accumulate posture, gesture and eye-contact scores over video frames.

    Call :meth:`analyze` once per frame. Posture and eye contact are reported
    as the mean of the per-frame scores seen so far, so the final value
    describes the whole video rather than only its last frame. The gesture
    score is capped at 100 to stay on the same 0-100 scale as the others.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Current scores; eye contact starts at 100 and posture/gestures at 0
        # until a frame provides the corresponding landmarks.
        self.scores = {"posture": 0, "gestures": 0, "eye_contact": 100}
        # Total number of detected hands over all analyzed frames.
        self.gesture_count = 0
        # Landmark distance above which a frame's eye-contact score is reduced.
        self.eye_movement_threshold = 0.05
        # Running totals behind the posture / eye-contact means.
        self._posture_total = 0.0
        self._posture_frames = 0
        self._eye_contact_total = 0.0
        self._eye_contact_frames = 0

    def analyze(self, pose_landmarks, hand_landmarks, face_landmarks):
        """Update the scores with the detections of one frame.

        Args:
            pose_landmarks: Indexable pose landmarks exposing ``.y``, or a
                falsy value when no pose was detected.
            hand_landmarks: Sequence with one entry per detected hand, or a
                falsy value when no hand was detected.
            face_landmarks: Sequence of faces whose ``.landmark`` items expose
                ``.x`` / ``.y``, or a falsy value when no face was detected.
        """
        # Posture: the smaller the vertical offset between the shoulders, the
        # higher the frame score.
        if pose_landmarks:
            left_shoulder = pose_landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER.value]
            right_shoulder = pose_landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER.value]
            shoulder_diff = abs(left_shoulder.y - right_shoulder.y)
            frame_posture = max(0, 1 - shoulder_diff) * 100
            self._posture_total += frame_posture
            self._posture_frames += 1
            self.scores["posture"] = self._posture_total / self._posture_frames

        # Gestures: 25 points per detected hand, capped so the score cannot
        # exceed 100.
        if hand_landmarks:
            self.gesture_count += len(hand_landmarks)
            self.scores["gestures"] = min(100, self.gesture_count * 25)

        # Eye contact: based on the distance between face landmarks 33 and 133.
        # NOTE(review): these two indices look like the two corners of the same
        # eye in the FaceMesh topology, which would make this an eye-width
        # measure rather than a gaze measure - confirm against MediaPipe docs.
        if face_landmarks:
            left_eye = face_landmarks[0].landmark[33]
            right_eye = face_landmarks[0].landmark[133]
            eye_distance = np.linalg.norm([left_eye.x - right_eye.x, left_eye.y - right_eye.y])
            if eye_distance > self.eye_movement_threshold:
                frame_eye_contact = max(0, 100 - eye_distance * 100)
            else:
                frame_eye_contact = 100
            self._eye_contact_total += frame_eye_contact
            self._eye_contact_frames += 1
            self.scores["eye_contact"] = self._eye_contact_total / self._eye_contact_frames

    def get_scores(self):
        """Return the dict holding the current posture, gestures and eye_contact scores."""
        return self.scores

def process_video_optimized(video_path):
    """Score body language on a sample of the video's frames and print the result.

    Every ``frame_skip``-th frame is analyzed, where ``frame_skip`` is
    ``max(1, total_frames // 200)``. Each sampled frame is run through the
    MediaPipe pose, hands and face-mesh models and the detections are fed to
    a ``BodyLanguageScorer``.

    Args:
        video_path: Path of the video file to analyze.

    Returns:
        None. The scores (and a warning for low posture / eye-contact scores)
        are printed; an error line is printed if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Otherwise an unreadable file would silently yield default scores.
            print(f"Error: could not open video: {video_path}")
            return

        scorer = BodyLanguageScorer()

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_skip = max(1, total_frames // 200)

        with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose, \
             mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands, \
             mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5) as face_mesh:

            for frame_idx in range(0, total_frames, frame_skip):
                # Seek straight to the sampled frame instead of decoding every frame.
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()
                if not ret:
                    break

                # The models are fed RGB images; OpenCV decodes frames as BGR.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                pose_results = pose.process(image)
                hands_results = hands.process(image)
                face_results = face_mesh.process(image)

                scorer.analyze(
                    pose_results.pose_landmarks.landmark if pose_results.pose_landmarks else None,
                    hands_results.multi_hand_landmarks,
                    face_results.multi_face_landmarks
                )
    finally:
        # Release the capture even if a model raises part-way through.
        cap.release()

    scores = scorer.get_scores()
    if scores["posture"] < 60 or scores["eye_contact"] < 70:
        print("Warning: Poor posture or eye contact detected!")

    print("Final Scores:")
    print(f"Posture: {scores['posture']:.2f}")
    print(f"Gestures: {scores['gestures']:.2f}")
    print(f"Eye Contact: {scores['eye_contact']:.2f}")

# Run the body-language analysis on the input video; the scores are printed
# by the function itself.
video_path = input_video
process_video_optimized(video_path)


### Calculating the pronunciation score

In [None]:
def normalize(x, min_val, max_val):
    """Map ``x`` linearly so that ``min_val`` becomes 0 and ``max_val`` becomes 1.

    Values outside the range map outside [0, 1]. ``min_val == max_val``
    raises ``ZeroDivisionError``.
    """
    return (x - min_val) / (max_val - min_val)


def _clamp_unit(value):
    """Limit ``value`` to the closed interval [0, 1]."""
    return min(1.0, max(0.0, value))


def calculate_pronunciation_score(mfcc_dist,
                                  word_rate_actual, word_rate_ideal,
                                  pitch_var_actual, pitch_var_ideal,
                                  mfcc_range=(0, 5),
                                  word_rate_range=(0, 10),
                                  pitch_var_range=(0, 4000),
                                  weights=(0.5, 0.3, 0.2)):
    """Combine three similarity measures into a pronunciation score.

    Each measure is turned into a component in [0, 1] (1 is best) and the
    components are combined with ``weights``. Components are clamped, so a
    measurement outside its expected range can no longer push the score
    below 0 or above 100 (given non-negative weights that sum to 1).

    Args:
        mfcc_dist: MFCC distance between actual and reference audio; lower
            is better.
        word_rate_actual: Rate measure of the actual audio.
        word_rate_ideal: Rate measure of the reference audio.
        pitch_var_actual: Pitch variation of the actual audio.
        pitch_var_ideal: Pitch variation of the reference audio.
        mfcc_range: ``(min, max)`` used to normalize ``mfcc_dist``.
        word_rate_range: Only the upper bound is used; it is the rate
            difference that counts as the worst case.
        pitch_var_range: Only the upper bound is used; it is the
            pitch-variation difference that counts as the worst case.
        weights: Weights of the MFCC, rate and pitch components, in that order.

    Returns:
        The weighted score scaled to 0-100 and rounded to two decimals.
    """
    # Lower distance is better, hence the inversion.
    mfcc_score = 1 - _clamp_unit(normalize(mfcc_dist, *mfcc_range))

    word_rate_diff = abs(word_rate_actual - word_rate_ideal)
    pitch_var_diff = abs(pitch_var_actual - pitch_var_ideal)

    # Smaller difference from the reference is better.
    word_rate_score = 1 - _clamp_unit(normalize(word_rate_diff, 0, word_rate_range[1]))
    pitch_var_score = 1 - _clamp_unit(normalize(pitch_var_diff, 0, pitch_var_range[1]))

    pronunciation_score = (
        weights[0] * mfcc_score +
        weights[1] * word_rate_score +
        weights[2] * pitch_var_score
    )

    final_score = pronunciation_score * 100
    return round(final_score, 2)

# comp_pronun() yields None when synthesis or comparison failed; stop with a
# clear message instead of "'NoneType' object is not subscriptable".
if not result_dict:
    raise RuntimeError(
        "Pronunciation metrics are unavailable: audio synthesis or comparison failed."
    )

# The synthesized audio acts as the reference ("ideal") for rate and pitch.
mfcc_dist = result_dict['avg_distance']
word_rate_actual = result_dict['speech_rate_original']
word_rate_ideal = result_dict['speech_rate_synthesized']
pitch_var_actual = result_dict['pitch_variation_original']
pitch_var_ideal = result_dict['pitch_variation_synthesized']

# NOTE: pitch_var_range is passed explicitly as (0, 3500) here, which differs
# from the function's default.
pronunciation_score = calculate_pronunciation_score(
    mfcc_dist,
    word_rate_actual, word_rate_ideal,
    pitch_var_actual, pitch_var_ideal,
    mfcc_range=(0, 5),
    word_rate_range=(0, 10),
    pitch_var_range=(0, 3500)
)

print(pronunciation_score)

### Calculating overall score

In [None]:
def calculate_accuracy_score(pronunciation_score, error_count, speaking_rate, pause_count, filler_word_count):
    """Blend the individual speech metrics into one 0-100 score.

    Each metric is first converted to a component (1 is best):
    pronunciation is divided by 100; errors, pauses and filler words lose
    credit linearly until a limit (20, 10 and 10 respectively) and are floored
    at 0; a speaking rate between 2 and 4 earns full credit, otherwise credit
    falls off with the distance from 3 and is floored at 0.

    Args:
        pronunciation_score: Pronunciation score on a 0-100 scale.
        error_count: Number of grammatical errors.
        speaking_rate: Speaking rate.
        pause_count: Number of pauses.
        filler_word_count: Number of filler words.

    Returns:
        The weighted sum of the components, times 100, rounded to two decimals.
    """
    weights = {
        "pronunciation": 0.4,
        "error": 0.2,
        "speaking_rate": 0.2,
        "pause": 0.1,
        "filler_word": 0.1,
    }

    # Full credit only inside the 2-4 band.
    if not (2 <= speaking_rate <= 4):
        rate_component = max(0, 1 - abs(speaking_rate - 3) / 3)
    else:
        rate_component = 1

    components = {
        "pronunciation": pronunciation_score / 100,
        "error": max(0, 1 - error_count / 20),
        "speaking_rate": rate_component,
        "pause": max(0, 1 - pause_count / 10),
        "filler_word": max(0, 1 - filler_word_count / 10),
    }

    # Iterating `weights` keeps the summation order of its declaration.
    total = sum(weights[name] * components[name] for name in weights)

    return round(total * 100, 2)

# Combine the metrics computed in the earlier cells; n_pauses is passed as
# the pause count.
accuracy_score = calculate_accuracy_score(
    pronunciation_score, error_count, speaking_rate, n_pauses, filler_word_count)

print(f"Score: {accuracy_score}")