In [11]:
from openai import OpenAI
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
# (presumably this is where the OpenAI API key comes from — TODO confirm).
load_dotenv()
# Module-level OpenAI client, constructed with no explicit arguments; transcribe() uses it.
client = OpenAI()
def transcribe(path):
  """Transcribe the audio file at ``path`` with the ``whisper-1`` model.

  Args:
    path: Filesystem path of the audio file; it is opened in binary mode.

  Returns:
    Whatever ``client.audio.transcriptions.create`` returns for
    ``response_format="text"`` (a plain string in this notebook's recorded output).

  Raises:
    OSError: If ``path`` cannot be opened.
  """
  # The context manager guarantees the file handle is released even when the
  # API call raises; a bare open() here would leak the descriptor.
  with open(path, "rb") as audio_file:
    return client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file,
      response_format="text",
    )


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures

def _inverse_linear_score(value, ideal_min, ideal_max):
    """Map ``value`` to a 0-100 score where smaller values score higher.

    Returns 100 below ``ideal_min``, 0 above ``ideal_max`` and interpolates
    linearly (100 at ``ideal_min`` down to 0 at ``ideal_max``) in between.
    """
    if value < ideal_min:
        return 100
    if value > ideal_max:
        return 0
    return 100 * (1 - (value - ideal_min) / (ideal_max - ideal_min))


def audio_analysis(path, silence_ratio=0.05):
    """Score a recording on zero-crossing rate and on how much of it is pause.

    Short-term features are extracted with a 50 ms window and a 25 ms step.
    Frames whose energy exceeds ``silence_ratio * max(energy)`` are treated as
    voiced; every other frame counts as a pause.

    Args:
        path: Path of the audio file to analyse.
        silence_ratio: Fraction of the peak frame energy used as the silence
            threshold. Defaults to 0.05 (5% of the maximum energy).

    Returns:
        Tuple ``(zcr_score, pause_score)``, both in the range 0-100.
        ``zcr_score`` is 100 when the mean ZCR of the voiced frames is below
        0.01, 0 when it is above 0.1, and linear in between; it is 0.0 when no
        frame is voiced. ``pause_score`` is 100 when no time is spent pausing
        and decreases linearly to 0 as the paused share approaches 100%.

    Raises:
        ValueError: If the loader yields an empty signal or a non-positive
            sampling rate.
    """
    # Fs: sampling rate, x: audio signal
    Fs, x = audioBasicIO.read_audio_file(path)

    # NOTE(review): the loader appears to report failure with an empty signal
    # rather than raising — confirm. Either way the durations below would
    # divide by zero, so fail loudly here.
    if np.size(x) == 0 or Fs <= 0:
        raise ValueError(f"could not read any audio samples from {path!r}")

    # Convert to mono if the audio is stereo
    if x.ndim > 1:
        x = x.mean(axis=1)

    # Step size (25 ms) and window size (50 ms), converted from seconds to samples
    step_size = int(0.025 * Fs)
    window_size = int(0.05 * Fs)

    # Feature names are not needed; rows are addressed by index below.
    F, _ = ShortTermFeatures.feature_extraction(x, Fs, window_size, step_size)

    zcr = F[0]     # Zero-crossing rate
    energy = F[1]  # Short-term energy

    silence_threshold = silence_ratio * np.max(energy)

    # Pauses are the complement of the voiced frames, so every frame is
    # counted exactly once (an all-silent recording is all pause).
    voiced_regions = energy > silence_threshold
    pause_frames = np.count_nonzero(~voiced_regions)
    total_pause_duration = pause_frames * (step_size / Fs)

    if voiced_regions.any():
        average_zcr = np.mean(zcr[voiced_regions])
        zcr_score = _inverse_linear_score(average_zcr, 0.01, 0.1)
    else:
        # Nothing voiced: averaging an empty selection would yield NaN.
        zcr_score = 0.0

    # Length of the audio signal divided by the sampling rate
    total_speech_duration = len(x) / Fs

    # Share of the recording spent pausing. This must be the pause fraction
    # itself (not its complement) so that fewer pauses give a higher score.
    pause_time_percentage = total_pause_duration / total_speech_duration * 100

    # Ideal range: 0% (no pauses) scores 100, 100% (all pause) scores 0.
    pause_score = _inverse_linear_score(pause_time_percentage, 0, 100)

    return zcr_score, pause_score


In [13]:
# Audio file handed to transcribe() and audio_analysis() in the following cells.
path = "test.wav"

In [14]:
# Transcribe the recording; the returned text is displayed as this cell's output.
transcribe(path)

'So, the problem wants us to give the index of two elements from the list such that the sum of the numbers at that index is equal to the provided target. Easiest way to do this is the brute force method which is run a loop in a loop to check for the sum is equal to the target. However, this solution is a O of n squared solution, but the question asks us to look for a better solution than O of n square. So, the next approach would be to use a hash map or a dictionary. We store the seen elements as the keys with the index as the values. We continue looping through the list till we find the element whose complement aka the target minus number is in the hash map. When we find such an element, we return the index of the element and its complement. We are using a hash map because the lookup of a hash map function is constant time. The solution has time complexity of O of n making it a better solution.\n'

In [None]:
# audio_analysis returns (zcr_score, pause_score); they are bound here to
# avg_fluency and avg_pausing respectively.
avg_fluency, avg_pausing = audio_analysis(path)
print(avg_fluency, avg_pausing)