In [1]:
!python --version

Python 3.10.11


In [4]:
!pip install librosa numpy tensorflow scipy praat-parselmouth soundfile scikit-learn

In [None]:
!pip install torch librosa praat-parselmouth transformers opensmile

In [None]:
!pip install --upgrade pyannote.audio huggingface_hub

In [None]:
!pip install pyannote.audio

In [None]:
!pip install pydub soundfile matplotlib

In [28]:
#https://www.gyan.dev/ffmpeg/builds/

import librosa
import numpy as np
import parselmouth
from transformers import pipeline
import opensmile

# Add to clas
class FreeAudioAnalyzer:
    def __init__(self):
        # Initialize models
        self.emotion_model = pipeline("audio-classification", model="superb/hubert-large-superb-er")
        self.transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.ComParE_2016,
            feature_level=opensmile.FeatureLevel.Functionals,
        )

    def analyze_audio(self, audio_path):
        # Load audio with Librosa
        y, sr = librosa.load(audio_path, sr=16000)  # Force 16kHz sample rate
        
        # Extract features
        opensmile_features = self.smile.process_signal(y, sr)
        
        return {
            "Emotion": self._get_emotion(y, sr),
            "Transcript": self._get_transcript(y, sr),
            "Pitch": self._analyze_pitch(audio_path),
            "Speech_Rate": self._analyze_speech_rate(y, sr),
            "Energy": np.mean(librosa.feature.rms(y=y)),
            "Acoustic_Features": opensmile_features.to_dict()
        }

    def _get_emotion(self, waveform, sample_rate):
        return self.emotion_model({"array": waveform, "sampling_rate": sample_rate})[0]["label"]

    def _get_transcript(self, waveform, sample_rate):
        return self.transcriber({"array": waveform, "sampling_rate": sample_rate})["text"]

    def _analyze_pitch(self, audio_path):
        sound = parselmouth.Sound(audio_path)
        pitch = sound.to_pitch()
        return {
            "mean": np.nanmean(pitch.selected_array['frequency']),
            "std": np.nanstd(pitch.selected_array['frequency'])
        }

    def _analyze_speech_rate(self, waveform, sample_rate):
        transcript = self._get_transcript(waveform, sample_rate)
        duration = len(waveform) / sample_rate
        words_per_minute = (len(transcript.split()) / duration) * 60
        return f"{round(words_per_minute)} WPM"

def _analyze_gender(self, audio_path):
    """Gender detection using pyannote (if available)"""
    try:
        from pyannote.audio import Pipeline
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", 
                                           use_auth_token="YOUR_HF_TOKEN")
        diarization = pipeline(audio_path)
        return "Male" if pitch_mean < 160 else "Female"  # Simplified
    except:
        return "Unable to detect gender"

def _analyze_texture(self, audio_path):
    """Voice quality metrics"""
    sound = parselmouth.Sound(audio_path)
    pitch = sound.to_pitch()
    return {
        "jitter": sound.to_jitter(),
        "shimmer": sound.to_shimmer(),
        "hnr": self.smile.features['hnr_sma3_amean']
    }

def _analyze_style(self, y, sr):
    """Speaking style estimation"""
    return {
        "formality_score": self._formality_score(self._get_transcript(y, sr)),
        "pitch_variation": np.nanstd(librosa.pyin(y, fmin=80, fmax=400)[0])
    }


In [29]:
# Usage
analyzer = FreeAudioAnalyzer()
results = analyzer.analyze_audio(r"CrowdCompute Workbench_2.wav")
for key, value in results.items():
    print(f"{key}: {value}")

Emotion: neu
Transcript:  You know what are you saying? You're a opinion calling me in all the talent
Pitch: {'mean': 146.79491657350437, 'std': 104.53055397372134}
Speech_Rate: 147 WPM
Energy: 0.045972879976034164
Acoustic_Features: {'audspec_lengthL1norm_sma_range': {(Timedelta('0 days 00:00:00'), Timedelta('0 days 00:00:06.134000')): 1.6805309057235718}, 'audspec_lengthL1norm_sma_maxPos': {(Timedelta('0 days 00:00:00'), Timedelta('0 days 00:00:06.134000')): 0.41419142484664917}, 'audspec_lengthL1norm_sma_minPos': {(Timedelta('0 days 00:00:00'), Timedelta('0 days 00:00:06.134000')): 0.2904290556907654}, 'audspec_lengthL1norm_sma_quartile1': {(Timedelta('0 days 00:00:00'), Timedelta('0 days 00:00:06.134000')): 0.3836374580860138}, 'audspec_lengthL1norm_sma_quartile2': {(Timedelta('0 days 00:00:00'), Timedelta('0 days 00:00:06.134000')): 0.6434361934661865}, 'audspec_lengthL1norm_sma_quartile3': {(Timedelta('0 days 00:00:00'), Timedelta('0 days 00:00:06.134000')): 0.9914230704307556}, 

In [40]:
import librosa
import numpy as np
import parselmouth
from transformers import pipeline
import opensmile
from scipy.stats import kurtosis, skew
import os

class AudioInsightAnalyzer:
    def __init__(self):
        # Initialize models
        self.emotion_model = pipeline("audio-classification", model="superb/hubert-large-superb-er")
        self.transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.ComParE_2016,
            feature_level=opensmile.FeatureLevel.Functionals,
        )

    def analyze(self, audio_path):
        """Main analysis method"""
        y, sr = librosa.load(audio_path, sr=16000)
        sound = parselmouth.Sound(audio_path)
        
        return {
            "Speaker Demographics": self._analyze_demographics(y, sr, sound),
            "Voice Characteristics": self._voice_characteristics(sound, y, sr),
            "Speech Patterns": self._speech_patterns(y, sr),
            "Emotional Context": self._emotional_context(y, sr),
            "Technical Quality": self._technical_quality(y, sr),
            "Noteworthy Observations": self._noteworthy_observations(y, sr, sound)
        }

    def _analyze_demographics(self, y, sr, sound):
        # Pitch analysis with NaN handling
        pitch = sound.to_pitch()
        pitch_values = pitch.selected_array['frequency'][~np.isnan(pitch.selected_array['frequency'])]
        
        if len(pitch_values) == 0:
            return {
                "Gender": "Unknown (non-voiced/whisper)",
                "Age": "Mature",  # Default age
                "Accent": "American"  # Default accent
            }

        mean_pitch = np.mean(pitch_values)
        std_pitch = np.std(pitch_values)
        
        return {
            "Gender": self._estimate_gender(mean_pitch, std_pitch),
            "Age": self._estimate_age(sound),
            "Accent": self._detect_accent(y, sr)
        }

    def _estimate_gender(self, mean_pitch, std_pitch):
        """Sophisticated gender estimation with pitch distribution"""
        if mean_pitch > 165 or std_pitch > 40:
            return "Female" if mean_pitch > 130 else "Male"
        elif mean_pitch < 100 and std_pitch < 25:
            return "Male"
        elif 120 < mean_pitch < 180 and std_pitch > 35:
            return "Female"
        return "Unknown"

    def _estimate_age(self, sound):
        """Age estimation using formant dispersion"""
        formants = sound.to_formant_burg()
        
        try:
            f1 = np.nanmean([formants.get_value(1, t) for t in formants.xs()])
            f2 = np.nanmean([formants.get_value(2, t) for t in formants.xs()])
            f3 = np.nanmean([formants.get_value(3, t) for t in formants.xs()])
        except:
            return "Mature"
        
        dispersion = (f3 - f2) + (f2 - f1)
        
        if dispersion > 1500: return "Under 20"
        elif 1200 < dispersion <= 1500: return "20-35"
        elif 900 < dispersion <= 1200: return "35-50"
        return "50+"

    def _detect_accent(self, y, sr):
        """Accent detection using vowel space analysis"""
        features = self.smile.process_signal(y, sr)
        
        try:
            f1_mean = features['F1frequency_sma3nz_mean']
            f2_mean = features['F2frequency_sma3nz_mean']
            vowel_space = f2_mean - f1_mean
            
            if vowel_space > 1200: return "American English"
            elif 1000 < vowel_space <= 1200: return "British English"
            elif vowel_space <= 1000: return "Other"
        except:
            return "American"

    def _voice_characteristics(self, sound, y, sr):
        """Enhanced voice analysis"""
        pitch = sound.to_pitch()
        intensity = sound.to_intensity()
        harmonicity = sound.to_harmonicity()
        
        # Pitch analysis
        pitch_values = pitch.selected_array['frequency'][~np.isnan(pitch.selected_array['frequency'])]
        pitch_mean = np.mean(pitch_values) if len(pitch_values) > 0 else 0
        pitch_label = self._classify_pitch(pitch_mean)
        
        # Volume analysis
        volume_mean = np.mean(intensity.values) if intensity.values.size > 0 else 0
        projection = "Weak" if volume_mean < 60 else "Strong" if volume_mean > 80 else "Moderate"
        
        # Voice quality
        hnr = np.mean(harmonicity.values) if harmonicity.values.size > 0 else 0
        texture = "Breathy" if hnr < 10 else "Smooth" if hnr > 20 else "Neutral"
        
        return pitch_label, projection, texture

    def _classify_pitch(self, mean_pitch):
        if mean_pitch < 85: return "Very Low"
        elif 85 <= mean_pitch < 120: return "Low"
        elif 120 <= mean_pitch < 150: return "Medium"
        elif 150 <= mean_pitch < 200: return "High"
        return "Very High"

    def _speech_patterns(self, y, sr):
        """Analyze speech patterns"""
        transcript = self.transcriber({"array": y, "sampling_rate": sr}, return_timestamps=True)["text"]
        
        pace = self._calculate_pace(transcript, len(y)/sr)
        articulation = self._analyze_articulation(y, sr)
        fluency = self._assess_fluency(transcript)
        persona = self._analyze_persona(transcript)
        style = self._analyze_style(transcript)
        
        return pace, persona, style, articulation, fluency

    def _calculate_pace(self, transcript, duration):
        """Words per minute with pause analysis"""
        word_count = len(transcript.split())
        wpm = round((word_count / duration) * 60) if duration > 0 else 0
        return f"{wpm} WPM ({'Fast' if wpm > 180 else 'Slow' if wpm < 120 else 'Moderate'})"

    def _analyze_articulation(self, y, sr):
        """Articulation index using spectral clarity"""
        spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=y))
        return "Clear" if spectral_flatness < 0.6 else "Mumbled"

    def _assess_fluency(self, transcript):
        """Fluency assessment using disfluency markers"""
        disfluencies = len([word for word in transcript.lower().split() 
                          if word in {'um', 'uh', 'like', 'ah'}])
        return "Fluent" if disfluencies < 3 else "Disfluent"

    def _analyze_persona(self, transcript):
        """Persona analysis based on language patterns"""
        confident_words = ['confident', 'assertive', 'certain', 'decisive']
        humble_words = ['humble', 'modest', 'unassuming', 'unpretentious']
        
        if any(word in transcript.lower() for word in confident_words):
            return "Confident"
        elif any(word in transcript.lower() for word in humble_words):
            return "Humble"
        else:
            return "Unknown"

    def _analyze_style(self, transcript):
        """Style analysis based on language patterns"""
        formal_words = ['therefore', 'hereby', 'henceforth', 'herein']
        casual_words = ['like', 'you know', 'kinda', 'sorta']
        
        if any(word in transcript.lower() for word in formal_words):
            return "Formal"
        elif any(word in transcript.lower() for word in casual_words):
            return "Casual"
        else:
            return "Unknown"

    def _emotional_context(self, y, sr):
        """Enhanced emotional analysis"""
        try:
            emotion_result = self.emotion_model({"array": y, "sampling_rate": sr})[0]
            dominant_emotion = emotion_result["label"]
            dominant_emotion = self._map_emotion_label(dominant_emotion)
        except:
            dominant_emotion = "Unknown"
        
        try:
            emotional_range = self._detect_emotional_range(y, sr)
        except:
            emotional_range = "Unknown"
        
        try:
            sentiment = self._analyze_sentiment(y, sr)
        except:
            sentiment = "Unknown"
        
        try:
            tone = self._analyze_tone(dominant_emotion)
        except:
            tone = "Unknown"
        
        try:
            intent = self._analyze_intent(dominant_emotion)
        
        except:
            intent = "Unknown"
        return dominant_emotion, emotional_range, sentiment, tone, intent
        
    def _map_emotion_label(self, label):
        """Map emotion label from model to human-readable label"""
        label_mapping = {
            "neu": "Neutral",
            "hap": "Happy",
            "ang": "Angry",
            "sad": "Sad",
            "fear": "Fearful",
            "sur": "Surprised",
            "dis": "Disgusted"
        }
        return label_mapping.get(label, "Unknown")

    def _detect_emotional_range(self, y, sr):
        """Detect emotional variability"""
        results = self.emotion_model({"array": y, "sampling_rate": sr})
        top_emotions = [res["score"] for res in results[:3]]
        return "Narrow" if np.std(top_emotions) < 0.2 else "Broad"

    def _analyze_sentiment(self, y, sr):
        """Sentiment analysis through acoustic features"""
        features = self.smile.process_signal(y, sr)
        try:
            pitch_std = features['F0final_sma_stddev']
            intensity_var = features['Loudness_sma3_stddev']
            if pitch_std > 40 and intensity_var > 8:
                return "Negative"
            elif pitch_std < 25 and intensity_var < 5:
                return "Positive"
            return "Neutral"
        except:
            return "Unknown"

    def _analyze_tone(self, emotion):
        """Tone analysis based on dominant emotion"""
        positive_emotions = ['happy', 'excited', 'joyful']
        negative_emotions = ['angry', 'sad', 'fearful']
        
        if emotion in positive_emotions:
            return "Upbeat"
        elif emotion in negative_emotions:
            return "Tense"
        else:
            return "Calm"

    def _analyze_intent(self, emotion):
        """Intent analysis based on dominant emotion"""
        persuasive_emotions = ['angry', 'excited', 'joyful']
        informative_emotions = ['neutral', 'calm', 'fearful']
        
        if emotion in persuasive_emotions:
            return "Persuasive"
        elif emotion in informative_emotions:
            return "Informative"
        else:
            return "Unknown"

    def _technical_quality(self, y, sr):
        """Enhanced technical analysis"""
        clarity = self._assess_clarity(y, sr)
        noise_floor = self._calculate_noise_floor(y)
        dynamic_range = self._calculate_dynamic_range(y)
        
        return clarity, noise_floor, dynamic_range

    def _assess_clarity(self, y, sr):
        stft = np.abs(librosa.stft(y))
        contrast = librosa.feature.spectral_contrast(S=stft, sr=sr)
        return "Clear" if np.mean(contrast) > 5 else "Muffled"

    def _calculate_noise_floor(self, y):
        rms = librosa.feature.rms(y=y)[0]
        return f"{np.percentile(rms, 10):.1f} dBFS"

    def _calculate_dynamic_range(self, y):
        return f"{np.max(y) - np.min(y):.2f} dB"

    def _noteworthy_observations(self, y, sr, sound):
        """Enhanced observations"""
        obs = []
        
        # Pitch anomalies
        pitch = sound.to_pitch()
        pitch_values = pitch.selected_array['frequency'][~np.isnan(pitch.selected_array['frequency'])]
        if len(pitch_values) > 0:
            if np.max(pitch_values) > 300:
                obs.append("Extreme high pitches detected")
            if np.min(pitch_values) < 70:
                obs.append("Very low pitches detected")
        
        # Temporal features
        tempo = librosa.beat.tempo(y=y, sr=sr)[0]
        if tempo > 160:
            obs.append("Rapid speech tempo")
        elif tempo < 80:
            obs.append("Slow speech tempo")
            
        return obs if obs else None

def print_insights(results, filename):
    """Formatted output"""
    print(f"\n{os.path.basename(filename)}")
    speaker_demographics = results["Speaker Demographics"]
    voice_characteristics = results["Voice Characteristics"]
    speech_patterns = results["Speech Patterns"]
    emotional_context = results["Emotional Context"]
    technical_quality = results["Technical Quality"]
    observations = results["Noteworthy Observations"]
    
    gender, age, accent = speaker_demographics.values()
    pitch, projection, texture = voice_characteristics
    pace, persona, style, articulation, fluency = speech_patterns
    emotion, range_, sentiment, tone, intent = emotional_context
    clarity, noise_floor, dynamic_range = technical_quality
    
    print(f"Gender: [{gender}]. Age: [{age}]. Accent: [{accent}]. Pitch: [{pitch}]. Projection: [{projection}]. Texture: [{texture}]. Pace: [{pace}]. Persona: [{persona}]. Style: [{style}]. Emotion: [{emotion}]. Tone: [{tone}]. Intent: [{intent}]. Acoustic Quality: [{clarity}]. Noteworthy (optional): [{observations}].")
    

In [33]:
# Usage
if __name__ == "__main__":
    analyzer = AudioInsightAnalyzer()
    audio_file = "ENUS_AUDIO_4.wav"
    results = analyzer.analyze(audio_file)
    print_insights(results, audio_file)


ENUS_AUDIO_4.wav
Gender: [Male]. Age: [Mature]. Accent: [American]. Pitch: [Very Low]. Projection: [Weak]. Texture: [Breathy]. Pace: [125 WPM (Moderate)]. Persona: [Neutral]. Style: [Casual]. Emotion: [Happy]. Tone: [Calm]. Intent: [Unknown]. Acoustic Quality: [Clear]. Noteworthy (optional): [['Very low pitches detected']].


# Separate - New - Effective

In [None]:
!pip install speechbrain


In [38]:
from speechbrain.pretrained import EncoderClassifier

from transformers import pipeline
import librosa
import numpy as np

import logging

# Set logging level to ERROR or higher to suppress INFO messages
logging.getLogger('speechbrain').setLevel(logging.ERROR)

def identify_accent(audio_path):
    # Load the model
    model = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa")
    
    try:
        # Get the prediction
        predictions, scores, indices, labels = model.classify_file(audio_path)
        
        # Extract the highest confidence score and convert to percentage
        confidence = float(scores[0]) * 100
        
        # Get the predicted accent (convert to uppercase)
        accent = labels[0].upper()
        
        # Print formatted result
        print(f"{accent} ({confidence:.2f}%)")
        
    except Exception as e:
        print("Error:", e)

def identify_gender(audio_file):
    # Load the pipeline
    pipe = pipeline("audio-classification", model="alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech")
    
    # Load and process audio file
    data, sampling_rate = librosa.load(audio_file, sr=None)
    data = librosa.to_mono(data)
    input_values = librosa.resample(np.asarray(data), orig_sr=sampling_rate, target_sr=16000)
    
    # Get classification
    output = pipe(input_values)
    
    # Format the output
    female_score = next(x['score'] for x in output if x['label'] == 'female')
    male_score = next(x['score'] for x in output if x['label'] == 'male')
    
    # Determine dominant gender and order scores
    if male_score > female_score:
        dominant_gender = 'Male'
        first_letter, first_score = 'M', male_score
        second_letter, second_score = 'F', female_score
    else:
        dominant_gender = 'Female'
        first_letter, first_score = 'F', female_score
        second_letter, second_score = 'M', male_score
    
    # Print formatted result with dominant score first
    print(f"{dominant_gender} ({first_letter}: {first_score:.2%}, {second_letter}: {second_score:.2%})")

# Usage
audio_path = "CrowdCompute Workbench_2.wav"
identify_accent(audio_path)
identify_gender(audio_path)

US (63.69%)
Female (F: 60.74%, M: 39.26%)


In [39]:
# Usage
if __name__ == "__main__":
    analyzer = AudioInsightAnalyzer()
    audio_file = "ENUS_AUDIO_4.wav"
    results = analyzer.analyze(audio_file)
    print_insights(results, audio_file)
    identify_accent(audio_file)
    identify_gender(audio_file)


ENUS_AUDIO_4.wav
Gender: [Male]. Age: [Mature]. Accent: [American]. Pitch: [Very Low]. Projection: [Weak]. Texture: [Breathy]. Pace: [125 WPM (Moderate)]. Persona: [Neutral]. Style: [Casual]. Emotion: [Happy]. Tone: [Calm]. Intent: [Unknown]. Acoustic Quality: [Clear]. Noteworthy (optional): [['Very low pitches detected']].
US (72.90%)
Male (M: 99.00%, F: 1.00%)


In [None]:
# Usage
if __name__ == "__main__":
    analyzer = AudioInsightAnalyzer()
    audio_file = "CrowdCompute Workbench_2.wav"
    results = analyzer.analyze(audio_file)
    print_insights(results, audio_file)
    identify_accent(audio_file)
    # identify_gender(audio_file)


CrowdCompute Workbench_2.wav
Gender: [Female]. Age: [Mature]. Accent: [American]. Pitch: [Medium]. Projection: [Moderate]. Texture: [Breathy]. Pace: [196 WPM (Fast)]. Persona: [Neutral]. Style: [Casual]. Emotion: [Neutral]. Tone: [Calm]. Intent: [Unknown]. Acoustic Quality: [Clear]. Noteworthy (optional): [['Extreme high pitches detected', 'Very low pitches detected']].
US (63.69%)
Female
(F: 60.74%, M: 39.26%)
