In [2]:
import os
from itertools import product

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import speech_recognition as sr
from scipy.io import wavfile
from scipy.signal import find_peaks
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
# Root folder holding one sub-folder per disfluency category.
DISFLUENCY_DIR = 'audio samples/disfluency'


def _list_wav_files(category):
    """Return the .wav paths found in DISFLUENCY_DIR/<category>, sorted by name.

    os.listdir() returns entries in arbitrary order, so the names are sorted to
    keep every later step (label alignment, seeded train/test split) identical
    from run to run. Entries that are not .wav files are skipped, matching the
    filter already applied to the UCLASS audio listing.
    """
    folder = f'{DISFLUENCY_DIR}/{category}'
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.lower().endswith('.wav')
    ]


repetition_wav_files = _list_wav_files('repetition')
stutter_wav_files = _list_wav_files('stutter')
pause_wav_files = _list_wav_files('pause')
stutter_pause_wav_files = _list_wav_files('stutter_pause')
noise_wav_files = _list_wav_files('noise')

## Detecting Repetitions using Google Speech Recognition

In [None]:
# Initialize the recognizer class (for recognizing the speech).
r = sr.Recognizer()
# Holds the most recent successful transcript; stays "" if nothing is recognized.
sentence = ""
for audio_path in repetition_wav_files:
    # Read the whole clip; the path keeps its own name (audio_path) so the
    # loop variable is no longer overwritten by the recorded audio object.
    with sr.AudioFile(audio_path) as source:
        audio = r.record(source)
    # Use the recognize_google() function to transcribe the recorded audio.
    try:
        sentence = r.recognize_google(audio)
        print('You  said :  ' + sentence)
    except Exception as e:
        # Best effort: report and move on to the next clip. The file name and
        # the exception repr are shown because some recognizer errors carry an
        # empty message, which made the old output just "Error:".
        print(f'Error:  {audio_path}: {e!r}')

In [None]:
from collections import Counter
import re

# Count how often each whitespace-separated word occurs in `sentence`
# (the transcript left behind by the recognition cell).
Counter(word for word in sentence.split())

## Trial 1 : Feature Engineering in audio files (chatGPT)

In [None]:
# Every disfluency clip in one flat list, grouped in this category order:
# repetition, stutter, pause, stutter+pause, noise.
all_disfluencies_files = [
    *repetition_wav_files,
    *stutter_wav_files,
    *pause_wav_files,
    *stutter_pause_wav_files,
    *noise_wav_files,
]

In [None]:
def generate_spectrogram(audio_file):
    """Show the mel spectrogram of one audio file on a dB scale.

    The clip is loaded with librosa's default settings, converted to a mel
    power spectrogram, rescaled to decibels relative to its maximum value and
    drawn in a 10x4 figure with a colorbar. Nothing is returned.
    """
    samples, sample_rate = librosa.load(audio_file)

    # Mel power spectrogram expressed in dB relative to the loudest bin.
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=samples, sr=sample_rate),
        ref=np.max,
    )

    # Time on the x axis, mel frequency on the y axis.
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    plt.show()

def plot_wav(filename):
    """Plot the waveform (amplitude over time) of a WAV file.

    Parameters
    ----------
    filename : str
        Path of the WAV file; it is read with scipy.io.wavfile.read and also
        used as the plot title.
    """
    # read the file
    samplerate, data = wavfile.read(filename)
    # Build the time axis from the sample index so it always has exactly
    # len(data) points. The previous np.arange(0, duration, 1/samplerate) can
    # come out one element too long or short because of floating-point
    # rounding of the step, and plt.plot then fails on the shape mismatch.
    time = np.arange(len(data)) / samplerate
    # plot amplitude (or loudness) over time
    plt.plot(time, data)
    plt.xlabel('Time [s]')
    plt.ylabel('Amplitude')
    # put a label with the file name
    plt.title(filename)
    plt.show()


In [None]:
# Draw the waveform of every collected clip, one figure per file.
for wav_path in all_disfluencies_files:
    plot_wav(wav_path)

In [None]:
for audio_file in all_disfluencies_files:
    # Deliberately not named `y, sr`: at notebook level `sr` is the
    # speech_recognition module alias, and rebinding it to a sample rate
    # breaks the recognizer cell when it is run again.
    waveform, sample_rate = librosa.load(audio_file)
    # Extract energy feature (peak of the frame-wise RMS)
    energy = np.max(librosa.feature.rms(y=waveform))
    # Extract pitch feature (mean over the strictly positive piptrack values)
    pitches, _ = librosa.piptrack(y=waveform, sr=sample_rate)
    pitch_mean = np.mean(pitches[pitches > 0])
    print(f"{energy=} , {pitch_mean=} , for {audio_file=}")

In [None]:
def extract_features(audio_file):
    """Return ``[energy, pitch_mean]`` for one audio file.

    Parameters
    ----------
    audio_file : str
        Path handed to ``librosa.load`` (default loading settings).

    Returns
    -------
    list
        ``energy`` is the mean of the frame-wise RMS values. ``pitch_mean`` is
        the mean of all strictly positive values returned by
        ``librosa.piptrack``, or ``0.0`` when there are none.
    """
    y, sr = librosa.load(audio_file)

    # Extract energy feature
    energy = np.mean(librosa.feature.rms(y=y))

    # Extract pitch feature
    pitches, _ = librosa.piptrack(y=y, sr=sr)
    voiced = pitches[pitches > 0]
    # np.mean of an empty selection yields NaN (plus a RuntimeWarning), which
    # would then leak into the feature matrix; report "no pitch" as 0.0.
    pitch_mean = np.mean(voiced) if voiced.size else 0.0

    return [energy, pitch_mean]

def extract_features_from_dataset(audio_files):
    """Run extract_features on every path and stack the results in one array.

    Each row of the returned NumPy array is the feature list produced for the
    file at the same position in ``audio_files``.
    """
    return np.array([extract_features(path) for path in audio_files])

def extract_mfcc(audio_file):
    """Return the 13-coefficient MFCC matrix of one audio file.

    The clip is loaded with librosa's default settings and the result of
    ``librosa.feature.mfcc`` is returned unchanged (no padding or truncation).
    """
    samples, sample_rate = librosa.load(audio_file)
    return librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=13)

def extract_mfcc_from_dataset(audio_files):
    """Return a list holding the extract_mfcc result of every path, in input order."""
    return [extract_mfcc(path) for path in audio_files]

In [None]:
# One label per entry of all_disfluencies_files, in the same category order
# (repetition, stutter, pause, stutter+pause, noise). The repetition clips come
# first in that list, so they need labels too; without them every label is
# shifted onto the wrong file and the label list is shorter than the file list.
labels = (
    ['repetition'] * len(repetition_wav_files)
    + ['stutter'] * len(stutter_wav_files)
    + ['pause'] * len(pause_wav_files)
    + ['st_p'] * len(stutter_pause_wav_files)
    + ['noise'] * len(noise_wav_files)
)


In [None]:
# Print each file next to the label it is paired with, to eyeball the alignment.
# NOTE: zip() stops at the shorter sequence, so a length mismatch between
# all_disfluencies_files and labels is silently truncated here.
for (file,label) in zip(all_disfluencies_files,labels):
    print(f"{file=} , {label=}")

In [None]:
# Extract one MFCC matrix per clip in all_disfluencies_files.
X = extract_mfcc_from_dataset(all_disfluencies_files)

# Convert labels to numerical format. all_disfluencies_files also contains the
# repetition clips, so that class gets a code as well; the four existing codes
# are unchanged.
label_mapping = {'repetition': 0, 'stutter': 1, 'pause': 2, 'st_p': 3, 'noise': 4}
y = np.array([label_mapping[label] for label in labels])

# Split dataset into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Collapse every 2-D MFCC matrix into a single 1-D feature vector.
X_train_flat = [matrix.flatten() for matrix in X_train]
X_test_flat = [matrix.flatten() for matrix in X_test]


In [None]:
# Train a Random Forest classifier on the flattened MFCC features.
# The estimator expects one 1-D feature vector per sample, so the flattened
# vectors from the previous cell are used instead of the raw 2-D MFCC matrices
# (which the original code passed by mistake).
# NOTE(review): this still needs every flattened vector to have the same
# length - confirm the clips yield equally sized MFCC matrices, or pad them.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_flat, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_flat)

# Evaluate the classifier and show the result (it was computed but never displayed).
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:

def extract_mfcc(audio_path, max_pad_len=100):
    """Return the MFCC matrix of one file with exactly ``max_pad_len`` frames.

    The clip is loaded with ``sr=None`` and passed to ``librosa.feature.mfcc``
    with its default settings. Matrices with fewer than ``max_pad_len`` columns
    are zero-padded on the right; longer ones are cut to the first
    ``max_pad_len`` columns.

    NOTE: this redefines the earlier single-argument ``extract_mfcc`` from the
    feature-extraction cell; after this cell runs, that name refers to this
    padded version.
    """
    samples, sample_rate = librosa.load(audio_path, sr=None)
    coeffs = librosa.feature.mfcc(y=samples, sr=sample_rate)

    n_frames = coeffs.shape[1]
    if n_frames >= max_pad_len:
        # Long enough: keep only the leading frames.
        return coeffs[:, :max_pad_len]
    # Too short: append zero-valued frames up to the target length.
    missing = max_pad_len - n_frames
    return np.pad(coeffs, pad_width=((0, 0), (0, missing)), mode='constant')

# Number of MFCC frames kept per clip (shorter clips are zero-padded).
max_pad_len = 100

# Fixed-size MFCC matrix for every clip, stacked into one array, plus the
# string labels as an array.
X = np.array([
    extract_mfcc(path, max_pad_len=max_pad_len)
    for path in all_disfluencies_files
])
y = np.array(labels)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One flat feature vector per clip.
X_train_flatten = X_train.reshape(len(X_train), -1)
X_test_flatten = X_test.reshape(len(X_test), -1)

# Fit the Random Forest and report its accuracy on the held-out clips.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_flatten, y_train)

accuracy = clf.score(X_test_flatten, y_test)
print("Accuracy:", accuracy)


## Windowing Approach 1 

In [11]:
# Peek at the first collected path; the next cell uses it as its sample clip.
all_disfluencies_files[0]

'audio samples/disfluency/repetition\\repetition_01.wav'

In [None]:
# Framing parameters for the MFCC extraction.
frame_size = 2048  # samples per analysis frame (passed as n_fft)
hop_length = 512   # samples between the starts of consecutive frames
n_mfcc = 13        # MFCC coefficients kept per frame

# Load the first collected clip with sr=None.
audio_path = all_disfluencies_files[0]
y, sr = librosa.load(audio_path, sr=None)

# Frame-wise MFCC extraction with the parameters above.
mfccs = librosa.feature.mfcc(
    y=y, sr=sr, n_mfcc=n_mfcc, n_fft=frame_size, hop_length=hop_length
)

# Show the coefficients over time.
plt.figure(figsize=(10, 4))
librosa.display.specshow(mfccs, sr=sr, hop_length=hop_length, x_axis='time')
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()
plt.show()


In [3]:
# Folders of the UCLASS recordings and of their annotation CSVs.
UCLASS_DIR = 'audio samples/uclass_dataset'
UCLASS_ANNOTATION_DIR = UCLASS_DIR + '/uclass_annotations'

# os.listdir() returns entries in arbitrary order; sorting keeps the file
# order - and therefore the seeded train/test split built from it - identical
# from run to run.
uclass_annotation_csv_files = [
    os.path.join(UCLASS_ANNOTATION_DIR, f)
    for f in sorted(os.listdir(UCLASS_ANNOTATION_DIR))
]
uclass_audio_files = [
    os.path.join(UCLASS_DIR, f)
    for f in sorted(os.listdir(UCLASS_DIR))
    if f.endswith('.wav')
]

# Every recording is expected to have exactly one annotation file.
assert len(uclass_annotation_csv_files) == len(uclass_audio_files), (
    f"{len(uclass_annotation_csv_files)} annotation files vs "
    f"{len(uclass_audio_files)} audio files"
)

_Column 8:	Label classifying the utterance as clean speech (0) or as one of 7 stutter types (1–7)._
-	0: 	clean
-	1: 	interjection
-	2: 	sound repetition
-	3: 	part-word repetition
-	4: 	word repetition
-	5: 	phrase repetition
-	6: 	revision
-	7: 	prolongation

In [12]:

def extract_label_from_csv(csv_file):
    """Return the dominant disfluency label of one annotation CSV.

    The file is read without a header row. Column index 7 holds the label of
    each row, where 0 means clean speech. Clean rows are ignored and the most
    frequent remaining label is returned, after folding label 5 (phrase
    repetition) into 2 and label 6 (revision) into 1.

    Parameters
    ----------
    csv_file : str
        Path of the annotation file.

    Raises
    ------
    ValueError
        If the file has no row with a non-zero label. Previously this surfaced
        as an opaque error from ``idxmax`` on an empty Series.
    """
    annotations = pd.read_csv(csv_file, header=None, encoding='utf-8')
    disfluent = annotations.loc[annotations[7] != 0, 7]
    if disfluent.empty:
        raise ValueError(f"No disfluency labels (column 7 is all 0) in {csv_file}")

    category = disfluent.value_counts().idxmax()
    # Merge the two categories that are not kept as classes of their own.
    if category == 5:
        category = 2
    elif category == 6:
        category = 1
    return category


def map_audiofiles_to_labels(audio_files, annotation_files):
    """Map each audio path to the label read from its matching annotation file.

    An audio file and an annotation file match when their base names are equal
    once the directories (``/`` or ``\\``) and everything from the first dot
    onwards are removed.

    Parameters
    ----------
    audio_files : iterable of str
        Paths of the recordings.
    annotation_files : iterable of str
        Paths of the annotation CSVs.

    Returns
    -------
    dict
        ``{audio_path: label}`` for every audio file whose annotation could be
        read. Audio files without a matching or readable annotation are left
        out; read failures are printed and skipped.
    """
    def stem(path):
        # Base name without directories and without anything after the first dot.
        return path.split('/')[-1].split('\\')[-1].split('.')[0]

    # Index the annotations by stem once, instead of scanning every
    # (audio, annotation) pair. A list is kept per stem so that, as before,
    # the last readable annotation wins when several share a stem.
    annotations_by_stem = {}
    for annotation_file in annotation_files:
        annotations_by_stem.setdefault(stem(annotation_file), []).append(annotation_file)

    audio_to_labels = {}
    # The function arguments are used here; the original body ignored them and
    # read the notebook-level file lists instead.
    for audio_file in audio_files:
        for annotation_file in annotations_by_stem.get(stem(audio_file), []):
            try:
                audio_to_labels[audio_file] = extract_label_from_csv(annotation_file)
            except Exception as e:
                # Best effort: report the file together with the cause, then move on.
                print(f"Error Reading Filename: {annotation_file} ({e!r})")
    return audio_to_labels

def extract_mfcc_with_window(audio_and_labels, window_size=2048, hop_length=512, n_mfcc=13, max_pad_len=100):
    """Build fixed-size MFCC matrices and the matching label array.

    Parameters
    ----------
    audio_and_labels : dict
        Maps an audio path to its label.
    window_size : int
        Passed to ``librosa.feature.mfcc`` as ``n_fft``.
    hop_length : int
        Passed to ``librosa.feature.mfcc`` as ``hop_length``.
    n_mfcc : int
        Number of MFCC coefficients requested per frame.
    max_pad_len : int
        Number of frames kept per clip; shorter matrices are zero-padded on
        the right, longer ones are cut to the first ``max_pad_len`` frames.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mfccs, labels)`` with one entry per item of ``audio_and_labels``, in
        iteration order.
    """
    mfccs = []
    labels = []
    for audio_file, label in audio_and_labels.items():
        y, sr = librosa.load(audio_file, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=window_size, hop_length=hop_length)
        # Pad or truncate to a fixed number of frames. The width must come from
        # the current matrix (`mfcc`); the original read `mfccs.shape`, i.e. the
        # result list, which has no `.shape` and crashed on every short clip.
        n_frames = mfcc.shape[1]
        if n_frames < max_pad_len:
            pad_width = max_pad_len - n_frames
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :max_pad_len]
        mfccs.append(mfcc)
        labels.append(label)
    return np.array(mfccs), np.array(labels)

In [18]:
# Pair every UCLASS recording with its label, then turn each recording into a
# fixed-size MFCC matrix (n_mfcc=100, 40 frames).
# Label note carried over from the original cell: 1 = stutter and pause, 2 = repetition.
audio_and_labels = map_audiofiles_to_labels(uclass_audio_files, uclass_annotation_csv_files)
X, y = extract_mfcc_with_window(
    audio_and_labels, window_size=2048, hop_length=512, n_mfcc=100, max_pad_len=40
)

# Hold out 15% of the recordings for testing (fixed seed).
# NOTE(review): the original comment mentions cross-validation because the
# dataset is small, but only this single split is evaluated here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# One flat feature vector per recording.
X_train_flatten = X_train.reshape(len(X_train), -1)
X_test_flatten = X_test.reshape(len(X_test), -1)

# Fit the Random Forest and report its accuracy on the held-out recordings.
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train_flatten, y_train)

accuracy = clf.score(X_test_flatten, y_test)
print("Accuracy:", accuracy)

# Predicted label of the first held-out recording.
clf.predict(X_test_flatten[0].reshape(1, -1))

Accuracy: 0.5


array([2], dtype=int64)

In [21]:
# Classify one clip from the stutter+pause folder with the model trained above;
# the label 1 is only a placeholder required by the helper's input format.
test = {stutter_pause_wav_files[0]: 1}
mfcc, label = extract_mfcc_with_window(
    test, window_size=2048, hop_length=512, n_mfcc=100, max_pad_len=40
)
clf.predict(mfcc.reshape(1, -1))

array([1], dtype=int64)

## JUST A CELL FOR GETTING COMMON FILES and DELETING THE REST


In [65]:


# common_files = set()
# for audio_file, annotation_file in product(uclass_audio_files,uclass_annotation_csv_files):
#     audio_name = audio_file.split('/')[-1].split('\\')[-1].split('.')[0]
#     annotation_name = annotation_file.split('/')[-1].split('\\')[-1].split('.')[0]
#     if audio_name == annotation_name:
#         common_files.add(audio_file)

# common_files


{'audio samples/uclass_dataset\\F_0101_10y4m_1.wav',
 'audio samples/uclass_dataset\\F_0101_13y1m_1.wav',
 'audio samples/uclass_dataset\\M_0028_15y11m_1.wav',
 'audio samples/uclass_dataset\\M_0030_12y1m_1.wav',
 'audio samples/uclass_dataset\\M_0030_17y9m_1.wav',
 'audio samples/uclass_dataset\\M_0052_16y4m_1.wav',
 'audio samples/uclass_dataset\\M_0061_14y8m_1.wav',
 'audio samples/uclass_dataset\\M_0078_12y4m_1.wav',
 'audio samples/uclass_dataset\\M_0078_16y5m_1.wav',
 'audio samples/uclass_dataset\\M_0095_07y7m_1.wav',
 'audio samples/uclass_dataset\\M_0095_08y10m_1.wav',
 'audio samples/uclass_dataset\\M_0098_07y8m_1.wav',
 'audio samples/uclass_dataset\\M_0098_09y8m_1.wav',
 'audio samples/uclass_dataset\\M_0098_10y6m_1.wav',
 'audio samples/uclass_dataset\\M_0100_11y2m_1.wav',
 'audio samples/uclass_dataset\\M_0100_12y3m_1.wav',
 'audio samples/uclass_dataset\\M_0100_13y10m_1.wav',
 'audio samples/uclass_dataset\\M_0138_12y2m_1.wav',
 'audio samples/uclass_dataset\\M_0138_13y3