In [1]:
import os

import librosa
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

# Read the processed MELD feature table from disk into a DataFrame.
data_path = 'datasets/processed/meld_features_updated.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Function to test text preprocessing
def test_text_preprocessing():
    sample_text = "Hello! How are you doing today?"
    tokenized_text = word_tokenize(sample_text.lower())
    assert len(tokenized_text) > 0, "Tokenization failed!"
    assert "hello" in tokenized_text, "Lowercasing failed!"
    print("Text preprocessing unit test passed!")

# Function to test audio feature extraction
def test_audio_feature_extraction(audio_path):
    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        assert mfccs.shape[0] == 13, "MFCC extraction failed!"
        print("Audio feature extraction unit test passed!")
    except Exception as e:
        print(f"Audio feature extraction test failed: {e}")

# Run both sanity checks: tokenization first, then MFCC extraction on a single training clip.
test_text_preprocessing()

sample_audio_file = 'datasets/raw/MELD/train/audio/dia0_utt0.wav'
test_audio_feature_extraction(audio_path=sample_audio_file)

Text preprocessing unit test passed!
Audio feature extraction unit test passed!


In [10]:
# Check for missing values (per-column count of null entries)
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Check for duplicate entries (rows that exactly repeat an earlier row)
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")


def _audio_file_exists(path):
    """Return True if ``path`` exists as stored, or with backslashes read as '/'.

    The stored paths can mix '/' and a Windows-style backslash separator.
    Such a path resolves on Windows but not on POSIX, where a backslash is an
    ordinary filename character. The path is tried verbatim first, so anything
    that was found before is still found; only then is a separator-normalised
    spelling tried.
    """
    if os.path.exists(path):
        return True
    return os.path.exists(str(path).replace('\\', '/'))


# Ensure all audio files exist; collect the stored paths that cannot be found.
audio_files_missing = [
    path for path in data['Audio_Path'] if not _audio_file_exists(path)
]

print(f"Number of missing audio files: {len(audio_files_missing)}")

Missing values in each column:
 Dialogue_ID           0
Utterance_ID          0
Emotion               0
Word_Count            0
Char_Count            0
Sentiment_Polarity    0
Audio_Duration        0
MFCCs                 0
Utterance             0
Clean_Utterance       0
Audio_Path            0
dtype: int64
Number of duplicate rows: 0
Number of missing audio files: 0


In [11]:
# Randomly sample some data rows for a human to inspect.
SPOT_CHECK_ROWS = 5
SPOT_CHECK_SEED = 42  # fixed seed so a re-run draws the same rows
SPOT_CHECK_PATH = 'datasets/metadata/quality_checks/manual_spot_check_sample.csv'

# Cap the request at len(data): DataFrame.sample raises when asked for more
# rows than exist (sampling without replacement).
sampled_data = data.sample(
    n=min(SPOT_CHECK_ROWS, len(data)),
    random_state=SPOT_CHECK_SEED,
)
print("Sampled data for manual inspection:")
print(sampled_data[['Emotion', 'Utterance', 'Audio_Path']])

# Save the sample for external review. The target folder is created first
# because to_csv does not create missing directories.
os.makedirs(os.path.dirname(SPOT_CHECK_PATH), exist_ok=True)
sampled_data.to_csv(SPOT_CHECK_PATH, index=False)
print("Sample data saved for manual inspection.")

Sampled data for manual inspection:
       Emotion                                          Utterance  \
3615   sadness                                    Looks good, uh?   
5105     anger  Hey, thats never gonna make it all the way ov...   
4270     anger       We found your fire alarm in the trash chute.   
10639  neutral             Well sensitive is important, pick him.   
7291      fear  See? Now, thats why only the little fake men ...   

                                           Audio_Path  
3615    datasets/raw/MELD/train/audio\dia412_utt4.wav  
5105   datasets/raw/MELD/train/audio\dia571_utt15.wav  
4270    datasets/raw/MELD/train/audio\dia482_utt2.wav  
10639    datasets/raw/MELD/train/audio\dia74_utt2.wav  
7291   datasets/raw/MELD/train/audio\dia809_utt15.wav  
Sample data saved for manual inspection.
