In [3]:
import psstdata
import os
import numpy as np
import yaml
# import torch
from datasets import load_dataset, Audio, load_metric
# import json
# from transformers import (Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC,
#                           TrainingArguments, Trainer)
# from data_collator_ctc_with_padding import DataCollatorCTCWithPadding

In [4]:
# CSV file backing each PSST split, keyed by split name
psst_csv_files = {
    "train": '/work/van-speech-nlp/PSST-experiment/psst-csv/train_utterances_excel.csv',
    "valid": '/work/van-speech-nlp/PSST-experiment/psst-csv/valid_utterances_excel.csv',
    "test": '/work/van-speech-nlp/PSST-experiment/psst-csv/test_utterances_excel.csv',
}

# Parse all three CSVs in a single call; the result is indexed by split name
dataset_dict = load_dataset('csv', data_files=psst_csv_files)

In [5]:
# Inspect the training split: as the cell's last expression, the notebook
# renders its summary (feature/column names and number of rows).
dataset_dict['train']

Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 2298
})

In [6]:
# Inspect the validation split: as the cell's last expression, the notebook
# renders its summary (feature/column names and number of rows).
dataset_dict['valid']

Dataset({
    features: ['utterance_id', 'session', 'test', 'prompt', 'transcript', 'correctness', 'aq_index', 'duration_frames', 'filename_old', 'filename_new'],
    num_rows: 341
})

In [10]:
# utterance_id of the row we want to pull out of the 'test' split
target_utterance_id = 'ACWT08a-VNT07-read'

# Scan the 'test' split lazily and keep the first row whose utterance_id
# matches; next() stops at the first hit and falls back to None otherwise.
sample = next(
    (row for row in dataset_dict['test'] if row['utterance_id'] == target_utterance_id),
    None,
)

# Report the outcome of the lookup
if not sample:
    print(f"Sample with utterance_id {target_utterance_id} not found.")
else:
    print("Sample found:", sample)


Sample found: {'utterance_id': 'ACWT08a-VNT07-read', 'session': 'ACWT08a', 'test': 'VNT', 'prompt': 'read', 'transcript': 'Y AH Y EH S AH HH AH Y EH S AH Y AE', 'correctness': False, 'aq_index': 40.5, 'duration_frames': 65487, 'filename_old': 'audio/vnt/ACWT08a/ACWT08a-VNT07-read.wav', 'filename_new': '/work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/test/audio/vnt/ACWT08a/ACWT08a-VNT07-read.wav'}


In [10]:
import random

# Materialise the training split as a list of rows (works whether
# dataset_dict['train'] is a Dataset object or already a list)
train_data = list(dataset_dict['train'])

# Draw from a dedicated, seeded generator so that "Restart & Run All" shows the
# same rows every time and the global `random` state is left untouched.
sample_rng = random.Random(42)

# Never request more rows than exist: random.sample raises ValueError when the
# requested sample size exceeds the population size.
random_samples = sample_rng.sample(train_data, min(5, len(train_data)))

# Display the selected samples
for sample in random_samples:
    print(sample)

{'utterance_id': 'williamson04a-VNT18-throw', 'session': 'williamson04a', 'test': 'VNT', 'prompt': 'throw', 'transcript': 'W AH AH M <sil> AH M', 'correctness': False, 'aq_index': 70.6, 'duration_frames': 87840, 'filename_old': 'audio/vnt/williamson04a/williamson04a-VNT18-throw.wav', 'filename_new': '/work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/vnt/williamson04a/williamson04a-VNT18-throw.wav'}
{'utterance_id': 'thompson11a-BNT09-cactus', 'session': 'thompson11a', 'test': 'BNT', 'prompt': 'cactus', 'transcript': 'K AE K T D AH S', 'correctness': True, 'aq_index': 81.4, 'duration_frames': 16159, 'filename_old': 'audio/bnt/thompson11a/thompson11a-BNT09-cactus.wav', 'filename_new': '/work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/thompson11a/thompson11a-BNT09-cactus.wav'}
{'utterance_id': 'thompson08a-BNT02-comb', 'session': 'thompson08a', 'test': 'BNT', 'prompt': 'comb', 'transcript': 'K OW M', 'correctness':

In [7]:
# Import the IPython player under an alias: the first cell already imports
# `Audio` from `datasets`, and re-importing the bare name here would silently
# rebind it to the IPython widget for every cell executed afterwards.
from IPython.display import Audio as AudioPlayer

# Specify the path to the audio file
audio_path = '/work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT01-house.wav'

# Play the audio file (the player widget is rendered because it is the
# last expression of the cell)
AudioPlayer(audio_path)

In [9]:
import os
import json
from datasets import load_dataset

# Split name -> CSV file holding that split's utterance table
psst_csv_files = {
    "train": '/work/van-speech-nlp/PSST-experiment/psst-csv/train_utterances_excel.csv',
    "valid": '/work/van-speech-nlp/PSST-experiment/psst-csv/valid_utterances_excel.csv',
    "test": '/work/van-speech-nlp/PSST-experiment/psst-csv/test_utterances_excel.csv',
}

# Load every split in one call; individual splits are looked up by name
dataset_dict = load_dataset('csv', data_files=psst_csv_files)

# Function to process dataset and save as JSONL file
def process_psst_dataset_and_save_jsonl(dataset, dataset_name, output_dir, verbose=True):
    """Write the rows of one PSST split whose audio file exists to a JSONL file.

    Each kept row becomes one line of ``<output_dir>/<dataset_name>.jsonl`` with
    the fields ``key`` (the row's 'utterance_id'), ``source`` (its
    'filename_new') and ``target`` (its 'transcript'). Rows whose
    'filename_new' is missing, is not a string, or does not point to a regular
    file are reported and skipped.

    Args:
        dataset: Iterable of row mappings providing the keys 'utterance_id',
            'transcript' and 'filename_new'.
        dataset_name: Base name (without extension) of the JSONL file to write.
        output_dir: Directory that receives the file; created if it does not
            exist. An empty string means the current working directory.
        verbose: When True (the default, matching the original behaviour) a
            "File exists" line is printed for every audio file found. Pass
            False to keep only the "does not exist" warnings and the summary.

    Returns:
        None. The result is the file written to disk and the printed summary.
    """
    data = []

    for example in dataset:
        filename_new = example['filename_new']

        # A blank CSV cell arrives as None, and os.path.exists(None) raises
        # TypeError; isfile() additionally rejects directories, which can
        # never be a valid audio source.
        if isinstance(filename_new, str) and os.path.isfile(filename_new):
            if verbose:
                print(f"File exists: {filename_new}")

            # Prepare the JSONL entry with the specified fields
            data.append({
                "key": example['utterance_id'],
                "source": filename_new,
                "target": example['transcript']
            })
        else:
            print(f"File does not exist: {filename_new}")

    # exist_ok avoids the check-then-create race; the guard keeps an empty
    # output_dir (current directory) from making os.makedirs('') fail.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    jsonl_output_path = os.path.join(output_dir, f"{dataset_name}.jsonl")

    # Explicit encoding so the file does not depend on the platform default
    with open(jsonl_output_path, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(entry) + '\n')

    print(f"Processed {len(data)} entries. JSONL file saved at: {jsonl_output_path}")

# Directory that receives one JSONL file per split
output_directory = "psst"

# Export every split in turn; the split name doubles as the output file's base name
for split_name in ("train", "valid", "test"):
    process_psst_dataset_and_save_jsonl(dataset_dict[split_name], split_name, output_directory)

File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT01-house.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT02-comb.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT03-toothbrush.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT04-octopus.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT05-bench.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT06-volcano.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst-data/psst-data-2022-03-02-full/train/audio/bnt/ACWT02a/ACWT02a-BNT07-canoe.wav
File exists: /work/van-speech-nlp/PSST-experiment/psst