In [None]:
import os

# Work inside the dataset directory so that files downloaded by later cells land
# there. This is a local filesystem path, not a Google Drive mount.
dataset_dir = '/external4/datasets/Graamvaani_hindi/GV_Train_100h'
# Create the directory first: os.chdir raises FileNotFoundError when it is missing.
os.makedirs(dataset_dir, exist_ok=True)
os.chdir(dataset_dir)

# Verify the current directory
print("Current Directory:", os.getcwd())


In [None]:
!wget https://www.openslr.org/resources/118/GV_Train_100h.tar.gz

In [None]:
!tar -xzvf /content/gdrive/MyDrive/GV_Train_100h.tar.gz

In [1]:
import os
import json
from pydub import AudioSegment
from tqdm import tqdm

# Input/output locations, all rooted at one dataset directory
# (edit dataset_root to relocate every path at once).
dataset_root = '/external4/datasets/Graamvaani_hindi/GV_Train_100h'
mp3_scp_file = f'{dataset_root}/mp3.scp'
txt_file = f'{dataset_root}/text'
utt2label_file = f'{dataset_root}/utt2labels'
output_jsonl_file = f'{dataset_root}/processed_data.jsonl'

# Helper function to fix relative MP3 paths.
def fix_relative_mp3_path(mp3_path, audio_root="/external4/datasets/Graamvaani_hindi/GV_Train_100h/Audio"):
    """Turn a path that starts with './Audio' into an absolute path.

    Args:
        mp3_path: Path string, possibly starting with './Audio'.
        audio_root: Absolute directory substituted for the leading './Audio'.
            Defaults to the dataset's Audio directory.

    Returns:
        ``mp3_path`` with its leading './Audio' replaced by ``audio_root``.
        Paths that do not start with './Audio' are returned unchanged.
    """
    relative_prefix = "./Audio"
    if mp3_path.startswith(relative_prefix):
        # Swap only the leading prefix: str.replace without a count would also
        # rewrite any later './Audio' occurrence inside the path.
        return audio_root + mp3_path[len(relative_prefix):]
    return mp3_path

# Function to convert an MP3 file path to a WAV file path.
def convert_mp3_to_wav_path(mp3_path):
    """Derive the WAV output path that corresponds to an MP3 path.

    The path is first passed through fix_relative_mp3_path, then every
    '/Audio/' component is swapped for '/wavs/' so WAV files are stored
    separately, and a trailing '.mp3' (exact, lower-case match) becomes '.wav'.
    Paths that do not end in '.mp3' keep their original ending.
    """
    resolved_mp3 = fix_relative_mp3_path(mp3_path)
    relocated = resolved_mp3.replace("/Audio/", "/wavs/")
    if not relocated.endswith(".mp3"):
        return relocated
    return relocated[:-len(".mp3")] + ".wav"

def read_uttid_table(table_path):
    """Read a file of '<uttid> <value>' lines into a dict.

    Each line is stripped and split on its first run of whitespace: the first
    token is the key and the remainder is the value. Blank lines and lines
    that carry a key but no value are skipped. If a key appears more than
    once, the last occurrence wins.

    Args:
        table_path: Path of the UTF-8 text file to read.

    Returns:
        Dict mapping each key to its value string.
    """
    table = {}
    with open(table_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(maxsplit=1)
            if len(fields) == 2:
                key, value = fields
                table[key] = value
    return table

# ------------------------------
# 1. Parse mp3.scp file
# ------------------------------
# uttid -> MP3 path, with './Audio' paths made absolute
mp3_dict = {
    uttid: fix_relative_mp3_path(mp3_path)
    for uttid, mp3_path in read_uttid_table(mp3_scp_file).items()
}

# ------------------------------
# 2. Parse text file (transcriptions)
# ------------------------------
# uttid -> transcription text
transcriptions = read_uttid_table(txt_file)

# ------------------------------
# 3. Parse utt2labels file (tab-separated)
# ------------------------------
# Expected header: Uttids, Accent, Age, Gender, Background, Sentiment
utt2labels = {}
with open(utt2label_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

if not lines:
    # Fail with a clear message instead of an IndexError on lines[0].
    raise ValueError(f"{utt2label_file} is empty; expected a tab-separated header row")

header = lines[0].strip().split('\t')
# Determine indices for required columns (excluding Background).
# list.index raises ValueError when a required column is missing from the header.
uttid_idx = header.index("Uttids")
accent_idx = header.index("Accent")
age_idx = header.index("Age")
gender_idx = header.index("Gender")
sentiment_idx = header.index("Sentiment")
# A data row must be at least this wide to contain every required column.
min_columns = max(uttid_idx, accent_idx, age_idx, gender_idx, sentiment_idx) + 1

for line in lines[1:]:
    # Remove only the line terminator. A plain strip() would also eat trailing
    # tabs, so a row whose last column is empty would look too short and be
    # dropped, losing its other labels as well.
    line = line.rstrip('\r\n')
    if not line.strip():
        continue
    parts = [field.strip() for field in line.split('\t')]
    # Ensure that all required fields exist.
    if len(parts) < min_columns:
        continue
    # Empty fields are stored as "NA", the same placeholder that is used for
    # utterances that have no row in this file.
    utt2labels[parts[uttid_idx]] = {
        "accent": parts[accent_idx] or "NA",
        "age": parts[age_idx] or "NA",
        "gender": parts[gender_idx] or "NA",
        "sentiment": parts[sentiment_idx] or "NA"
    }

# ------------------------------
# 4. Process each utterance: convert MP3 to WAV, compute duration, and build JSON objects
# ------------------------------
jsonl_entries = []
wav_conversion_errors = []
# Labels used for an utterance that has no entry in utt2labels.
missing_labels = {"accent": "NA", "age": "NA", "gender": "NA", "sentiment": "NA"}

for uttid, mp3_path in tqdm(mp3_dict.items(), desc="Processing Utterances"):
    wav_path = convert_mp3_to_wav_path(mp3_path)

    try:
        audio = AudioSegment.from_mp3(mp3_path)
        # The output directory is created only after the MP3 has been decoded.
        os.makedirs(os.path.dirname(wav_path), exist_ok=True)
        audio.export(wav_path, format="wav")
        # len(audio) divided by 1000 — presumably milliseconds to seconds.
        duration = len(audio) / 1000.0
    except Exception as e:
        # Best effort: remember the failure and keep the utterance with a
        # duration of 0.0 (its WAV file may not exist).
        wav_conversion_errors.append((uttid, mp3_path, str(e)))
        duration = 0.0

    labels = utt2labels.get(uttid, missing_labels)

    jsonl_entries.append({
        "path": wav_path,
        "duration": duration,
        "dialect": labels["accent"],
        "gender": labels["gender"],
        "age_group": labels["age"],
        "sentiment": labels["sentiment"],
        # Utterances without a transcription get an empty text.
        "text": transcriptions.get(uttid, "")
    })

# ------------------------------
# 5. Write the JSONL file (one JSON object per line)
# ------------------------------
with open(output_jsonl_file, 'w', encoding='utf-8') as out_f:
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    out_f.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in jsonl_entries
    )

print(f"Processed {len(jsonl_entries)} entries into {output_jsonl_file}")
if wav_conversion_errors:
    print("Some MP3 to WAV conversions encountered errors:")
    # One (uttid, mp3_path, message) tuple per line.
    print(*wav_conversion_errors, sep="\n")


Processing Utterances: 100%|██████████| 37152/37152 [1:35:45<00:00,  6.47it/s]


Processed 37152 entries into /external4/datasets/Graamvaani_hindi/GV_Train_100h/processed_data.jsonl


In [3]:
import json

input_file = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/processed_data.jsonl'
output_file = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/manifest.jsonl'

# Copy processed_data.jsonl to manifest.jsonl, normalizing three fields on the way.
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        # Skip blank lines instead of letting json.loads raise on them.
        if not line.strip():
            continue

        # Load each JSON object from the JSONL file
        record = json.loads(line)

        # Remove every '#incomplete' marker from the text field, if it exists.
        # Whitespace around the marker is left in place.
        if 'text' in record:
            record['text'] = record['text'].replace('#incomplete', '')

        # Change sentiment value from "NA" to "Neutral"
        if record.get('sentiment') == "NA":
            record['sentiment'] = "Neutral"

        # Change dialect from "Accent_unknown" to "modern_hindi"
        if record.get('dialect') == "Accent_unknown":
            record['dialect'] = "modern_hindi"

        # Write the updated record to the output file as a JSON string
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")


In [1]:
import os
import json
import re

# Path to the JSONL file
jsonl_file = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/manifest.jsonl'  # Update this path as needed

# Path to save the intermediate JSON file
intermediate_json_path = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/intermediate_data.json'  # Update this path as needed

# Keys every manifest entry must carry to be kept; also the key order of each output record.
required_keys = ['path', 'text', 'sentiment', 'age_group', 'gender', 'dialect']

# Collect all records from the JSONL file
records = []

with open(jsonl_file, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():  # Skip empty lines
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            continue
        if not all(key in data for key in required_keys):
            print(f"Missing required keys in data: {data}")
            continue
        # Drop bracketed annotations like [inhaling] or [uhh], trim the ends,
        # then squeeze every whitespace run into a single space.
        without_annotations = re.sub(r'\[.*?\]', '', data['text']).strip()
        entry = {key: data[key] for key in required_keys}
        entry['text'] = re.sub(r'\s+', ' ', without_annotations)
        records.append(entry)

# Save the records to the intermediate JSON file
os.makedirs(os.path.dirname(intermediate_json_path), exist_ok=True)
with open(intermediate_json_path, 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=4)

print(f"Data extraction complete. Intermediate data saved to '{intermediate_json_path}'.")

Data extraction complete. Intermediate data saved to '/external4/datasets/Graamvaani_hindi/GV_Train_100h/intermediate_data.json'.


In [None]:
import os
import json
import openai
import time
import getpass
import csv

# Reference point for the total execution time reported at the end of the cell.
start_time = time.time()

# Keep the key out of the notebook: read it from the OPENAI_API_KEY environment
# variable, and prompt for it only when the variable is unset or empty.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Path to the intermediate JSON file generated by Script 1
intermediate_json_path = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/intermediate_data.json'  # Update this path as needed

# Paths to save the output files
output_json_path = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/GV_Train_100h_annotated_data.jsonl'  # This file will be in JSONL format
output_csv_path = '/external4/datasets/Graamvaani_hindi/GV_Train_100h/GV_Train_100h_annotated_data.csv'  # Update this path as needed

# Load the records from the intermediate JSON file
with open(intermediate_json_path, 'r', encoding='utf-8') as f:
    records = json.load(f)

# Resume support: reload annotations already present in the output JSONL, if any.
annotated_records = []
if not os.path.exists(output_json_path):
    print("No existing annotated records found. Starting fresh.")
else:
    with open(output_json_path, 'r', encoding='utf-8') as f_json:
        annotated_records = [json.loads(line) for line in f_json if line.strip()]
    print(f"Loaded {len(annotated_records)} existing annotated records.")

# Paths of already annotated records, used to avoid duplicates
annotated_paths = {record['path'] for record in annotated_records}

# Total number of records
total_records = len(records)
print(f"Total sentences to process: {total_records}")

# Process records in batches
batch_size = 10  # Number of sentences sent per annotation request
total_batches = -(-total_records // batch_size)  # ceiling division
print(f"Processing in batches of {batch_size} sentences.")

def format_tag(tag_type, value):
    """Build a '<tag_type>_<VALUE>' tag, e.g. format_tag('AGE', '18-30') -> 'AGE_18_30'.

    ``value`` is upper-cased and each space, hyphen and slash in it becomes an
    underscore; ``tag_type`` is used verbatim.
    """
    separators_to_underscore = str.maketrans(" -/", "___")
    normalized_value = value.upper().translate(separators_to_underscore)
    return f"{tag_type}_{normalized_value}"

def annotate_sentences(sentences):
    """Ask gpt-4o-mini to insert entity tags into a batch of Hindi sentences.

    The reply must be a JSON array holding exactly one string per input
    sentence. When a batch reply is unusable (not JSON, not a list, wrong
    number of items, non-string items), each sentence is retried on its own.
    A single sentence whose reply is unusable is returned unannotated and is
    not retried, so the function never calls itself again with the same input.

    Args:
        sentences: List of sentence strings to annotate.

    Returns:
        A list with one entry per input sentence, in input order. Sentences
        that could not be annotated are returned as given. If the API call
        itself raises, ``sentences`` is returned unchanged.
    """
    if not sentences:
        # Nothing to annotate; do not spend an API call on an empty batch.
        return []

    # NOTE(review): the example output below uses ENTITY_ACTION, which does not
    # correspond to any entry of the entity type list ("ACTION ITEM" would give
    # ENTITY_ACTION_ITEM) — confirm whether the example should be corrected.
    prompt = f'''
Annotate each Hindi sentence from the input list by marking entity phrases with start and end tags using only the provided entity types.

IMPORTANT: All instructions provided below must be followed exactly and strictly. Do not deviate from any step or add any extra text, explanations, or formatting.

Instructions:

    1.Process each sentence independently.
    2.Identify all entity phrases that match one of the provided entity types.
    3.For each identified entity phrase, insert a start tag and an end tag:
         -The start tag must be in the format: ENTITY_<ENTITY_TYPE>, where any spaces in the entity type are replaced with underscores.
         -The end tag is the literal string: END.
    4.Use only the entity types provided in the list. Do not introduce any additional entity types (for example, do not generate ENTITY_NOISE or any other tag outside the provided list).
    5.Do not add any additional text, explanations, or formatting.
    6.Return the final output as a JSON array of the annotated sentences, preserving the original order. The output must contain no markdown, formatting, or extra characters.

Entity Types:
[ "PERSON NAME", "ORGANIZATION", "LOCATION", "DATE", "TIME", "DURATION", "EMAIL", "PHONE NUMBER", "ADDRESS", "CITY", "STATE", "COUNTRY", "ZIP CODE", "CURRENCY", "PRICE", "PRODUCT", "SERVICE", "BRAND", "EVENT", "PERCENTAGE", "AGE", "TEMPERATURE", "MEASUREMENT", "DISTANCE", "WEIGHT", "HEIGHT", "VOLUME", "SPEED", "LANGUAGE", "NATIONALITY", "RELIGION", "JOB TITLE", "COMPANY NAME", "DEVICE NAME", "OPERATING SYSTEM", "SOFTWARE VERSION", "COLOR", "SHAPE", "MATERIAL", "MODEL NUMBER", "LICENSE PLATE", "VEHICLE MAKE", "VEHICLE MODEL", "VEHICLE TYPE", "FLIGHT NUMBER", "HOTEL NAME", "BOOKING REFERENCE", "PAYMENT METHOD", "CREDIT CARD NUMBER", "ACCOUNT NUMBER", "INSURANCE PROVIDER", "POLICY NUMBER", "BANK NAME", "TAX ID", "SOCIAL SECURITY NUMBER", "DRIVER'S LICENSE", "PASSPORT NUMBER", "WEBSITE", "URL", "IP ADDRESS", "MAC ADDRESS", "USERNAME", "PASSWORD", "FOOD ITEM", "DRINK ITEM", "CUISINE", "INGREDIENT", "DISH NAME", "MENU ITEM", "ORDER NUMBER", "PAYMENT AMOUNT", "DELIVERY TIME", "DELIVERY DATE", "APPOINTMENT DATE", "APPOINTMENT TIME", "ROOM NUMBER", "HOSPITAL NAME", "DOCTOR NAME", "SYMPTOM", "DIAGNOSIS", "MEDICATION", "DOSAGE", "ALLERGY", "PRESCRIPTION", "TEST NAME", "TEST RESULT", "INSURANCE PLAN", "CLAIM NUMBER", "POLICY HOLDER", "BENEFICIARY", "RELATIONSHIP", "EMERGENCY CONTACT", "PROJECT NAME", "TASK", "MEETING", "AGENDA", "ACTION ITEM", "DEADLINE", "PRIORITY", "FEEDBACK", "REVIEW", "RATING", "COMPLAINT", "QUESTION", "RESPONSE" ]

Example:
Input Sentences:
[
    "हाँ जी मैं अनीता बात कर रही हूँ आज ही मैने आपके एजेंसी से जो कैब है उसकी सुविधा प्राप्त की थी मैं उस सुविधा से बहुत ज़्यादा असंतुष्ट हूँ बहुत ज़्यादा निराश हूँ और मैं उसी के लिए आपसे शिकायत करना चाहती हूँ",
    "राजस्थान का अन्य राज्यों से सा संबंध बहोत अच्छा रहा है और राजस्थान खनिजों का अजेबघर कहा जाता है तो राजस्थान के अंदर मसाले खनिज बहोत ज्यादा तादात मे मिलते है तो इनका",
    "एक समय की बात है मैं किसी कंपनी में नौकरी कर रहा था और मैं अपने काम को बड़ी अच्छी तरह से अपना काम पूरा करता था लेकिन उसके बाद भी",
    "चेक करो कि क्या आईआरसीटीसी से पचास हज़ार रुपये का रिफ़ंड प्रोसेस हुआ है या नहीं",
    "हिंदी भाषा हमारे पुरे इतिहास में अन्य भाषाओं संस्कृतियों से बहोत प्रभावित हुई है जैसे कि हिंदी पर बहोत ही विदेशी और स्वदेशी भाषाओं का"
]

Expected Output:
[
    "हाँ जी मैं ENTITY_PERSON_NAME अनीता END बात कर रही हूँ आज ही मैंने आपके ENTITY_ORGANIZATION एजेंसी END से जो ENTITY_VEHICLE_TYPE कैब END है उसकी ENTITY_SERVICE सुविधा END प्राप्त की थी मैं उस ENTITY_SERVICE सुविधा END से बहुत ज़्यादा असंतुष्ट हूँ बहुत ज़्यादा निराश हूँ और मैं उसी के लिए आपसे ENTITY_COMPLAINT शिकायत END करना चाहती हूँ",
    "ENTITY_STATE राजस्थान END का ENTITY_STATE अन्य राज्यों END से संबंध बहोत अच्छा रहा है और ENTITY_STATE राजस्थान END ENTITY_PRODUCT खनिजों END का अजेबघर कहा जाता है तो ENTITY_STATE राजस्थान END के अंदर ENTITY_PRODUCT मसाले END ENTITY_PRODUCT खनिज END बहोत ज्यादा तादात में मिलते है तो इनका",
    "एक समय की बात है मैं किसी ENTITY_COMPANY_NAME कंपनी END में ENTITY_JOB_TITLE नौकरी END कर रहा था और मैं अपने ENTITY_TASK काम END को बड़ी अच्छी तरह से अपना ENTITY_TASK काम END पूरा करता था लेकिन उसके बाद भी",
    "चेक करो कि क्या ENTITY_ORGANIZATION आईआरसीटीसी END से ENTITY_PRICE पचास हज़ार रुपये END का ENTITY_ACTION रिफ़ंड प्रोसेस END हुआ है या नहीं ",
    "ENTITY_LANGUAGE हिंदी END भाषा हमारे पुरे इतिहास में ENTITY_LANGUAGE अन्य भाषाओं END ENTITY_LANGUAGE संस्कृतियों END से बहोत प्रभावित हुई है जैसे कि ENTITY_LANGUAGE हिंदी END पर बहोत ही ENTITY_NATIONALITY विदेशी END और ENTITY_NATIONALITY स्वदेशी END भाषाओं का"
]

Sentences to Annotate:
{json.dumps(sentences, ensure_ascii=False)}
'''
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            n=1,
            stop=None,
            temperature=0.0,
        )
        assistant_reply = response.choices[0].message.content
    except Exception as e:
        # Best effort: an API failure leaves the batch unannotated.
        print(f"Error annotating batch: {e}")
        return sentences

    try:
        annotated_sentences = json.loads(assistant_reply)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None.
        failure = "JSON decoding failed."
    else:
        if not isinstance(annotated_sentences, list):
            failure = "Assistant did not return a list."
        elif len(annotated_sentences) != len(sentences):
            # A list of the wrong length cannot be matched back to its inputs.
            failure = (f"Assistant returned {len(annotated_sentences)} items "
                       f"for {len(sentences)} sentences.")
        elif not all(isinstance(item, str) for item in annotated_sentences):
            failure = "Assistant returned non-string items."
        else:
            return annotated_sentences

    if len(sentences) == 1:
        # A single sentence cannot be split further; retrying it would send the
        # same request again, so keep the sentence as it is.
        print(f"{failure} Keeping the sentence unannotated.")
        return list(sentences)

    print(f"{failure} Reattempting individual annotations...")
    new_annotations = []
    for sentence in sentences:
        individual_annotation = annotate_sentences([sentence])
        if isinstance(individual_annotation, list) and len(individual_annotation) == 1:
            new_annotations.append(individual_annotation[0])
        else:
            new_annotations.append(sentence)
    return new_annotations

# Process each batch of records
for batch_num in range(total_batches):
    start_idx = batch_num * batch_size
    end_idx = min((batch_num + 1) * batch_size, total_records)
    batch_records = records[start_idx:end_idx]
    print(f"\nProcessing batch {batch_num + 1}/{total_batches} (sentences {start_idx + 1} to {end_idx})...")

    # Records of this batch that still need an annotation. Keeping the whole
    # record means text and metadata cannot drift apart.
    pending_records = []
    for idx, record in enumerate(batch_records, start=start_idx + 1):
        path = record['path']
        if path in annotated_paths:
            print(f"Skipping already annotated sentence {idx}/{total_records}: {path}")
            continue

        print(f"Preparing sentence {idx}/{total_records}: {path}")
        pending_records.append(record)

    if not pending_records:
        continue  # Skip if all sentences in this batch are already annotated

    sentences_to_annotate = [pending['text'] for pending in pending_records]

    # Annotate the batch using the OpenAI API
    annotated_batch_sentences = annotate_sentences(sentences_to_annotate)

    # A result of the wrong length cannot be paired with its records: extra
    # items would index past the batch and missing ones would be dropped
    # silently. Retry sentence by sentence, and fall back to the unannotated
    # sentence when even that fails.
    if len(annotated_batch_sentences) != len(sentences_to_annotate):
        print(f"Expected {len(sentences_to_annotate)} annotated sentences but received "
              f"{len(annotated_batch_sentences)}. Reattempting individual annotations...")
        annotated_batch_sentences = []
        for sentence in sentences_to_annotate:
            individual_annotation = annotate_sentences([sentence])
            if isinstance(individual_annotation, list) and len(individual_annotation) == 1:
                annotated_batch_sentences.append(individual_annotation[0])
            else:
                annotated_batch_sentences.append(sentence)

    # Save annotations: the annotated sentence followed by its metadata tags
    for pending, annotated_sentence in zip(pending_records, annotated_batch_sentences):
        final_output = (f"{annotated_sentence} {format_tag('SENTIMENT', pending['sentiment'])} "
                        f"{format_tag('AGE', pending['age_group'])} {format_tag('GENDER', pending['gender'])} "
                        f"{format_tag('DIALECT', pending['dialect'])}")

        annotated_records.append({'path': pending['path'], 'Final Output': final_output})
        annotated_paths.add(pending['path'])

    # Save progress to the JSONL file (each record on a new line). Both output
    # files are rewritten in full after every batch.
    with open(output_json_path, 'w', encoding='utf-8') as f_json:
        for record in annotated_records:
            f_json.write(json.dumps(record, ensure_ascii=False) + "\n")

    # Save progress to the CSV file
    csv_columns = ['path', 'Final Output']
    with open(output_csv_path, 'w', encoding='utf-8', newline='') as f_csv:
        writer = csv.DictWriter(f_csv, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(annotated_records)

print("\nAnnotation complete.")
json_name = os.path.basename(output_json_path)
csv_name = os.path.basename(output_csv_path)
print(f"Files saved as '{json_name}' (JSONL) and '{csv_name}'.")

# Wall-clock time elapsed since start_time was recorded.
end_time = time.time()
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time:.2f} seconds.")


No existing annotated records found. Starting fresh.
Total sentences to process: 37152
Processing in batches of 10 sentences.

Processing batch 1/3716 (sentences 1 to 10)...
Preparing sentence 1/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00073-02.wav
Preparing sentence 2/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00073-03.wav
Preparing sentence 3/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00073-01.wav
Preparing sentence 4/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00076-02.wav
Preparing sentence 5/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00077-01.wav
Preparing sentence 6/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00077-02.wav
Preparing sentence 7/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00086-01.wav
Preparing sentence 8/37152: /external4/datasets/Graamvaani_hindi/GV_Train_100h/wavs/13-00086-02.wav
Preparing sentence 9/37152