In [1]:
import os
import torch
import glob
import json
import pandas as pd
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer
)
from jiwer import wer

In [2]:
# --- Configuration ---
TSV_PATH = "/home/sahilduwal/MajorProject/Shruti---AVSR-in-Nepali-Language-/nep-2/ne_np_female/line_index.tsv"
AUDIO_DIR = "/home/sahilduwal/MajorProject/Shruti---AVSR-in-Nepali-Language-/nep-2/ne_np_female/wavs"
PRETRAINED_MODEL = "./wav2vec2-nepali-finetuned-v1"
OUTPUT_DIR = "./wav2vec2-nepali-finetuned-v2"   
PROCESSOR_SAVE_DIR = "./wav2vec2-nepali-processor"
TOKENIZER_DIR = "./tokenizer"

In [3]:
# processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
# model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base")

In [4]:
# --- Load TSV and preprocess ---
df = pd.read_csv(TSV_PATH, sep='\t')
print("Columns in TSV:", df.columns)

df = df.rename(columns={
    df.columns[0]: "wav_filename",
    df.columns[1]: "text"
})

df["wav_filename"] = df["wav_filename"].apply(lambda x: f"{x}.wav" if not str(x).endswith(".wav") else x)
df["path"] = df["wav_filename"].apply(lambda x: os.path.join(AUDIO_DIR, x))
df = df[["path", "text"]]

Columns in TSV: Index(['nep_0258_0119737288', 'दीपा धामीको जन्म सुदूरपश्चिम नेपालको बझाङ जिल्लामा भएको हो'], dtype='object')


In [5]:
# For male-female dataset

# # --- Load all TSVs and preprocess ---
# tsv_files = glob.glob(os.path.join(TSV_PATH, "*.tsv"))

# all_dfs = []
# for file in tsv_files:
#     df = pd.read_csv(file, sep="\t", header=0)
#     print(f"Loaded {file}, shape: {df.shape}")

#     df = df.rename(columns={
#         df.columns[0]: "wav_filename",
#         df.columns[1]: "text"
#     })

#     df["wav_filename"] = df["wav_filename"].apply(lambda x: f"{x}.wav" if not str(x).endswith(".wav") else x)
#     df["path"] = df["wav_filename"].apply(lambda x: os.path.join(AUDIO_DIR, x))
#     df = df[["path", "text"]]

#     all_dfs.append(df)

# df = pd.concat(all_dfs, ignore_index=True)

In [6]:
# --- Create tokenizer from Nepali text ---
chars = set()
for text in df["text"]:
    chars.update(set(text))
chars = sorted(list(chars))
if " " not in chars:
    chars.append(" ")
vocab_dict = {c: i for i, c in enumerate(chars)}
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["<pad>"] = len(vocab_dict)
vocab_dict["<s>"] = len(vocab_dict)
vocab_dict["</s>"] = len(vocab_dict)
vocab_dict["<unk>"] = len(vocab_dict)

In [7]:
# Save vocab
os.makedirs(TOKENIZER_DIR, exist_ok=True)
with open(os.path.join(TOKENIZER_DIR, "vocab.json"), "w", encoding="utf-8") as f:
    import json
    json.dump(vocab_dict, f, ensure_ascii=False)

In [8]:
# Create tokenizer, feature extractor, processor
tokenizer = Wav2Vec2CTCTokenizer(
    os.path.join(TOKENIZER_DIR, "vocab.json"),
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)
processor.save_pretrained(PROCESSOR_SAVE_DIR)

[]

In [9]:
# --- Create HuggingFace dataset ---
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("path", Audio(sampling_rate=16000))
dataset = dataset.rename_column("path", "audio")

train_test = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = train_test["train"]
val_ds = train_test["test"]

In [10]:
# --- Load model ---
model = Wav2Vec2ForCTC.from_pretrained(
    PRETRAINED_MODEL,
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    bos_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.pad_token_id
)

In [11]:
# --- Preprocessing function ---
def prepare(batch):
    audio = batch["audio"]["array"]
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

train_ds = train_ds.map(prepare, remove_columns=train_ds.column_names)
val_ds = val_ds.map(prepare, remove_columns=val_ds.column_names)

Map:   0%|          | 0/1856 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

In [12]:
# --- Data collator ---
def data_collator(features):
    input_values = [f["input_values"] for f in features]
    labels = [f["labels"] for f in features]

    input_values = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(iv) for iv in input_values], batch_first=True, padding_value=0.0
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(l) for l in labels], batch_first=True, padding_value=-100
    )

    return {"input_values": input_values, "labels": labels}

In [13]:
# --- Evaluation metric ---
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = pred_logits.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {"wer": wer(label_str, pred_str)}

In [None]:
# --- Training arguments ---
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    learning_rate=1e-5,
    warmup_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100
)

In [15]:
# --- Trainer setup ---
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor.tokenizer,
)


  trainer = Trainer(


In [16]:
# --- Training ---
print(f"\n✅ Running on: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}\n")
trainer.train()
metrics = trainer.evaluate()
print("\n📊 Final Evaluation Metrics:", metrics)


✅ Running on: NVIDIA GeForce GTX 1650



Step,Training Loss
50,67.9545
100,56.2295
150,69.5057
200,62.9359
250,57.2586
300,64.2016
350,57.0389
400,65.9727
450,65.0672
500,63.0847



📊 Final Evaluation Metrics: {'eval_loss': 58.99445343017578, 'eval_wer': 0.5272417707150965, 'eval_runtime': 19.494, 'eval_samples_per_second': 10.619, 'eval_steps_per_second': 5.335, 'epoch': 10.0}


In [17]:
# --- Save final model and processor ---
model.save_pretrained(OUTPUT_DIR)
processor.save_pretrained(PROCESSOR_SAVE_DIR)
print(f"\n✅ Saved model to: {OUTPUT_DIR}")
print(f"✅ Saved processor to: {PROCESSOR_SAVE_DIR}")


✅ Saved model to: ./wav2vec2-nepali-finetuned-v2
✅ Saved processor to: ./wav2vec2-nepali-processor
