In [None]:
!sudo apt-get install ffmpeg

In [9]:
# !sudo apt update
# !sudo apt install ffmpeg

In [None]:
%pip install git+https://github.com/openai/whisper.git sentence_transformers pydub transformers tqdm

In [None]:
# Based on the device present, change the comment statement. In general CUDA is preferred because of its extremely fast processing.
device = "cuda"
device = "cpu"

# S1

In [None]:
import os
import pandas as pd
from tqdm import tqdm

from pydub import AudioSegment
import whisper
from sentence_transformers import SentenceTransformer



In [None]:
# Path to the dataset folder
dataset_folder = "./Dataset"
output_folder = "./audio_chunks"
os.makedirs(output_folder, exist_ok=True)

# Initialize a list to store the audio mapping details
mapping = []
counter = 0

# Process each audio file in the dataset folder
for audio_file in tqdm(os.listdir(dataset_folder)):
    audio_no = audio_file.split('_')[1].split('.')[0]
    if audio_file.endswith((".mp3", ".wav", ".flac")):
        # Load the audio file
        audio_path = os.path.join(dataset_folder, audio_file)
        audio = AudioSegment.from_file(audio_path)

        # Split into 30-second chunks
        chunk_duration = 30 * 1000
        num_chunks = len(audio) // chunk_duration

        # Process each chunk
        for i in range(num_chunks):
            # Extract chunk
            chunk = audio[i * chunk_duration:(i + 1) * chunk_duration]
            # Define chunk path and export
            chunk_path = os.path.join(output_folder, f"audio_{audio_no}_chunk_{counter}.mp3")
            chunk.export(chunk_path, format="mp3")

            # Append details to mapping list
            mapping.append({
                "audio_no": audio_no,      # Original audio file number
                "chunk_id": counter,       # Unique integer counter for each chunk
                "chunk_path": chunk_path   # Path to the chunk file
            })
            counter += 1

100%|██████████| 70/70 [08:35<00:00,  7.37s/it]


In [None]:
# Convert mapping to a DataFrame
df_mapping = pd.DataFrame(mapping)

# Save the DataFrame to a CSV for future reference
df_mapping.to_csv("./audio_chunk_mapping.csv", index=False)
print("Audio chunk mapping CSV created successfully.")

Audio chunk mapping CSV created successfully.


In [None]:
df_mapping = pd.read_csv("./audio_chunk_mapping.csv")

In [5]:
df_mapping.shape

(1385, 3)

In [11]:
df_mapping.head()

Unnamed: 0,audio_no,chunk_id,chunk_path
0,179,0,./audio_chunks/audio_179_chunk_0.mp3
1,179,1,./audio_chunks/audio_179_chunk_1.mp3
2,179,2,./audio_chunks/audio_179_chunk_2.mp3
3,179,3,./audio_chunks/audio_179_chunk_3.mp3
4,179,4,./audio_chunks/audio_179_chunk_4.mp3


# S2

In [None]:
# Load the model
model = whisper.load_model("large", device=device)

100%|█████████████████████████████████████| 2.88G/2.88G [00:33<00:00, 91.9MiB/s]


In [None]:
# Transcribe each chunk
for index, row in tqdm(df_mapping.iterrows()):
    # Transcribe audio file
    result = model.transcribe(row['chunk_path'], task="translate")
    df_mapping.at[index, "transcription"] = result["text"]

# Save updated mapping with transcriptions
df_mapping.to_csv("./audio_chunk_mapping_with_transcription.csv", index=False)

1385it [2:00:12,  5.21s/it]


# S3

### Editing the df to embed embeddings

In [None]:
# Initialize the model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

sentences = df_mapping['transcription'].tolist()  # Adjust the slice as needed
embeddings = model.encode(sentences)

# Convert embeddings to a DataFrame with column names embedding_1 to embedding_384
embedding_df = pd.DataFrame(embeddings, columns=[f'embedding_{i+1}' for i in range(embeddings.shape[1])])

# Concatenate the embeddings DataFrame with the original DataFrame
df_mapp = pd.concat([df_mapping, embedding_df], axis=1)

# Save updated mapping with transcriptions
df_mapp.to_csv("./audio_chunk_mapping_with_transcription_embeddings.csv", index=False)

df_mapp.head()

Unnamed: 0,audio_no,chunk_id,chunk_path,transcription,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383,embedding_384
0,278,0,./audio_chunks/audio_278_chunk_0.mp3,We are here to share our experiences with you...,0.043139,0.053966,0.036818,0.047007,0.024853,-0.034888,...,0.055317,0.061632,-0.106956,-0.027862,0.142631,-0.007118,-0.005574,0.028976,-0.047586,0.032412
1,278,1,./audio_chunks/audio_278_chunk_1.mp3,"We should be strong. At first, the farmers of...",-0.004163,0.00813,-0.091433,-0.040648,-0.032317,-0.024731,...,0.040692,-0.018176,-0.019196,-0.006972,0.024714,-0.039926,-0.052108,0.002863,-0.020194,0.012154
2,278,2,./audio_chunks/audio_278_chunk_2.mp3,The Raita Rao of Suttamuttala has been separa...,0.036603,0.003586,-0.061841,0.04202,0.007202,-0.035289,...,0.031426,-0.017646,-0.023406,-0.068935,0.005543,0.013447,0.038017,-0.021588,-0.012747,0.047803
3,278,3,./audio_chunks/audio_278_chunk_3.mp3,"In the village of Arelli, in survey number 59...",-0.011311,-0.010493,-0.050315,0.022067,0.023678,-0.033996,...,0.032218,0.009572,0.024307,0.062972,0.026778,-0.011161,-0.005104,0.032412,-0.048981,-0.073859
4,278,4,./audio_chunks/audio_278_chunk_4.mp3,we have done this pruning because the plant h...,0.004721,0.108863,-0.032003,-0.064745,0.04501,-0.040832,...,-0.006928,0.024223,0.053384,0.025787,-0.066563,0.009256,-0.058461,-0.018934,-0.062665,0.036947


### Directly using the sentences instead of embeddings

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

sentences = df_mapping['transcription'].tolist()
embeddings = model.encode(sentences)
result = model.transcribe(audio_path, task="translate")
embedding_query = model.encode(result['text'])

similarities = model.similarity(embeddings, embedding_query)

# S4

In [None]:
index_of_most_similar_item = int(similarities.argmax())
print(index_of_most_similar_item)

58
