In [1]:
import pandas as pd

# Specify the path to your TSV file
tsv_file_path = '/content/target.tsv'

# Use pandas to read the TSV file. Since it's a TSV, we specify the delimiter as '\t' for tab.
data_frame = pd.read_csv(tsv_file_path, delimiter='\t')

# Now `data_frame` holds your data as a DataFrame. You can view the first few rows to verify it's loaded correctly.
print(data_frame.head())


                           path  \
0  common_voice_en_39586386.mp3   
1  common_voice_en_39586596.mp3   
2  common_voice_en_39586629.mp3   
3  common_voice_en_39587000.mp3   
4  common_voice_en_39587700.mp3   

                                            sentence  
0  Well, that's not an indictment of Michael Jack...  
1  The definitive treatment for Heyde's syndrome ...  
2  Piles of stones may be seen at the partings of...  
3  Items can also be mixed together to create dif...  
4  Yahia Nader is an Egyptian player born in the ...  


In [2]:
# Use pandas to read the TSV file. Since it's a TSV, specify the delimiter as '\t' for tab.
data_frame1 = pd.read_csv(tsv_file_path, delimiter='\t', usecols=['path', 'sentence'])

# Rename the 'file_name' column in data_frame2 to 'path' to match the column name in data_frame1 for easy merging
data_frame1 = data_frame1.rename(columns={'path' : 'file_name' })

# Now `data_frame` holds only the columns for 'path' and 'sentence'. You can view the first few rows to verify.
print(data_frame1.head())

                      file_name  \
0  common_voice_en_39586386.mp3   
1  common_voice_en_39586596.mp3   
2  common_voice_en_39586629.mp3   
3  common_voice_en_39587000.mp3   
4  common_voice_en_39587700.mp3   

                                            sentence  
0  Well, that's not an indictment of Michael Jack...  
1  The definitive treatment for Heyde's syndrome ...  
2  Piles of stones may be seen at the partings of...  
3  Items can also be mixed together to create dif...  
4  Yahia Nader is an Egyptian player born in the ...  


In [3]:
# Specify the path where you want to save the new file
output_file_path = 'extracted_data_en.tsv'

# Write the DataFrame to a new CSV file
# If you want to save it as a TSV file instead, just change the file extension to '.tsv' and set sep='\t'
data_frame.to_csv(output_file_path, index=False)

print(f"Data extracted and saved to {output_file_path}")

Data extracted and saved to extracted_data_en.tsv


In [4]:
import pandas as pd

# Specify the path to your TSV file
tsv_file_path = '/content/transcriptions_3000.tsv'

# Use pandas to read the TSV file. Since it's a TSV, we specify the delimiter as '\t' for tab.
data_frame2 = pd.read_csv(tsv_file_path, delimiter='\t')
data_frame2["file_name"] = data_frame2["file_name"].str.replace("English/", "", regex=False)

# Now `data_frame` holds your data as a DataFrame. You can view the first few rows to verify it's loaded correctly.
print(data_frame2.head())

                      file_name  \
0  common_voice_en_39586557.mp3   
1  common_voice_en_39586558.mp3   
2  common_voice_en_39586559.mp3   
3  common_voice_en_39586560.mp3   
4  common_voice_en_39586561.mp3   

                                       transcription  
0  THE FOLLOWING YEAR JANIS MARIE JOHNSON RELEASE...  
1  CAULIERE LEFT TO BECOME ATHLETIC DIRECTOR AT B...  
2  SOME OF THE CITY'S MOST SPECTACULAR VIEWS ARE ...  
3         HE IS BURIED AT THE PARTANCURSION CEMETERY  
4                           THE DATIANS REBUILD THEM  


In [5]:
# Merge the two DataFrames based on the 'path' column to get corresponding 'sentence' rows from data_frame2
merged_data_frame = pd.merge(data_frame1, data_frame2, on='file_name', how='inner')

final_data_frame = merged_data_frame[['file_name', 'sentence', 'transcription']]

# If you want to write this final DataFrame to a TSV file:
final_data_frame.to_csv('joined_sentences_transcriptions.tsv', sep='\t', index=False)

print("Joining complete. The final joined data is saved in 'joined_sentences_transcriptions.tsv'")
print(final_data_frame.head())

Joining complete. The final joined data is saved in 'joined_sentences_transcriptions.tsv'
                      file_name  \
0  common_voice_en_39586596.mp3   
1  common_voice_en_39586629.mp3   
2  common_voice_en_39587000.mp3   
3  common_voice_en_39587700.mp3   
4  common_voice_en_39587747.mp3   

                                            sentence  \
0  The definitive treatment for Heyde's syndrome ...   
1  Piles of stones may be seen at the partings of...   
2  Items can also be mixed together to create dif...   
3  Yahia Nader is an Egyptian player born in the ...   
4  Norris Green Park is situated between Broad La...   

                                       transcription  
0  THE DEFINITE TREATMENT FOR HADE'S CINDRUM IS S...  
1  PILES OF STONES MAY BE SEEN AT THE PARTING OF ...  
2  ITEMS CAN BE MIXED TOGETHER TO CREATE DIFFEREN...  
3  YEHEANEDIER IS AN EGYPTIAN PLAYER BORN IN THE ...  
4  NORRAS GREEN PARK IS LOCATED BETWEEN BROAD LAN...  


In [6]:
!pip install simalign
import simalign

Collecting simalign
  Downloading simalign-0.4-py3-none-any.whl (8.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->simalign)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->simalign)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->simalign)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch->simalign)
  Downloading nvi

In [7]:
from simalign import SentenceAligner
import pandas as pd

# Assuming your DataFrame is named df and has columns 'sentence' and 'transcription'
# Load your DataFrame here if it's not already loaded
# df = pd.read_csv('path_to_your_file.csv')

# Initialize the SentenceAligner
aligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="i")

# Prepare lists to store aligned sentences
aligned_sentences = []

# Function to preprocess a sentence
def preprocess_sentence(sentence):
    if not isinstance(sentence, str):
        sentence = str(sentence)
    # Split the sentence into words
    return sentence.split()

def calculate_unalignment_ratio(alignment_pairs, sentence_length, transcription_length):
    # Extracting indices from the pairs
    first_indices = {pair[0] for pair in alignment_pairs}
    second_indices = {pair[1] for pair in alignment_pairs}
    print(first_indices)
    print(second_indices)
    # Counting unique unaligned indices based on the ranges of sentence and transcription lengths
    unaligned_first = sentence_length - len(first_indices)
    unaligned_second = transcription_length - len(second_indices)

    # Total unaligned count
    total_unaligned = unaligned_first + unaligned_second
    print(unaligned_first)
    print(unaligned_second)
    # Dividing by the transcription length to find the ratio
    unalignment_ratio = (total_unaligned / sentence_length) * 100

    return unalignment_ratio



# Preprocess sentences in both columns
final_data_frame['sentence'] = final_data_frame['sentence'].apply(preprocess_sentence)
final_data_frame['transcription'] = final_data_frame['transcription'].apply(preprocess_sentence)


# Iterate over the rows of the DataFrame
for idx, row in final_data_frame.iterrows():
    # Get the sentences to align
    sentence_1 = row['sentence']
    sentence_2 = row['transcription']
    # Perform alignment
    alignments = aligner.get_word_aligns(sentence_1, sentence_2)
    print(alignments['itermax'])
    # Calculate the ratio or percentage
    unalignment_ratio = calculate_unalignment_ratio(alignments['itermax'], len(sentence_1), len(sentence_2))
    print(f"Unalignment Ratio: {unalignment_ratio:.2f}%")
    #aligned_pairs = alignments.get('intersect')
    aligned_sentences.append((sentence_1, sentence_2, alignments,unalignment_ratio))
aligned_df = pd.DataFrame(aligned_sentences, columns=['Sentence 1', 'Sentence 2', 'Alignments','unalignment_ratio'])
print(aligned_df.head())
aligned_df.to_csv('aligned_sentences_eng_3000.tsv', index=False)
print("Alignment complete. Aligned sentences saved to 'aligned_sentences.tsv'.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

2024-03-28 03:41:19,072 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased
INFO:simalign.simalign:Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{0, 1, 2, 3, 4, 9, 10, 12, 13}
{0, 1, 2, 3, 4, 9, 10, 11, 12, 13}
5
4
Unalignment Ratio: 64.29%
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10), (11, 11), (12, 12), (13, 13)]
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
0
0
Unalignment Ratio: 0.00%
[(0, 0), (1, 1), (2, 2), (3, 2), (4, 4), (5, 5), (6, 7), (7, 7), (9, 9)]
{0, 1, 2, 3, 4, 5, 6, 7, 9}
{0, 1, 2, 4, 5, 7, 9}
1
3
Unalignment Ratio: 40.00%
[(0, 0), (1, 1), (2, 3), (5, 5)]
{0, 1, 2, 5}
{0, 1, 3, 5}
2
2
Unalignment Ratio: 66.67%
[(0, 0), (1, 1), (4, 4), (5, 5), (7, 7), (8, 4), (10, 10)]
{0, 1, 4, 5, 7, 8, 10}
{0, 1, 4, 5, 7, 10}
4
5
Unalignment Ratio: 81.82%
[(0, 0), (1, 1), (4, 4), (5, 0), (7, 7), (8, 8), (9, 9), (10, 10)]
{0, 1, 4, 5, 7, 8, 9, 10}
{0, 1, 4, 7, 8, 9, 10}
3
4
Unalignment Ratio: 63.64%
[(0, 0), (1, 1), (2, 2), (3, 3), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9)]
{0

In [14]:
#print(aligned_df['unalignment_ratio'])
print(f"The mean unligned ratio is: {aligned_df['unalignment_ratio'].mean():.2f}%" )

The mean unligned ratio is: 49.91%
