In [6]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch
import heapq

# Index of the GPU this notebook prefers. It is only used when the host
# actually exposes that many CUDA devices; otherwise we fall back to the first
# GPU, or to the CPU when CUDA is unavailable. Hard-coding 'cuda:6' on a host
# with fewer GPUs yields a device that fails only later, on first use.
PREFERRED_GPU = 6

if not torch.cuda.is_available():
    device = torch.device('cpu')
elif torch.cuda.device_count() > PREFERRED_GPU:
    device = torch.device(f'cuda:{PREFERRED_GPU}')
else:
    device = torch.device('cuda:0')
device

device(type='cuda', index=6)

In [14]:
def cosine_sim(e1, e2):
    """Return the cosine similarity of two 1-D embedding vectors as a float.

    Args:
        e1: first vector (list, numpy array or torch tensor).
        e2: second vector, same length as ``e1``.

    Returns:
        Python float in [-1, 1]. A zero vector yields 0.0 rather than NaN.

    The vectors are normalised here, so the result is a true cosine even when
    the inputs are not unit-length (a plain dot product is only a cosine for
    unit-norm inputs).
    """
    # as_tensor avoids the copy (and the UserWarning) that torch.tensor()
    # produces when the input is already a tensor.
    a = torch.as_tensor(e1, dtype=torch.float32)
    # Compute on whatever device e1 already lives on; shipping two small
    # vectors to a GPU for a single dot product costs more than the product.
    b = torch.as_tensor(e2, dtype=torch.float32).to(a.device)
    return torch.nn.functional.cosine_similarity(a, b, dim=0).item()


def filter_data(dataset, lang1='en', lang2='hi', threshold=0.95):
    """Filter a parallel corpus using LaBSE sentence-embedding similarity.

    A pair is kept when all of the following hold:
      * both the ``lang1`` and ``lang2`` texts are non-empty,
      * the ``lang1`` text contains none of the blocked substrings
        ``<``, ``>``, ``%`` or a backslash followed by ``s``,
      * the similarity of the two sentence embeddings is strictly below
        ``threshold``.

    Args:
        dataset: iterable of mappings holding the sentence text under the
            keys ``lang1`` and ``lang2``.
        lang1: key of the side on which the symbol check is run.
        lang2: key of the other side.
        threshold: exclusive upper bound on the similarity of kept pairs.

    Returns:
        List of ``(similarity, {lang1: text, lang2: text})`` tuples in
        ascending order of similarity; ties keep their input order.
    """
    symbols = ('<', '>', '%', '\\s')
    model = SentenceTransformer('sentence-transformers/LaBSE').to(device)

    new_data = []
    for i, pair in enumerate(dataset):
        src, tgt = pair[lang1], pair[lang2]
        if src and tgt and not any(s in src for s in symbols):
            embeds = model.encode([src, tgt])
            sim = cosine_sim(embeds[0], embeds[1])
            if sim < threshold:
                new_data.append((sim, {lang1: src, lang2: tgt}))
        if (i+1) % 50000 == 0:
            print(f'{i+1} sentences processed.')

    # Order by score only. Pushing (sim, dict) tuples onto a heap compares the
    # dicts whenever two scores are equal, which raises TypeError; duplicate
    # sentence pairs produce exactly such ties. An ascending list still
    # satisfies the heap invariant, so heapq consumers keep working.
    new_data.sort(key=lambda scored: scored[0])
    print("Processing done.")
    return new_data

def write_data(data, PATH):
    """Write a parallel corpus to two line-aligned text files.

    The English side goes to ``PATH + 'filtered.en'`` and the Hindi side to
    ``PATH + 'filtered.hi'``. ``PATH`` is used as a plain string prefix, so
    include a trailing separator when it names a directory.

    Args:
        data: iterable whose items are either mappings with ``'en'`` and
            ``'hi'`` keys, or tuples/lists whose last element is such a
            mapping (the ``(similarity, pair)`` shape returned by
            ``filter_data``).
        PATH: string prefix prepended to both output file names.
    """
    # Explicit UTF-8: the Hindi side is Devanagari and the platform default
    # encoding may be unable to represent it. `with` closes both files even
    # if a write fails part-way through.
    with open(PATH + 'filtered.en', 'w', encoding='utf-8') as f_en, \
            open(PATH + 'filtered.hi', 'w', encoding='utf-8') as f_hi:
        for item in data:
            # Unwrap (score, pair) entries; plain mappings pass through as-is.
            pair = item[-1] if isinstance(item, (tuple, list)) else item
            f_en.write(pair['en'] + '\n')
            f_hi.write(pair['hi'] + '\n')

In [None]:
# Load the training split's translation pairs of the IITB English-Hindi
# corpus and run them straight through the similarity filter, so `dataset`
# is only ever bound to the filtered result.
dataset = filter_data(
    load_dataset("cfilt/iitb-english-hindi")['train']['translation']
)

Found cached dataset parquet (/root/.cache/huggingface/datasets/cfilt___parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

50000 sentences processed.
100000 sentences processed.
150000 sentences processed.
200000 sentences processed.
250000 sentences processed.
300000 sentences processed.


In [13]:
# Preview only the first few entries; displaying the whole filtered list
# would flood the notebook output.
dataset[:5]

[(0.8373191952705383,
  {'en': 'Accerciser Accessibility Explorer',
   'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}),
 (0.8552480936050415,
  {'en': 'Give your application an accessibility workout',
   'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'})]