In [4]:
import numpy as np
import random
import torch
from transformers import *
from sklearn.mixture import GaussianMixture
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
random.seed(42)

cuda


In [5]:
def get_embeddings(model_info, data):
    model_class, model_tokenizer, model_name = model_info
    embeddings = {}
    embeddings['sents'] = []
    embeddings['vectors'] = []
    tokenizer = model_tokenizer.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    model.to(device)
    count = 1
    for sent in data:
        embeddings['sents'].append(sent)
        input_encoding = torch.tensor([tokenizer.encode(sent, add_special_tokens=True, max_length=128, truncation=True)])
        input_encoding = input_encoding.to(device)
        # print(input_encoding.shape)
        with torch.no_grad():
            output = model(input_encoding)
            final_hidden_layer = output[0].squeeze(dim=0)    # get the last hidden layer
            final_hidden_layer = final_hidden_layer[:input_encoding.shape[1], :]
            final_vector = np.array(final_hidden_layer.mean(dim=0).cpu())
            # print(final_vector.shape)
            embeddings['vectors'].append(final_vector)
        if count%10000 == 0:
            print(f"{count} sentences done.")
        count += 1
    
    embeddings['sents'] = np.array(embeddings['sents'])
    embeddings['vectors'] = np.array(embeddings['vectors'])
    return embeddings


def data_to_embeddings(dataset, model_info, split='train', lang='hi'):
    sentences = []
    data = dataset[split]['translation']
    random.shuffle(data)
    data = data[:400000]
    for parallel_sent in data:
        sent = parallel_sent[lang]
        sentences.append(sent)

    embeddings = get_embeddings(model_info, sentences)
    return embeddings

In [None]:
sentences = ["I saw a boy with a telescope", "I like this music"]
model_info = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
embeddings = get_embeddings(model_info, sentences)

In [6]:
from datasets import load_dataset
dataset = load_dataset("cfilt/iitb-english-hindi")
model_info = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
# print(len(dataset['train']['translation']))
embeddings = data_to_embeddings(dataset, model_info, split='train')

Using custom data configuration cfilt--iitb-english-hindi-911387c6837f8b91
Found cached dataset parquet (C:/Users/tejom/.cache/huggingface/datasets/cfilt___parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

loading file vocab.txt from cache at C:\Users\tejom/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\tejom/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\tejom/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,


10000 sentences done.
20000 sentences done.
30000 sentences done.
40000 sentences done.
50000 sentences done.
60000 sentences done.
70000 sentences done.
80000 sentences done.
90000 sentences done.
100000 sentences done.
110000 sentences done.
120000 sentences done.
130000 sentences done.
140000 sentences done.
150000 sentences done.
160000 sentences done.
170000 sentences done.
180000 sentences done.
190000 sentences done.
200000 sentences done.
210000 sentences done.
220000 sentences done.
230000 sentences done.
240000 sentences done.
250000 sentences done.
260000 sentences done.
270000 sentences done.
280000 sentences done.
290000 sentences done.
300000 sentences done.
310000 sentences done.
320000 sentences done.
330000 sentences done.
340000 sentences done.
350000 sentences done.
360000 sentences done.
370000 sentences done.
380000 sentences done.
390000 sentences done.
400000 sentences done.


MemoryError: Unable to allocate 10.8 GiB for an array with shape (400000,) and data type <U7216

In [None]:
np.save('sentences_hi.npy', embeddings['sents'])
np.save('vectors_hi.npy', embeddings['vectors'])
print(len(embeddings['sents']))

In [None]:
def GMM(data, num_clusters):
    model = GaussianMixture(
        n_components=num_clusters, 
        covariance_type='full', 
        max_iter=100, 
        random_state=0)

    model.fit(data)
    predictions = model.predict(data)
    