In [1]:
import os
import random

import numpy as np
import torch
from transformers import *
from sklearn.mixture import GaussianMixture
# Use the GPU when one is available; the model and every input tensor are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Seed every RNG in play so that a fresh-kernel run is repeatable.
# `random` drives the sentence shuffle; numpy and torch are seeded as well so that
# any stochastic step relying on their global generators is repeatable too.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

You are using torchaudio==0.9.0, but torchaudio>=0.10.0 is required to use MCTCTFeatureExtractor. This requires torch>=1.10.0. Please upgrade torch and torchaudio.
You are using torch==1.9.0+cu111, but torch>=1.10.0 is required to use ViltModel. Please upgrade torch.


cuda


In [2]:
# Domain names. Each one is used both as a sub-directory of "multi_domain_new_split/"
# and as the class label attached to every sentence read from that directory.
LABELS = ['it', 'koran', 'law', 'medical', 'subtitles']


def get_sentences(path, sent_set='train', lang='en'):
    """Read one split of a corpus and return its lines, whitespace-stripped.

    The file that is read is ``path + sent_set + '.' + lang``, so ``path`` must
    already end with a path separator (e.g. ``"multi_domain_new_split/it/"``).

    Args:
        path: Directory prefix, including the trailing separator.
        sent_set: Split name, used as the file stem (e.g. 'train', 'dev', 'test').
        lang: Language code, used as the file extension.

    Returns:
        list[str]: One entry per line of the file, each passed through ``str.strip()``.
        Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print(f"Preparing {sent_set} data ....")
    filepath = path + sent_set + '.' + lang
    # The context manager closes the handle even if reading fails, and the explicit
    # encoding keeps decoding independent of the machine's locale.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm for non-English sides.
    with open(filepath, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


def process_data(lang='en'):
    """Load the train/test/dev sentences of every domain listed in ``LABELS``.

    Args:
        lang: Language code forwarded to ``get_sentences`` (it becomes the file extension).

    Returns:
        dict: ``{label: {'train': [...], 'test': [...], 'dev': [...]}}`` with one
        entry per label, in ``LABELS`` order.
    """
    corpus = {}
    for domain in LABELS:
        print(f"Processing '{domain}' tagged data ....")
        domain_dir = "multi_domain_new_split/" + domain + "/"
        # Splits are read in a fixed order: train, then test, then dev.
        corpus[domain] = {
            split_name: get_sentences(domain_dir, split_name, lang)
            for split_name in ('train', 'test', 'dev')
        }
    return corpus
        

def get_embeddings(model_info, data):
    """Encode labelled sentences into mean-pooled last-hidden-state vectors.

    Args:
        model_info: Tuple ``(model_class, tokenizer_class, model_name)``. Both
            classes are instantiated via ``from_pretrained(model_name)``.
        data: Iterable of items where index 0 is the sentence text and index 1
            is its label.

    Returns:
        dict: numpy arrays under the keys 'sents', 'vectors' and 'labels',
        aligned by position. Each vector is the mean over token positions of
        the first element of the model output for that sentence.
    """
    model_class, model_tokenizer, model_name = model_info
    tokenizer = model_tokenizer.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    model.to(device)

    texts, pooled_vectors, tags = [], [], []
    # Sentences are encoded one at a time (batch of one), so no padding is involved.
    for n_done, item in enumerate(data, start=1):
        text = item[0]
        texts.append(text)
        tags.append(item[1])

        # Special tokens are added and inputs are truncated to 128 token ids.
        token_ids = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True)
        batch = torch.tensor([token_ids]).to(device)

        with torch.no_grad():
            # First output element, with the batch dimension dropped.
            token_states = model(batch)[0].squeeze(dim=0)
            # Keep at most as many positions as there were input ids.
            token_states = token_states[:batch.shape[1], :]
            # Average over positions and move the result to host memory.
            pooled_vectors.append(token_states.mean(dim=0).cpu().numpy())

        if n_done % 10000 == 0:
            print(f"{n_done} sentences done.")

    return {
        'sents': np.array(texts),
        'vectors': np.array(pooled_vectors),
        'labels': np.array(tags),
    }


def data_to_embeddings(dataset, model_info, split='train', max_per_label=150000):
    """Pool one split across all labels, shuffle it, and embed it.

    Args:
        dataset: Mapping ``{label: {split: sequence_of_sentences}}`` containing
            every label in ``LABELS``.
        model_info: Forwarded unchanged to ``get_embeddings``.
        split: Which split of each label to use.
        max_per_label: Keep at most this many leading sentences per label so
            that no single label dominates the pooled set. ``None`` disables
            the cap. Defaults to 150000.

    Returns:
        Whatever ``get_embeddings`` returns for the shuffled ``(sentence, label)`` pairs.
    """
    sentences = []
    for label in LABELS:
        data = dataset[label][split]
        if max_per_label is not None and len(data) > max_per_label:
            data = data[:max_per_label]
        sentences.extend((sent, label) for sent in data)

    # Shuffles in place using the module-level `random` generator.
    random.shuffle(sentences)
    return get_embeddings(model_info, sentences)

In [3]:
#RUN THIS FOR ORIGINAL DATASET

dataset = process_data(lang='en')
model_info = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')


def _embed_and_save(split):
    """Embed one split and write its arrays to ``<split>/{sents,vectors,labels}.npy``.

    Returns the embeddings dict so the caller can keep working with it in memory.
    """
    embeddings = data_to_embeddings(dataset, model_info, split=split)
    # np.save does not create missing directories, so make sure the target exists
    # before writing; otherwise the embedding work for this split would be lost.
    os.makedirs(split, exist_ok=True)
    for key in ('sents', 'vectors', 'labels'):
        np.save(f'{split}/{key}.npy', embeddings[key])
    return embeddings


# Each split is written to disk as soon as it has been embedded, so a failure in a
# later split does not discard the splits that already finished.
train_embeddings = _embed_and_save('train')
dev_embeddings = _embed_and_save('dev')
test_embeddings = _embed_and_save('test')

Processing 'it' tagged data ....
Preparing train data ....
Preparing test data ....
Preparing dev data ....
Processing 'koran' tagged data ....
Preparing train data ....
Preparing test data ....
Preparing dev data ....
Processing 'law' tagged data ....
Preparing train data ....
Preparing test data ....
Preparing dev data ....
Processing 'medical' tagged data ....
Preparing train data ....
Preparing test data ....
Preparing dev data ....
Processing 'subtitles' tagged data ....
Preparing train data ....
Preparing test data ....
Preparing dev data ....


loading file vocab.txt from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/tokenizer_config.json
loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,


10000 sentences done.
20000 sentences done.
30000 sentences done.
40000 sentences done.
50000 sentences done.
60000 sentences done.
70000 sentences done.
80000 sentences done.
90000 sentences done.
100000 sentences done.
110000 sentences done.
120000 sentences done.
130000 sentences done.
140000 sentences done.
150000 sentences done.
160000 sentences done.
170000 sentences done.
180000 sentences done.
190000 sentences done.
200000 sentences done.
210000 sentences done.
220000 sentences done.
230000 sentences done.
240000 sentences done.
250000 sentences done.
260000 sentences done.
270000 sentences done.
280000 sentences done.
290000 sentences done.
300000 sentences done.
310000 sentences done.
320000 sentences done.
330000 sentences done.
340000 sentences done.
350000 sentences done.
360000 sentences done.
370000 sentences done.
380000 sentences done.
390000 sentences done.
400000 sentences done.
410000 sentences done.
420000 sentences done.
430000 sentences done.
440000 sentences don

loading file vocab.txt from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/tokenizer_config.json
loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,


10000 sentences done.


loading file vocab.txt from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/tokenizer_config.json
loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,


10000 sentences done.


In [None]:
#RUN THIS FOR CFILT ENGLISH HINDI DATA

# from datasets import load_dataset
# dataset = load_dataset("cfilt/iitb-english-hindi")
# model_info = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
# # print(len(dataset['train']['translation']))
# embeddings = data_to_embeddings(dataset, model_info, split='train')