In [56]:
import numpy as np
import random
import torch
from transformers import *
from sklearn.mixture import GaussianMixture
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Seed every RNG this notebook can draw from, not just the stdlib one:
# `random` drives the sentence shuffle, while numpy and torch have their own
# independent generators that `random.seed` does not touch.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)  # kept last: returns None, so the cell shows no stray Out[] value

cuda


In [57]:
# Domain names to load. process_data() reads "domain_dataset/<name>.txt" and
# "domain_dataset/<name>_labels.txt" for every entry, and data_to_embeddings()
# uses each name as the domain label attached to that domain's sentences.
DOMAINS = ['Automotive', 'Books', 'Music', 'Software', 'Baby']


def readfile(path):
    """Read a text file and return its non-blank lines, whitespace-stripped.

    Args:
        path: Path of the text file to read (opened in text mode with the
            platform's default encoding, as before).

    Returns:
        list[str]: One entry per line of the file, with leading/trailing
        whitespace removed. Lines that are empty after stripping are dropped.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # `with` guarantees the handle is closed even if reading raises part-way.
    with open(path, "r") as f:
        # Strip each line once and stream over the file instead of holding
        # the raw line list in memory first.
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line != '']


def process_data():
    """Load the sentences and sentiment labels of every domain in DOMAINS.

    For each domain the sentences are read from "domain_dataset/<domain>.txt"
    and the labels from "domain_dataset/<domain>_labels.txt" via readfile().

    Returns:
        dict: Maps each domain name to a dict with the keys 'sents' and
        'sentiment', each holding the list returned by readfile().
    """
    root = "domain_dataset/"
    loaded = {}
    for domain in DOMAINS:
        print(f"Processing '{domain}' tagged data ....")
        loaded[domain] = {
            'sents': readfile(root + domain + ".txt"),
            'sentiment': readfile(root + domain + "_labels.txt"),
        }
    return loaded
        

def get_embeddings(model_info, data):
    """Encode every sentence as the mean of the model's first output tensor.

    Args:
        model_info: 3-tuple ``(model_class, model_tokenizer, model_name)``.
            Both classes are instantiated with ``from_pretrained(model_name)``.
        data: Iterable of records indexed as ``record[0]`` (sentence text),
            ``record[1]`` (domain) and ``record[2]`` (sentiment).

    Returns:
        dict: numpy arrays under the keys 'sents', 'vectors', 'domain' and
        'sentiment', all aligned by position with the order of ``data``.
    """
    model_class, model_tokenizer, model_name = model_info
    tokenizer = model_tokenizer.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    model.to(device)

    texts, pooled, domains, sentiments = [], [], [], []
    for count, record in enumerate(data, start=1):
        texts.append(record[0])
        domains.append(record[1])
        sentiments.append(record[2])

        # Single-sentence batch, truncated to at most 128 token ids.
        token_ids = tokenizer.encode(record[0], add_special_tokens=True, max_length=128, truncation=True)
        input_encoding = torch.tensor([token_ids]).to(device)

        with torch.no_grad():
            # First model output with the batch axis removed (the last hidden layer).
            hidden = model(input_encoding)[0].squeeze(dim=0)
            # Keep only as many rows as there are input tokens.
            hidden = hidden[:input_encoding.shape[1], :]
            # Average over the token axis to get one vector per sentence.
            pooled.append(np.array(hidden.mean(dim=0).cpu()))

        if count % 2500 == 0:
            print(f"{count} sentences done.")

    return {
        'sents': np.array(texts),
        'vectors': np.array(pooled),
        'domain': np.array(domains),
        'sentiment': np.array(sentiments),
    }


def data_to_embeddings(dataset, model_info):
    """Flatten the per-domain dataset, shuffle it and embed every sentence.

    Args:
        dataset: Mapping from domain name to a dict with parallel 'sents' and
            'sentiment' sequences (the structure process_data() returns).
        model_info: Passed through unchanged to get_embeddings().

    Returns:
        The dict produced by get_embeddings() for the shuffled
        ``(sentence, domain, sentiment)`` records.
    """
    labelled = [
        (sent, domain, sentiment)
        for domain in DOMAINS
        for sent, sentiment in zip(dataset[domain]['sents'], dataset[domain]['sentiment'])
    ]
    # In-place shuffle with the stdlib RNG, so the order follows random.seed().
    random.shuffle(labelled)
    return get_embeddings(model_info, labelled)

In [58]:
# Run this cell to build embeddings for the original dataset.
# It reads every domain's files, then embeds all sentences with DistilBERT;
# `dataset` and `embeddings` are reused by the cells further down.

dataset = process_data()
# (model class, tokenizer class, pretrained checkpoint name) as unpacked by get_embeddings().
model_info = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
embeddings = data_to_embeddings(dataset, model_info)

Processing 'Automotive' tagged data ....
Processing 'Books' tagged data ....
Processing 'Music' tagged data ....
Processing 'Software' tagged data ....
Processing 'Baby' tagged data ....


loading file vocab.txt from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/tokenizer_config.json
loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,


2500 sentences done.
5000 sentences done.
7500 sentences done.
10000 sentences done.
12500 sentences done.
15000 sentences done.
17500 sentences done.
20000 sentences done.
22500 sentences done.
25000 sentences done.
27500 sentences done.
30000 sentences done.
32500 sentences done.
35000 sentences done.
37500 sentences done.
40000 sentences done.
42500 sentences done.
45000 sentences done.
47500 sentences done.
50000 sentences done.


In [59]:
# Persist each field of `embeddings` to its own .npy file in the working directory.
for out_file, field in (
    ('sentences.npy', 'sents'),
    ('vectors.npy', 'vectors'),
    ('domain.npy', 'domain'),
    ('sentiment.npy', 'sentiment'),
):
    np.save(out_file, embeddings[field])

In [51]:
# Sanity check: print how many sentences were loaded for each domain.
# NOTE(review): the saved output of this cell lists 'Sports' where DOMAINS now
# contains 'Baby', and its execution count (In [51]) is lower than the cells
# above it, so the output is stale; re-run after a kernel restart.
for domain in DOMAINS:
    print(domain, len(dataset[domain]['sents']))

Automotive 10000
Books 10000
Music 10000
Software 10000
Sports 10000
