In [65]:
import os
import random

import numpy as np
import torch
from transformers import *
from sklearn.mixture import GaussianMixture
# Seed Python's RNG up front; random.shuffle() is used later to order the sentences.
random.seed(42)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [77]:
# Domain names to process. Each name selects the files
# "domain_dataset/<name>.txt" and "domain_dataset/<name>_labels.txt" and is
# also attached to every sentence as its domain label.
DOMAINS = ['Automotive', 'Books', 'Music', 'Software', 'Baby']


def readfile(path):
    """Read a text file and return its non-blank lines, stripped.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: Each line with leading/trailing whitespace removed, in file
        order. Lines that are empty after stripping are dropped, so positions
        in the result do not necessarily match line numbers in the file.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line != '']


def process_data():
    """Load the sentences and sentiment labels for every domain in DOMAINS.

    For each domain, reads "domain_dataset/<domain>.txt" (sentences) and
    "domain_dataset/<domain>_labels.txt" (labels) with ``readfile``.

    Returns:
        dict: ``{domain: {'sents': list[str], 'sentiment': list[str]}}``.

    Raises:
        ValueError: If a domain has a different number of sentences and labels.
    """
    dataset = {}
    for domain in DOMAINS:
        print(f"Processing '{domain}' tagged data ....")
        sent_path = "domain_dataset/" + domain + ".txt"
        sentiment_path = "domain_dataset/" + domain + "_labels.txt"
        sents = readfile(sent_path)
        sentiment = readfile(sentiment_path)
        # Sentences and labels are paired positionally downstream (via zip), so
        # a count mismatch would silently drop or misalign labels. Fail loudly.
        if len(sents) != len(sentiment):
            raise ValueError(
                f"'{domain}': {len(sents)} sentences in {sent_path} but "
                f"{len(sentiment)} labels in {sentiment_path}"
            )
        dataset[domain] = {'sents': sents, 'sentiment': sentiment}
    return dataset
        

def get_embeddings(model_info, data):
    """Encode every record in ``data`` into one mean-pooled vector.

    Args:
        model_info: Triple ``(model_class, model_tokenizer, model_name)``; both
            classes are instantiated with ``from_pretrained(model_name)``.
        data: Iterable of indexable records where index 0 is the sentence text,
            index 1 the domain and index 2 the sentiment label.

    Returns:
        dict: NumPy arrays under the keys 'sents', 'vectors', 'domain' and
        'sentiment', aligned by position with the order of ``data``.
    """
    model_class, model_tokenizer, model_name = model_info
    tokenizer = model_tokenizer.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    model.to(device)

    texts, vectors, domains, sentiments = [], [], [], []
    for done, record in enumerate(data, start=1):
        texts.append(record[0])
        domains.append(record[1])
        sentiments.append(record[2])

        # Inputs are truncated to at most 128 token ids and run as a batch of one.
        token_ids = tokenizer.encode(record[0], add_special_tokens=True, max_length=128, truncation=True)
        input_encoding = torch.tensor([token_ids]).to(device)

        with torch.no_grad():
            output = model(input_encoding)
            # output[0] is taken as the last hidden layer; drop the batch axis.
            token_states = output[0].squeeze(dim=0)
            token_states = token_states[:input_encoding.shape[1], :]
            # Average over the token axis to get a single vector per sentence.
            pooled = token_states.mean(dim=0)
            vectors.append(np.array(pooled.cpu()))

        if done % 2500 == 0:
            print(f"{done} sentences done.")

    return {
        'sents': np.array(texts),
        'vectors': np.array(vectors),
        'domain': np.array(domains),
        'sentiment': np.array(sentiments),
    }


def data_to_embeddings(dataset, model_info):
    """Flatten ``dataset`` into shuffled labelled sentences and embed them.

    Args:
        dataset: Mapping ``domain -> {'sents': [...], 'sentiment': [...]}``
            containing an entry for every name in DOMAINS.
        model_info: Passed through unchanged to ``get_embeddings``.

    Returns:
        The dict returned by ``get_embeddings`` for the shuffled sentences.
    """
    # Build (sentence, domain, sentiment) triples, domain by domain.
    labelled = [
        (text, domain, label)
        for domain in DOMAINS
        for text, label in zip(dataset[domain]['sents'], dataset[domain]['sentiment'])
    ]
    # Shuffle in place so the domains are interleaved before encoding.
    random.shuffle(labelled)
    return get_embeddings(model_info, labelled)

In [79]:
# Run this cell to embed the original dataset.

# Load the sentences and sentiment labels for every domain in DOMAINS.
dataset = process_data()
# model_info is (model class, tokenizer class, pretrained checkpoint name).
# Alternative encoders are kept below; leave exactly one assignment active.
# NOTE: the save cell writes to 'openaiGPT/' regardless of which one is chosen.
# model_info = (BertModel, BertTokenizer, 'bert-base-uncased')
# model_info = (RobertaModel, RobertaTokenizer, 'roberta-large')
model_info = (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt')
# Shuffle the labelled sentences and compute one vector per sentence.
embeddings = data_to_embeddings(dataset, model_info)

Processing 'Automotive' tagged data ....
Processing 'Books' tagged data ....
Processing 'Music' tagged data ....
Processing 'Software' tagged data ....
Processing 'Baby' tagged data ....


Downloading:   0%|          | 0.00/816k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/458k [00:00<?, ?B/s]

loading file vocab.json from cache at /home1/tejomay/.cache/huggingface/hub/models--openai-gpt/snapshots/b3ab1942f7090e287d001cec22331dfc2764acf0/vocab.json
loading file merges.txt from cache at /home1/tejomay/.cache/huggingface/hub/models--openai-gpt/snapshots/b3ab1942f7090e287d001cec22331dfc2764acf0/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None


Downloading:   0%|          | 0.00/656 [00:00<?, ?B/s]

loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--openai-gpt/snapshots/b3ab1942f7090e287d001cec22331dfc2764acf0/config.json
Model config OpenAIGPTConfig {
  "_name_or_path": "openai-gpt",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.24.0",
  "vocab_size": 40478
}

ftfy or spacy is not installed using BERT BasicTokenizer instead of Spa

Downloading:   0%|          | 0.00/479M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /home1/tejomay/.cache/huggingface/hub/models--openai-gpt/snapshots/b3ab1942f7090e287d001cec22331dfc2764acf0/pytorch_model.bin
All model checkpoint weights were used when initializing OpenAIGPTModel.

All the weights of OpenAIGPTModel were initialized from the model checkpoint at openai-gpt.
If your task is similar to the task the model of the checkpoint was trained on, you can already use OpenAIGPTModel for predictions without further training.


2500 sentences done.
5000 sentences done.
7500 sentences done.
10000 sentences done.
12500 sentences done.
15000 sentences done.
17500 sentences done.
20000 sentences done.
22500 sentences done.
25000 sentences done.
27500 sentences done.
30000 sentences done.
32500 sentences done.
35000 sentences done.
37500 sentences done.
40000 sentences done.
42500 sentences done.
45000 sentences done.
47500 sentences done.
50000 sentences done.


In [80]:
# Persist each field of `embeddings` as its own .npy file.
OUTPUT_DIR = 'openaiGPT'
# np.save does not create missing parent directories, so make sure it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
np.save(f'{OUTPUT_DIR}/sentences.npy', embeddings['sents'])
np.save(f'{OUTPUT_DIR}/vectors.npy', embeddings['vectors'])
np.save(f'{OUTPUT_DIR}/domain.npy', embeddings['domain'])
np.save(f'{OUTPUT_DIR}/sentiment.npy', embeddings['sentiment'])

In [64]:
# Sanity check: report how many sentences were loaded for each domain.
for domain in DOMAINS:
    sentence_count = len(dataset[domain]['sents'])
    print(domain, sentence_count)

Automotive 10000
Books 10000
Music 10000
Software 10000
Baby 10000
