In [1]:
import numpy as np
import torch
from transformers import *
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import time
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
np.random.seed(42)

You are using torchaudio==0.9.0, but torchaudio>=0.10.0 is required to use MCTCTFeatureExtractor. This requires torch>=1.10.0. Please upgrade torch and torchaudio.
You are using torch==1.9.0+cu111, but torch>=1.10.0 is required to use ViltModel. Please upgrade torch.


cuda


In [2]:
embeddings = {}
embeddings['sents'] = np.load('openaiGPT/sentences.npy')
embeddings['vectors'] = np.load('openaiGPT/vectors.npy')
embeddings['domain'] = np.load('openaiGPT/domain.npy')
embeddings['sentiment'] = np.load('openaiGPT/sentiment.npy')
DOMAINS = ['Automotive', 'Books', 'Music', 'Software', 'Baby']

def create_known_sample(embeddings):
    known_sample = {}
    known_sample['sents'] = []
    known_sample['vectors'] = []
    known_sample['domain'] = []
    known_sample['sentiment'] = []
    count = 0
    for i, domain in enumerate(embeddings['domain']):
        count += 1
        known_sample['sents'].append(embeddings['sents'][i])
        known_sample['vectors'].append(embeddings['vectors'][i])
        known_sample['domain'].append(embeddings['domain'][i])
        known_sample['sentiment'].append(embeddings['sentiment'][i])
        if count >= 4000:
            break
    embeddings['sents'] = embeddings['sents'][4000:]
    embeddings['vectors'] = embeddings['vectors'][4000:]
    embeddings['domain'] = embeddings['domain'][4000:]
    embeddings['sentiment'] = embeddings['sentiment'][4000:]
    
    return known_sample, embeddings

In [3]:
def domain_cosine_sort(embeddings, known_sample, domain='Music'):
    count = 0
    sorted_embeddings = {}
    known_vector = np.zeros((embeddings['vectors'].shape[1], ))
    for i, vector in enumerate(known_sample['vectors']):
        if known_sample['domain'][i] == domain:
            known_vector += vector
            count += 1
    known_vector /= count
    
    count = 1
    score = []
    for i, vector in enumerate(embeddings['vectors']):
        cosine_score = cosine_similarity(vector.reshape(1,-1), known_vector.reshape(1,-1))
#         print(cosine_score[0][0], end=" ")
        score.append(-1.0 * cosine_score[0][0])
#     print(score)
    score = np.array(score)
    sort_index = np.argsort(score)
    sorted_score = [-1.0 * score[i] for i in sort_index]
#     print(sorted_score)
    sorted_embeddings['sents'] = [embeddings['sents'][i] for i in sort_index]
    sorted_embeddings['vectors'] = [embeddings['vectors'][i] for i in sort_index]
    sorted_embeddings['domain'] = [embeddings['domain'][i] for i in sort_index]
    sorted_embeddings['sentiment'] = [embeddings['sentiment'][i] for i in sort_index]
    sorted_embeddings['score'] = sorted_score
    return sorted_embeddings

def sampling(sorted_embeddings, neg_sampling=False):
    dataset = {}
    dataset['sents'] = []
    dataset['vectors'] = []
    dataset['domain'] = []
    dataset['sentiment'] = []
    dataset['labels'] = []
    for i in range(5000):
        dataset['sents'].append(sorted_embeddings['sents'][i])
        dataset['vectors'].append(sorted_embeddings['vectors'][i])
        dataset['domain'].append(sorted_embeddings['domain'][i])
        dataset['sentiment'].append(sorted_embeddings['sentiment'][i])
        dataset['labels'].append(1)
    
    if not neg_sampling:
        random_index = np.arange(5000, len(sorted_embeddings['sents']), 1)
        np.random.shuffle(random_index)
        random_index = random_index[:20000]
        for idx in random_index:
            dataset['sents'].append(sorted_embeddings['sents'][idx])
            dataset['vectors'].append(sorted_embeddings['vectors'][idx])
            dataset['domain'].append(sorted_embeddings['domain'][idx])
            dataset['sentiment'].append(sorted_embeddings['sentiment'][idx])
            dataset['labels'].append(0)
    
    else:
        for i in range(1, 40001):
            idx = -1 * i
            if sorted_embeddings['score'][idx] < 0.4:
                dataset['sents'].append(sorted_embeddings['sents'][idx])
                dataset['vectors'].append(sorted_embeddings['vectors'][idx])
                dataset['domain'].append(sorted_embeddings['domain'][idx])
                dataset['sentiment'].append(sorted_embeddings['sentiment'][idx])
                dataset['labels'].append(0)
    return dataset

In [4]:
def domain_P_R_F(dataset, domain):
    tp = sum([1 for i in range(len(dataset['labels'])) if (dataset['labels'][i] == 1 and dataset['domain'][i] == domain)])
    fp = sum([1 for i in range(len(dataset['labels'])) if (dataset['labels'][i] == 1 and dataset['domain'][i] != domain)])
    fn = sum([1 for i in range(len(dataset['labels'])) if (dataset['labels'][i] == 0 and dataset['domain'][i] == domain)])
#     print(tp, fp, fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = (2 * precision * recall) / (precision + recall)
    return precision, recall, F1

In [5]:
known_sample, embeddings = create_known_sample(embeddings)
for dom in DOMAINS:
    sorted_embeddings = domain_cosine_sort(embeddings, known_sample, domain=dom)
    dataset = sampling(sorted_embeddings, neg_sampling=False)
#     print(len(dataset['labels']))
    precision, recall, F1 = domain_P_R_F(dataset, dom)
    print(f'Domain: {dom} => Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {F1:.4f}')

Domain: Automotive => Precision: 0.6546 | Recall: 0.5304 | F1: 0.5860
Domain: Books => Precision: 0.8782 | Recall: 0.6565 | F1: 0.7513
Domain: Music => Precision: 0.9850 | Recall: 0.7060 | F1: 0.8225
Domain: Software => Precision: 0.8606 | Recall: 0.6355 | F1: 0.7311
Domain: Baby => Precision: 0.8192 | Recall: 0.6226 | F1: 0.7075


In [6]:
# DOMAINS = ['Automotive', 'Books', 'Music', 'Software', 'Baby']
for dom in DOMAINS:
    sorted_embeddings = domain_cosine_sort(embeddings, known_sample, domain=dom)
    dataset = sampling(sorted_embeddings, neg_sampling=True)
#     print(len(dataset['labels']))
    precision, recall, F1 = domain_P_R_F(dataset, dom)
    print(f'Domain: {dom} => Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {F1:.4f}')

Domain: Automotive => Precision: 0.6546 | Recall: 0.9951 | F1: 0.7897
Domain: Books => Precision: 0.8782 | Recall: 0.9941 | F1: 0.9326
Domain: Music => Precision: 0.9850 | Recall: 0.9972 | F1: 0.9910
Domain: Software => Precision: 0.8606 | Recall: 0.9981 | F1: 0.9243
Domain: Baby => Precision: 0.8192 | Recall: 0.9988 | F1: 0.9001


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
torch.cuda.empty_cache()

# class RNN(torch.nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super(RNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.input_2_hidden = torch.nn.Linear(input_size+hidden_size, hidden_size)
#         self.input_2_output = torch.nn.Linear(input_size+hidden_size, 1)
    
#     def forward(self, input_tensor, hidden_tensor):
#         concat = torch.cat((input_tensor, hidden_tensor), 1)
#         hidden = input_to_hidden(concat)
#         output = input_to_output(concat)
#         output = torch.sigmoid(output)
#         return hidden, output
    
#     def init_hidden_layer(self):
#         return torch.zeros(1, self.hidden_size)

In [6]:
class SentimentDataset(Dataset):
    def __init__(self, sent, sentiment, tokenizer, max_len):
        self.sents = sent
        self.sentiments = sentiment
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sents)
    
    def __getitem__(self, index):
        sent = self.sents[index]
        sentiment = self.sentiments[index]
        encoding = self.tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'sent': sent,
            'sentiment': torch.tensor(sentiment, dtype=torch.float),
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

In [7]:
known_sample, embeddings = create_known_sample(embeddings)
sorted_embeddings = domain_cosine_sort(embeddings, known_sample, domain='Books')
dataset = sampling(sorted_embeddings, neg_sampling=True)
dataset['sentiment'] = [int(x) for x in dataset['sentiment']]

MODEL = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(MODEL)
# bert = BertModel.from_pretrained(MODEL)
MAX_LEN = 200
BATCH_SIZE = 16

loading file vocab.txt from cache at /home1/tejomay/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer_config.json
loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e

In [8]:
X = dataset['sents']
y = dataset['sentiment']

temp = list(zip(X, y))
random.shuffle(temp)
X, y = zip(*temp)
X, y = list(X), list(y)

split = int(0.8 * len(X))
X_train = np.array(X[:split])
X_test = np.array(X[split:])
y_train = np.array(y[:split])
y_test = np.array(y[split:])

split = int(0.85 * X_train.shape[0])
X_val = X_train[split:]
y_val = y_train[split:]
X_train = X_train[:split]
y_train = y_train[:split]

print(X_train.shape)
train_dataset = SentimentDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(X_val, y_val, tokenizer, MAX_LEN)
test_dataset = SentimentDataset(X_test, y_test, tokenizer, MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=4)
val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=4)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=4)

(3437,)


In [9]:
class SentimentAnalyser(torch.nn.Module):
    def __init__(self):
        super(SentimentAnalyser, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL)
        self.dropout = torch.nn.Dropout(p=0.35)
        self.output = torch.nn.Linear(self.bert.config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        out = self.dropout(bert_output.pooler_output)
        out = self.output(out)
        out = torch.sigmoid(out)
        return out

In [10]:
model = SentimentAnalyser()
model = model.to(device)

loading configuration file config.json from cache at /home1/tejomay/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /home1/tejomay/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/pytorch_model.bin
Some weights of t

In [11]:
num_epochs = 10
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = torch.nn.BCELoss()

In [12]:
def eval_model(model, data_loader, loss_fn, n_examples):
    model = model.eval()

    losses = []
    correct = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d['sentiment']
            targets = torch.reshape(targets, (-1,1))
            targets = targets.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.round(outputs)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
            
            correct += torch.sum(preds == targets)

    return correct / n_examples, np.mean(losses)

def train(model, num_epochs, train_data_loader, val_data_loader, loss_fn, optimizer, scheduler, train_size, val_size):
    prev_val_acc = 0.0
    for e in range(num_epochs):
        model = model.train()
        losses = []
        correct = 0
        step = 1
        for data in train_data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            targets = data['sentiment']
            targets = torch.reshape(targets, (-1,1))
            targets = targets.to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.round(outputs)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
            
            correct += torch.sum(preds == targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if step%100 == 0:
                print(f'Step:{step} loss:{loss.item():.4f}')
            step += 1
        
        acc = correct / train_size
        mean_loss = np.mean(losses)
        
        val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, val_size)
#         val_acc_list.append(val_acc)
        
        print(f'Epoch:{e+1} Train acc:{acc:.4f} train loss:{mean_loss:.4f} || Val acc:{val_acc:.4f} prev_val_acc:{prev_val_acc:.4f}')
        if val_acc < prev_val_acc:
            break
        prev_val_acc = val_acc
    return model

In [13]:
model = train(model, num_epochs, train_data_loader, val_data_loader, loss_fn, optimizer, scheduler, len(train_dataset), len(val_dataset))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Step:100 loss:0.2856
Step:200 loss:0.2159


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch:1 Train acc:0.7867 train loss:0.4428 || Val acc:0.9094 prev_val_acc:0.0000


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Step:100 loss:0.0237
Step:200 loss:0.0105


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch:2 Train acc:0.9258 train loss:0.2220 || Val acc:0.9077 prev_val_acc:0.9094


In [14]:
acc, loss = eval_model(model, test_data_loader, loss_fn, len(test_dataset))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [15]:
print(acc, loss)

tensor(0.8864, device='cuda:0') 0.444679222702689
