In [17]:
from transformers import BertModel, BertTokenizer, logging
import torch
import numpy as np
import re
import glob
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from datasets import load_dataset
# Only show errors from the transformers library (hides its info/warning chatter).
logging.set_verbosity_error()

# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states, which embed_texts reads.
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

In [18]:
# Load the "imdb" dataset via the `datasets` library (the log below shows it
# being reused from the local Hugging Face cache rather than re-downloaded).
raw_datasets = load_dataset("imdb")

Reusing dataset imdb (/Users/alexdong/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [25]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): no visible cell moves `model` onto this device — confirm the
# model and the tensors fed to it end up on the same device before using CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [44]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    The module-level `tokenizer` is called with truncation enabled and
    padding set to "max_length", and its result is returned unchanged.

    Args:
        examples: Mapping that provides the raw strings under the "text" key.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

def embed_texts(x):
    """Embed ONE tokenized example as a single fixed-size vector.

    The hidden states of the model's last four layers are summed, then
    mean-pooled over the real (non-padding) tokens only.

    Args:
        x: Mapping for a single example. Must contain 'input_ids' (a list of
            token ids). If it also contains 'attention_mask' (1 = real token,
            0 = padding), the mask is passed to the model and used for
            pooling; without it every position is treated as a real token.

    Returns:
        dict: {'embedding': numpy.ndarray}, a 1-D array whose length is the
        model's hidden size.
    """
    # Put the inputs wherever the model's weights live, so the two can never
    # end up on different devices.
    model_device = model.device
    # unsqueeze(0) adds the batch dimension the model expects: (1, seq_len).
    input_ids = torch.tensor(
        x['input_ids'], dtype=torch.long, device=model_device
    ).unsqueeze(0)

    if 'attention_mask' in x:
        attention_mask = torch.tensor(
            x['attention_mask'], dtype=torch.long, device=model_device
        ).unsqueeze(0)
    else:
        attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Passing the mask keeps the model from attending to padding tokens.
        out = model(input_ids, attention_mask=attention_mask)
        # The per-layer hidden states sit at index 2 of the model output
        # (the model is loaded with output_hidden_states=True).
        hidden_states = out[2]

    # Sum the last four layers -> (1, seq_len, hidden).
    summed_layers = torch.stack(tuple(hidden_states[-4:])).sum(dim=0)

    # Masked mean over the sequence axis: padding positions get weight 0 so
    # they do not dilute the embedding. clamp() guards an all-zero mask.
    token_weights = attention_mask.unsqueeze(-1).to(summed_layers.dtype)
    pooled = (summed_layers * token_weights).sum(dim=1) / token_weights.sum(dim=1).clamp(min=1)

    # Drop the batch dimension: (1, hidden) -> (hidden,).
    return {'embedding': pooled.squeeze(0).cpu().numpy()}

In [7]:
# Apply tokenize_function to every split of raw_datasets; batched=True hands
# the function a batch of examples per call instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [20]:
# Tokenize only the 'train' split with the same function and batching.
# NOTE(review): this looks equivalent to tokenized_datasets['train'] — confirm
# and reuse that result if both cells are meant to stay.
X_train = raw_datasets['train'].map(tokenize_function, batched=True)

Loading cached processed dataset at /Users/alexdong/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-1e75eaf5a9f85e16.arrow


In [None]:
# Smoke test: embed the first tokenized training example and display the
# returned {'embedding': ...} dict.
embed_texts(X_train[0])

In [48]:
# View the tokenized train split as a pandas DataFrame.
# NOTE(review): this converts every row of X_train just to display a preview;
# consider something like pd.DataFrame(X_train[:5]) if a few rows are enough.
pd.DataFrame(X_train)

Unnamed: 0,attention_mask,input_ids,label,text,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 12524, 1045, 2572, 8025, 1011, 375...",0,I rented I AM CURIOUS-YELLOW from my video sto...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1000, 1045, 2572, 8025, 1024, 3756, 1000...",0,"""I Am Curious: Yellow"" is a risible and preten...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2065, 2069, 2000, 4468, 2437, 2023, 2828...",0,If only to avoid making this type of film in t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2023, 2143, 2001, 2763, 4427, 2011, 2643...",0,This film was probably inspired by Godard's Ma...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2821, 1010, 2567, 1012, 1012, 1012, 2044...",0,"Oh, brother...after hearing about this ridicul...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
24995,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1037, 2718, 2012, 1996, 2051, 2021, 2085...",1,A hit at the time but now better categorised a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24996,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2293, 2023, 3185, 2066, 2053, 2060...",1,I love this movie like no other. Another time ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24997,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2023, 2143, 1998, 2009, 1005, 1055, 8297...",1,This film and it's sequel Barry Mckenzie holds...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24998,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1005, 1996, 7357, 1997, 6287, 18506, 100...",1,'The Adventures Of Barry McKenzie' started lif...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
