In [1]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp39-cp39-win_amd64.whl (554 kB)
Collecting pyarrow!=4.0.0,>=3.0.0
  Downloading pyarrow-6.0.1-cp39-cp39-win_amd64.whl (15.5 MB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
Collecting xxhash
  Downloading xxhash-2.0.2-cp39-cp39-win_amd64.whl (35 kB)
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp39-cp39-win_amd64.whl (45 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp39-cp39-win_amd64.whl (122 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp39-cp39-win_amd64.whl (83 kB)
Installing collected packages: multidict, frozenlist, yarl, async-timeout, aiosignal, fsspec, aiohttp, xxhash, pyarrow, dataset

In [2]:
from transformers import BertModel, BertTokenizer, logging
import torch
import numpy as np
import re
import glob
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from datasets import load_dataset
# Only surface transformers log messages at error level.
logging.set_verbosity_error()

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# output_hidden_states=True makes the model return every layer's hidden
# states; the embedding functions below read them from the model output.
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

In [3]:
# Fetch the IMDB dataset from the Hugging Face hub (cached after the first call).
DATASET_NAME = "imdb"
raw_datasets = load_dataset(DATASET_NAME)

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to C:\Users\Alex\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to C:\Users\Alex\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
# (Assign device = 'cpu' here instead to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
device

'cuda'

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is asked to pad to ``max_length`` and to truncate, so every
    encoded sequence comes back with the same length.

    Args:
        examples: mapping with a ``"text"`` entry (a batch of strings when
            used with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def embed_texts(x):
    """Turn tokenized text into sentence embeddings with the module-level BERT model.

    The last four hidden layers are summed and then averaged over the token
    axis. When ``x`` carries an ``'attention_mask'`` it is passed to the model
    and used for the average, so padding positions neither receive attention
    nor dilute the mean. Without a mask every position counts equally.

    Args:
        x: mapping with ``'input_ids'`` — either one sequence of token ids or
            a batch (list of sequences) — and optionally a matching
            ``'attention_mask'``.

    Returns:
        ``{'embedding': ndarray}``. The array has shape ``(hidden,)`` for a
        single sequence and ``(batch, hidden)`` for a batch. A batch of one
        keeps its batch axis so the result still lines up with the input rows.
    """
    token_ids = torch.LongTensor(x['input_ids']).to(device)

    # The model call needs a (batch, seq) tensor; remember whether we had to
    # add the batch axis so it can be removed again at the end.
    unbatched = token_ids.dim() == 1
    if unbatched:
        token_ids = token_ids.unsqueeze(0)

    if 'attention_mask' in x:
        mask = torch.LongTensor(x['attention_mask']).to(device)
        if mask.dim() == 1:
            mask = mask.unsqueeze(0)
    else:
        # No mask supplied: treat every position as a real token.
        mask = torch.ones_like(token_ids)

    # Same as embed_tokens: make sure dropout is disabled before embedding.
    model.eval()
    with torch.no_grad():
        out = model(token_ids, attention_mask=mask)
        hidden_states = out[2]

        # sum up last four layers for improved performance
        summed = torch.stack([hidden_states[i] for i in (-1, -2, -3, -4)]).sum(dim=0)

        # Masked mean over the token axis; clamp avoids 0/0 on an all-zero mask.
        weights = mask.unsqueeze(-1).to(summed.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        pooled = (summed * weights).sum(dim=1) / token_counts

    if unbatched:
        pooled = pooled[0]

    return {'embedding': pooled.cpu().numpy()}

def embed_tokens(examples):
    """Embed a batch of tokenized examples with the module-level BERT model.

    Intended for ``Dataset.map(..., batched=True)``. For each example the last
    four hidden layers are summed and averaged over the token axis. When the
    batch has an ``'attention_mask'`` column it is passed to the model and
    used for the average, so ``[PAD]`` positions (the tokenizer pads every
    text to ``max_length``) neither receive attention nor dilute the mean.
    Without that column every position counts equally.

    Args:
        examples: mapping with ``'input_ids'`` (list of equal-length token-id
            lists) and optionally a matching ``'attention_mask'``.

    Returns:
        ``{'embedding': [ndarray, ...]}`` with one 1-D array per example.
    """
    if len(examples['input_ids']) == 0:
        # Nothing to embed; skip the forward pass entirely.
        return {'embedding': []}

    # load tokens on to gpu
    tokens = torch.LongTensor(examples['input_ids']).to(device)
    if 'attention_mask' in examples:
        mask = torch.LongTensor(examples['attention_mask']).to(device)
    else:
        # No mask supplied: treat every position as a real token.
        mask = torch.ones_like(tokens)

    model.eval()
    with torch.no_grad():
        out = model(tokens, attention_mask=mask)
        # contains hidden states of all batch_size tensors
        hidden_states = out[2]

        # sum up last four layers for better performance according to BERT paper
        summed = torch.stack([hidden_states[j] for j in (-1, -2, -3, -4)]).sum(dim=0)

        # Masked mean over the token axis; clamp avoids 0/0 on an all-zero mask.
        weights = mask.unsqueeze(-1).to(summed.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        pooled = (summed * weights).sum(dim=1) / token_counts

        # One device-to-host copy for the whole batch instead of one per row and layer.
        embeddings = pooled.cpu().numpy()

    # Drop the device tensors before asking torch to release cached GPU memory.
    del tokens, mask, out, hidden_states, summed, weights, token_counts, pooled
    torch.cuda.empty_cache()
    return {'embedding': list(embeddings)}

In [None]:
# Tokenize every split of the dataset, feeding the tokenizer whole batches of texts.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
# The previous cell already tokenized every split into `tokenized_datasets`;
# reuse those splits instead of running the tokenizer over them a second time.
X_train = tokenized_datasets['train']
X_test = tokenized_datasets['test']

  0%|          | 0/25 [00:00<?, ?ba/s]

In [None]:
# Embed the tokenized training reviews, 100 examples per call to embed_tokens.
X_train_emb = X_train.map(function=embed_tokens, batched=True, batch_size=100)

  0%|          | 0/250 [00:00<?, ?ba/s]

In [None]:
# Embed the tokenized test reviews, 100 examples per call to embed_tokens.
X_test_emb = X_test.map(function=embed_tokens, batched=True, batch_size=100)

  0%|          | 0/250 [00:00<?, ?ba/s]

In [None]:
# Persist the embeddings computed on Colab: one pickle per (split, column).
import pickle

# (destination file, embedded dataset, column to dump), written in this order.
pickle_targets = [
    ('/content/drive/MyDrive/MLfile/train_embedding.pkl', X_train_emb, 'embedding'),
    ('/content/drive/MyDrive/MLfile/train_label.pkl', X_train_emb, 'label'),
    ('/content/drive/MyDrive/MLfile/test_embedding.pkl', X_test_emb, 'embedding'),
    ('/content/drive/MyDrive/MLfile/test_label.pkl', X_test_emb, 'label'),
]
for target_path, embedded_split, column in pickle_targets:
    with open(target_path, 'wb') as f:
        # Convert the column to a NumPy array so it can be loaded without `datasets`.
        pickle.dump(np.array(embedded_split[column]), f)

In [33]:
import pickle

# Local copy of the pickled training arrays. On Colab the same files were
# written under '/content/drive/MyDrive/MLfile/'.
path = 'embedding/'

loaded = []
for file_name in ('train_embedding.pkl', 'train_label.pkl'):
    with open(path + file_name, 'rb') as f:
        loaded.append(pickle.load(f))
# First file holds the embeddings, second the labels.
X, y = loaded

print(y.shape, X.shape)
X.shape

(25000,) (25000, 768)


(25000, 768)

In [34]:
from sklearn.utils import shuffle

# Shuffle embeddings and labels together (same permutation for both) with a
# fixed seed. Note that X and y are overwritten, so running this cell again
# shuffles the already-shuffled arrays.
SHUFFLE_SEED = 0
X, y = shuffle(X, y, random_state=SHUFFLE_SEED)

In [35]:
# Training split: the first 20000 rows of the shuffled arrays.
train_rows = slice(0, 20000)
X_train = np.array(X)[train_rows]
y_train = np.array(y)[train_rows]
y_train.shape

(20000,)

In [36]:
# Validation split: every row from index 20000 onward.
valid_rows = slice(20000, None)
X_valid, y_valid = np.array(X)[valid_rows], np.array(y)[valid_rows]
X_valid.shape

(5000, 768)

In [23]:
from transformers import TrainingArguments

# Default training arguments; "test_trainer" is the output directory.
training_args = TrainingArguments(output_dir="test_trainer")

In [37]:
# Display the label array (NumPy elides the middle of long arrays).
y

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Wrap the NumPy splits as (features, labels) tensor datasets.
# torch.Tensor(...) builds tensors of torch's default tensor type, for the
# labels as well as the features.
train_dataset = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))

# tensor_x / tensor_y end up holding the validation tensors.
tensor_x, tensor_y = torch.Tensor(X_valid), torch.Tensor(y_valid)
valid_dataset = TensorDataset(tensor_x, tensor_y)

In [None]:
from transformers import Trainer

# NOTE(review): `model` is the BertModel loaded at the top of the notebook,
# while train_dataset / valid_dataset hold precomputed embedding tensors
# rather than token ids, and `trainer` is not used by any later cell shown
# here — confirm whether this cell is still needed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Elastic-net logistic regression on the BERT embeddings. The 'saga' solver
# is paired with the elasticnet penalty; max_iter is set very high.
logreg_params = dict(
    random_state=0,
    max_iter=100000,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    C=10,
)
clf = LogisticRegression(**logreg_params).fit(X_train, y_train)

In [41]:
from sklearn import svm

# RBF-kernel SVM on the same embeddings; this rebinds `clf`, replacing any
# classifier fitted earlier under that name.
clf = svm.SVC(kernel='rbf').fit(X_train, y_train)
clf

SVC()

In [None]:
# Mean absolute difference between predictions and labels; with 0/1 labels
# this is the fraction of misclassified training examples.
train_pred = clf.predict(X_train)
train_error = np.abs(train_pred - y_train).mean()
print('training error is', train_error)

In [None]:
# Mean absolute difference between predictions and labels; with 0/1 labels
# this is the fraction of misclassified validation examples.
valid_pred = clf.predict(X_valid)
valid_error = np.abs(valid_pred - y_valid).mean()
print('validation error is', valid_error)