In [None]:
import csv
import re

import numpy as np
import torch
from datasets import load_dataset
from sklearn import linear_model
from transformers import AutoModel, AutoTokenizer

In [None]:
def _write_csv(path, rows, single_row=False):
    """Write ``rows`` to ``path`` with ``csv.writer``.

    Args:
        path: Destination file path.
        rows: Iterable of records, or a single record when ``single_row`` is True.
        single_row: If True, ``rows`` is written as one CSV record.
    """
    # The csv module expects files opened with newline='' so that it controls
    # line endings itself (otherwise extra blank lines can appear on platforms
    # that translate '\n').
    with open(path, 'w', newline='') as f:
        wr = csv.writer(f)
        if single_row:
            wr.writerow(rows)
        else:
            wr.writerows(rows)


def _bertweet_embed(texts, tokenizer, model, batch_size):
    """Encode ``texts`` in mini-batches and return ``(cls, avg)`` embedding arrays.

    Args:
        texts: List of raw document strings.
        tokenizer: Tokenizer called with padding/truncation to build model inputs.
        model: Model whose ``last_hidden_state`` provides the token vectors.
        batch_size: Number of documents per forward pass.

    Returns:
        Tuple of two numpy arrays with one row per document: the hidden state
        at position 0, and the mean of the hidden states from position 1 up to
        the last attended position.
    """
    cls_chunks, avg_rows = [], []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(texts[start:start + batch_size], padding=True,
                            truncation=True, return_tensors="pt")
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state

        # take cls token (position 0 of every sequence)
        cls_chunks.append(hidden[:, 0, :].numpy())

        # Number of attended (non-padding) positions per document.
        token_length = torch.count_nonzero(encoded.attention_mask, dim=1)
        for doc_vecs, end in zip(hidden, token_length):
            # Average positions 1..end-1: position 0 is skipped, padding is
            # excluded. NOTE(review): the last attended position is kept in
            # the mean; if the tokenizer appends a trailing special token it
            # is therefore averaged in as well.
            avg_rows.append(torch.mean(doc_vecs[1:int(end)], 0).numpy())

    return np.concatenate(cls_chunks), np.stack(avg_rows)


def bertweet_preprocessing(data, file_path, batch_size=64):
    """Extract BERTweet document embeddings, save them to CSV and sanity-check them.

    The train split is shuffled once with ``seed=0``; its first 20000 rows form
    the train set and the remainder the dev set. For each of train/dev/test the
    function writes ``cls_embedding_<split>.csv``, ``avg_embedding_<split>.csv``
    and ``gold_<split>.csv`` (labels as a single CSV row), then fits a logistic
    regression on each embedding type and prints train/dev/test accuracy.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` splits, each exposing
            ``'text'`` and ``'label'`` columns (the train split must support
            ``.shuffle(seed=...)`` and slicing).
        file_path: Prefix prepended verbatim to every output file name
            (e.g. ``'./data/'``).
        batch_size: Documents per forward pass. Encoding in mini-batches keeps
            memory bounded; padding is applied per batch, so values may differ
            from a single full-split pass by floating-point noise.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

    # Shuffle once and slice twice (same seed => same order as two shuffles).
    shuffled_train = data['train'].shuffle(seed=0)
    splits = {
        'train': shuffled_train[:20000],
        'dev': shuffled_train[20000:],
        'test': data['test'],
    }

    cls_embedding, avg_embedding, labels = {}, {}, {}
    for name, split in splits.items():
        # tokenize, encode and embed the split
        cls_embedding[name], avg_embedding[name] = _bertweet_embed(
            list(split['text']), tokenizer, bertweet, batch_size)
        labels[name] = np.array(split['label'])

        # write cls token, avg embedding and true labels to csv files
        _write_csv(file_path + 'cls_embedding_' + name + '.csv', cls_embedding[name])
        _write_csv(file_path + 'avg_embedding_' + name + '.csv', avg_embedding[name])
        _write_csv(file_path + 'gold_' + name + '.csv', list(split['label']), single_row=True)

    # test on logistic regression
    # Previously recorded accuracies (train, dev, test):
    #   cls_embedding: 0.8539  0.844   0.8432
    #   avg_embedding: 0.8603  0.8514  0.85348
    for title, features in (('cls_embedding', cls_embedding),
                            ('avg_embedding', avg_embedding)):
        print(title)
        lgr = linear_model.LogisticRegression()
        lgr.fit(features['train'], labels['train'])
        for name in ('train', 'dev', 'test'):
            print(lgr.score(features[name], labels[name]))

In [None]:
def _senti_weighted_counts(texts, token_to_col, senti_weights):
    """Build sentiment-weighted token-count vectors for ``texts``.

    Args:
        texts: Iterable (with ``len``) of raw document strings.
        token_to_col: Dict mapping each kept vocabulary token to its column.
        senti_weights: Array of shape ``(1, n_columns)`` holding the per-token
            sentiment value each count column is multiplied by.

    Returns:
        Dense array of shape ``(len(texts), n_columns)``.
    """
    doc_vectors = np.zeros((len(texts), senti_weights.shape[1]))
    for row, doc in enumerate(texts):
        # remove <br /> or 's or '-'>1 and lower()
        doc = re.sub(r"<br />|\'s|--+", " ", doc).lower()

        # Count only tokens that are in the kept vocabulary. Stop words were
        # already dropped when that vocabulary was built, so no separate
        # stop-word test is needed here.
        for token in tokenize(doc):
            col = token_to_col.get(token)
            if col is not None:
                doc_vectors[row, col] += 1

    # Scale in place to avoid allocating a second dense matrix.
    doc_vectors *= senti_weights
    return doc_vectors


def pos_neg_vocab_preprocessing(data, thres=1):
    """Featurise documents with a polar-word vocabulary and fit logistic regression.

    Keeps vocabulary entries that are not stop words and whose absolute
    sentiment value exceeds ``thres``, represents every document as its counts
    of those tokens multiplied by the tokens' sentiment values, fits a logistic
    regression on the train split and prints train/dev/test accuracy.

    Relies on ``load_vocab``, ``load_vocab_senti_value``, ``tokenize`` and
    ``stops``, which are not defined in this part of the notebook and are
    assumed to be provided by another cell or module.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` splits, each exposing
            ``'text'`` and ``'label'`` columns (the train split must support
            ``.shuffle(seed=...)`` and slicing).
        thres: Minimum absolute sentiment value (exclusive) for a token to be
            kept. A recorded run with ``thres=1.0`` kept 18119 tokens.
    """
    # Shuffle once and slice twice (same seed => same order as two shuffles).
    shuffled_train = data['train'].shuffle(seed=0)
    train_data, dev_data = shuffled_train[:20000], shuffled_train[20000:]
    test_data = data['test']

    vocab = load_vocab('./imdb.vocab')
    vocab_senti_value = load_vocab_senti_value('./imdbEr.txt')

    # extract the idx of pos and neg token
    vocab_pos_neg_idx = [i for i, v in enumerate(vocab)
                         if v not in stops and abs(vocab_senti_value[i]) > thres]

    # keep only the pos and neg token from vocab; indexing by the kept
    # positions avoids a list-membership test per vocabulary entry.
    vocab = [vocab[i] for i in vocab_pos_neg_idx]
    vocab_senti = np.array([vocab_senti_value[i] for i in vocab_pos_neg_idx]).reshape(1, -1)

    # build imdb vocabulary dict:
    t2id = {t: i for i, t in enumerate(vocab)}

    train_doc_vectors = _senti_weighted_counts(train_data['text'], t2id, vocab_senti)
    dev_doc_vectors = _senti_weighted_counts(dev_data['text'], t2id, vocab_senti)
    test_doc_vectors = _senti_weighted_counts(test_data['text'], t2id, vocab_senti)

    lgr = linear_model.LogisticRegression()
    lgr.fit(train_doc_vectors, np.array(train_data['label']))
    # Report accuracy like bertweet_preprocessing does, so the fitted model is
    # not silently discarded.
    print(lgr.score(train_doc_vectors, np.array(train_data['label'])))
    print(lgr.score(dev_doc_vectors, np.array(dev_data['label'])))
    print(lgr.score(test_doc_vectors, np.array(test_data['label'])))
    # Previously recorded accuracies (train, dev, test): 0.94965 0.8648  0.85156

In [None]:
dataset = load_dataset('imdb')
# Extracts the BERTweet embeddings and writes the CSV files under ./data/.
bertweet_preprocessing(dataset, './data/')
# Pass the `dataset` loaded above; no variable named `data` exists at this level.
pos_neg_vocab_preprocessing(dataset)

instructions:

Above is the code for extracting the document embeddings. Those embeddings are already saved in './data', so you don't need to regenerate them (it's time-consuming).

You can read the CSV files directly to get the embeddings and use them to train the models. (I also ran logistic regression on them, and the results are shown above; you can try it yourself to make sure the embeddings are good.)

The models you can try with these embeddings are logistic regression, a neural network (recommended), random forest/decision tree, and ensemble models. Naive Bayes cannot be run on these precomputed embeddings.

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# The recorded run of this cell installed datasets 1.16.1.
%pip install datasets

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[?25l[K     |█                               | 10 kB 18.3 MB/s eta 0:00:01[K     |██▏                             | 20 kB 20.9 MB/s eta 0:00:01[K     |███▎                            | 30 kB 23.1 MB/s eta 0:00:01[K     |████▍                           | 40 kB 23.6 MB/s eta 0:00:01[K     |█████▌                          | 51 kB 22.9 MB/s eta 0:00:01[K     |██████▋                         | 61 kB 23.3 MB/s eta 0:00:01[K     |███████▊                        | 71 kB 21.4 MB/s eta 0:00:01[K     |████████▉                       | 81 kB 22.3 MB/s eta 0:00:01[K     |█████████▉                      | 92 kB 22.7 MB/s eta 0:00:01[K     |███████████                     | 102 kB 24.1 MB/s eta 0:00:01[K     |████████████                    | 112 kB 24.1 MB/s eta 0:00:01[K     |█████████████▏                  | 122 kB 24.1 MB/s eta 0:00:01[K     |██████████████▎                 | 133 kB 24.1 MB/s et

In [None]:
from datasets import load_dataset
import numpy as np
import pandas as pd
# Load the IMDB dataset; the cells in this notebook read its 'train' and 'test' splits.
dataset = load_dataset('imdb')

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Raise pandas' per-cell display width so DataFrame output does not cut the
# review text short.
pd.set_option('display.max_colwidth', 1000)

# Build a frame from the train split and map the integer labels to readable
# names; the replacement is restricted to the label column.
df = pd.DataFrame(dataset['train'])
df['label'] = df['label'].replace({0: 'neg', 1: 'pos'})

# `.style.set_properties(...)` returns a new Styler and does not modify `df`,
# so the styled preview has to be the cell's last expression to be rendered.
df.head(3).style.set_properties(**{'text-align': 'left'})

Unnamed: 0,text,label
0,"I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered ""controversial"" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between,...",neg
1,"""I Am Curious: Yellow"" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) ""double-standard"" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same ...",neg
2,"If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />",neg
