In [24]:
import os
import joblib
import numpy as np
import torch
from datasets import load_dataset
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from transformers import AutoModel, DistilBertTokenizer

In [25]:
# YELP Dataset
yelp = load_dataset("yelp_review_full")

# create dataset
train_ds = yelp["train"].select(range(1000))

In [26]:
# Model and Tokenizer
model_name = "distilbert-base-uncased"
# Check if GPU is available and use it, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [27]:
# Tokenizer
text = "Hello, this is a sample sentence!"
encoded_text = tokenizer(text, return_tensors="pt").to(device)

# Tokens
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"][0])

In [28]:
tokenizer.convert_tokens_to_string(tokens)

'[CLS] hello , this is a sample sentence ! [SEP]'

In [29]:
# how large is the vocabulary?
tokenizer.vocab_size

30522

In [30]:
# Max context length
max_context_length = tokenizer.model_max_length
max_context_length

512

In [31]:
# Function for tokenization
def tokenize_text(batch):
    # padding...texts are filled with zeros based to longest example
    # truncation...texts are cut off after max_context_length
    return tokenizer(
        batch["text"], return_tensors="pt", padding="max_length", truncation=True
    )

In [32]:
yelp_encodings = train_ds.map(tokenize_text, batched=True, batch_size=128)

yelp_encodings.set_format(
    "torch", columns=["input_ids", "attention_mask", "label"]
)  # encodings need to be converted to torch tensors

In [33]:
def get_last_hidden_state(batch):
    inputs = {
        k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names
    }
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
        # [:, 0] refers to CLS token for complete sentence representation
    return {"hidden_state": last_hidden_state[:, 0].cpu()}

In [34]:
yelp_hidden_states = yelp_encodings.map(
    get_last_hidden_state, batched=True, batch_size=128
)  # will have additional column 'hidden_state'

yelp_hidden_states

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 1000
})

In [35]:
os.makedirs("model", exist_ok=True)
joblib.dump(yelp_hidden_states, "model/yelp_hidden_states.joblib")

['model/yelp_hidden_states.joblib']

In [36]:
cutoff = 800
X_train = np.array(yelp_hidden_states["hidden_state"][:cutoff])
y_train = np.array(yelp_hidden_states["label"][:cutoff])
X_test = np.array(yelp_hidden_states["hidden_state"][cutoff:])
y_test = np.array(yelp_hidden_states["label"][cutoff:])
print(f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}")
print(f"X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")

X_train.shape: (800, 768), y_train.shape: (800,)
X_test.shape: (200, 768), y_test.shape: (200,)


In [37]:
# Dummy model
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(X_train, y_train)
dummy_model.score(X_test, y_test)

0.165

In [38]:
# SVM model
svm_model = SVC(kernel="linear", C=1.0, random_state=42)
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

0.395

In [39]:
# Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.43

In [40]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_model.score(X_test, y_test)
# %%

0.33