### Chapter 4: Text Classification Using Both Generative and Representation Models

In [6]:
from datasets import load_dataset

In [7]:
# Load the Rotten Tomatoes dataset; the result is kept in `data` for all later cells

data = load_dataset(path="rotten_tomatoes")

In [5]:
# Print the dataset container to show its splits, feature names and row counts
print(data)
# Display the first training example (a dict with 'text' and 'label') as the cell output
data["train"][0]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})


{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [19]:
# Display the last training example as the cell output
data["train"][-1]

{'text': 'things really get weird , though not particularly scary : the movie is all portent and no content .',
 'label': 0}

### Text Classification with Representation Models

##### Using a task-specific model

In [20]:
import torch
from transformers import pipeline

# Path to our Hugging Face model
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (more slowly) on machines without a GPU instead of
# raising when the pipeline tries to move the model to "cuda:0".
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer into a pipeline.
# return_all_scores=True is kept on purpose: the inference cell reads the
# per-class scores by position (index 0 and index 2).
# NOTE(review): newer transformers releases deprecate this flag in favour of
# top_k=None, which reportedly orders scores by value rather than by class
# index -- do not switch without also updating the positional indexing.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device=device,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
# Run inference on the test split

# Each pipeline result holds one score dict per class. Position 0 is read as
# the negative score and position 2 as the positive score; position 1 is
# ignored (presumably the neutral class -- confirm against the model config).
y_pred = []
scored_reviews = pipe(KeyDataset(data["test"], "text"))
for class_scores in tqdm(scored_reviews, total=len(data["test"])):
    binary_scores = [class_scores[0]["score"], class_scores[2]["score"]]
    # argmax over [negative, positive] yields 0 for negative, 1 for positive
    y_pred.append(np.argmax(binary_scores))
    


00%|██████████████████████████████████| 1066/1066 [00:06<00:00, 162.26it/s]

In [22]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred, target_names=('Negative review', 'Positive Review')):
    """Create and print a classification report.

    Args:
        y_true: Ground-truth labels, passed unchanged to ``classification_report``.
        y_pred: Predicted labels, passed unchanged to ``classification_report``.
        target_names: Display names for the classes, in class order. Defaults
            to the two review classes used throughout this notebook, so
            existing two-argument calls behave exactly as before.

    Returns:
        None. The report is printed rather than returned.
    """
    # Hand over a fresh list so the default tuple is never shared or mutated.
    performance = classification_report(y_true, y_pred, target_names=list(target_names))
    print(performance)

In [24]:
# Score the task-specific model's predictions against the test labels
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)

                 precision    recall  f1-score   support

Negative review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



#### Classification Tasks That Leverage Embeddings

###### Supervised Classification

In [25]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by `model_name`
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [26]:
# Convert the text of the train and test splits into embeddings
# (train is encoded first, then test; each shows its own progress bar)
train_embeddings, test_embeddings = (
    model.encode(data[split]["text"], show_progress_bar=True)
    for split in ("train", "test")
)

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [27]:
# Inspect the embedding matrix dimensions: (number of training texts, embedding size)
train_embeddings.shape

(8530, 768)

In [28]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier on the training embeddings, using the
# training labels as targets; random_state is fixed so reruns are repeatable

clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])


In [30]:
# Next, evaluate the trained classifier on the test embeddings

y_pred = clf.predict(X=test_embeddings)
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)

                 precision    recall  f1-score   support

Negative review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



### Zero Shot Model

In [33]:
# Create embeddings for the textual descriptions of our two labels
# (index 0 = negative, index 1 = positive, matching the dataset's label ids)

label_descriptions = ["A negative review", "A positive review"]
label_embeddings = model.encode(label_descriptions)

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
# Find the best-matching label for each document: compare every test embedding
# with every label embedding and keep the index of the most similar label

sim_matrix = cosine_similarity(X=test_embeddings, Y=label_embeddings)
y_pred = sim_matrix.argmax(axis=1)

In [42]:
# Score the zero-shot predictions against the test labels
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)

                 precision    recall  f1-score   support

Negative review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066

