## Install libraries

In [9]:
!pip uninstall tokenizers -y 
!pip install transformers
!pip install tokenizers

Found existing installation: tokenizers 0.13.3
Uninstalling tokenizers-0.13.3:
  Successfully uninstalled tokenizers-0.13.3
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Installing collected packages: tokenizers
Successfully installed tokenizers-0.13.3


In [10]:
from transformers import pipeline

## Sentimental Analysis

In [11]:
clf = pipeline("sentiment-analysis")
result = clf("what a beautiful daty!")[0]
print("Sentimental Analysis result: %s, Sentiment score: %0.4f" % (result['label'], result['score']))

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Text Generator

In [15]:
text_generator = pipeline("text-generation")
result = text_generator("Alice was beginning to get very tired of sitting by her sister on the bank,")
print(result[0]['generated_text'])

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Alice was beginning to get very tired of sitting by her sister on the bank, and she was a woman. What about if she was married like that? But she was a woman for a very long time, with her sister.

"And


## AutoClasses and Tokenizer

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [17]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

### Provide very similar text from GPT3

In [35]:
input_sentence ="She swiftly navigated through the bustling city streets."
target_sequence = "She deftly maneuvered amidst the crowded urban roads."

### Tokenization

In [36]:
tokens = tokenizer(input_sentence, target_sequence, return_tensors="pt")

In [37]:
logits = model(**tokens).logits

In [38]:
results = torch.softmax(logits, dim=1).tolist()[0]

In [39]:
for i, label in enumerate(['no', 'yes']):
    print(f"{label}: {int(round(results[i] * 100))}%")

no: 11%
yes: 89%


In [45]:
target_sequence = "He got lost in the tranquility of the quiet country lanes."
tokens = tokenizer(input_sentence, target_sequence, return_tensors="pt")
logits = model(**tokens).logits
result = torch.softmax(logits, dim=1).tolist()[0]

for i, label in enumerate(['no', 'yes']):
    print(f"{label}: {int(round(result[i] * 100))}%")

no: 77%
yes: 23%


## movie_reviews

In [47]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np

nltk.download('movie_reviews')

fileids = movie_reviews.fileids()

[nltk_data] Downloading package movie_reviews to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [48]:
reviews = [movie_reviews.raw(fileid) for fileid in fileids]
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids]

In [49]:
label_dict = {'pos':1, 'neg':0}
y = np.array([label_dict[c] for c in categories])

X_train, X_test, y_train, y_test = train_test_split(reviews, y, test_size = 0.2, random_state=7)

In [50]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [52]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [54]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [55]:
model = model.to(device)

In [57]:
batch_size = 10
y_pred = []
num_batch = len(y_test)//batch_size

In [64]:
for i in range(num_batch):
    inputs = tokenizer(
        X_test[i*batch_size:(i+1)*batch_size],
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    inputs = inputs.to(device)
    logits = model(**inputs).logits
    pred = F.softmax(logits, dim=-1)
    results = pred.cpu().detach().numpy().argmax(axis=1)
    y_pred.extend(results.tolist())
    
torch.cuda.empty_cache()
score = sum(y_test ==np.array(y_pred))/len(y_test)
print("NLTK score:", score)

NLTK score: 0.8425
