In [3]:
# Reference video: https://www.youtube.com/watch?v=GSt00_-0ncQ

In [None]:
# Pipeline

In [2]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [3]:
# Checkpoint identifier handed to `from_pretrained` when loading the model and tokenizer.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [4]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [4]:
# Wrap the already-loaded model and tokenizer in a sentiment-analysis pipeline.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
def predict(text):
    """Return the label the pipeline assigns to `text`.

    The text is sent to `classifier` as a one-element batch and the
    'label' entry of the first result is returned. If the classifier
    yields no results, None is returned.
    """
    predictions = classifier([text])
    return next((prediction['label'] for prediction in predictions), None)

In [None]:
# Try the helper on a sample sentence; the returned label is the cell's result.
predict("Hello, world!")

In [None]:
# Model and Tokenizer

In [2]:
import torch
import torch.nn.functional as func

In [7]:
# Reload tokenizer and model from the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Three views of the same sentence:
#   tokens    - the string pieces produced by the tokenizer
#   token_ids - the ids of exactly those pieces
#   input_ids - the full result of calling the tokenizer directly
tokens = tokenizer.tokenize("Hello, world!")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tokenizer("Hello, world!")

# One value per line, in the same order as before.
print(tokens, input_ids, token_ids, sep="\n")

['hello', ',', 'world', '!']
{'input_ids': [101, 7592, 1010, 2088, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}
[7592, 1010, 2088, 999]


In [9]:
# Single-sentence batch used by the manual tokenizer/model walkthrough.
X_train = ["Hello, world!"]

In [11]:
# Tokenize the whole list at once, padding/truncating and returning PyTorch tensors.
batch = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(batch)

{'input_ids': tensor([[ 101, 7592, 1010, 2088,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [None]:
# Inference only, so run the forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs)
    # Normalise the raw logits with a softmax over dim=1.
    predictions = func.softmax(outputs.logits, dim=1)
    print(predictions)
    # Position of the largest value along dim=1 for every input row.
    label_ids = torch.argmax(predictions, dim=1)
    print(label_ids)
    # `id2label` is a mapping, so it has to be indexed; calling it raises TypeError.
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

In [None]:
# Write the tokenizer and the model into the same directory so they can be
# reloaded together from that path.
saved_dir = "saved"
tokenizer.save_pretrained(saved_dir)
model.save_pretrained(saved_dir)

In [None]:
# Reload both artifacts from the local `saved_dir` directory.
model = AutoModelForSequenceClassification.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)

In [4]:
# Model Hub

In [5]:
# Point `model_name` at a different checkpoint; later `from_pretrained(model_name)` calls use it.
model_name = "oliverguhr/german-sentiment-bert"

In [None]:
# Load the model and its matching tokenizer for the checkpoint in `model_name`.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# German example sentences to classify.
# The first entry was garbled ("auten Ecaebnis"); restored to readable German.
X_train = [
    "Mit keinem guten Ergebnis",
    "Das war unfair",
    "Das ist gar nicht mal so gut",
    "nicht so schlecht wie erwartet",
    "Das war gut!",
    "Sie fährt ein grünes Auto. ",
]

In [None]:
# Tokenize all sentences as one padded batch of PyTorch tensors.
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)

# `torch.no_grad` must be called to obtain the context manager.
with torch.no_grad():
    outputs = model(**batch)
    # argmax needs the logits tensor, not the model's output object.
    label_ids = torch.argmax(outputs.logits, dim=1)
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(label_ids)
    print(labels)

In [1]:
# Fine-tune DistilBERT on the IMDb reviews

In [7]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [2]:
# Base checkpoint used for the tokenizer and model in the fine-tuning cells below.
model_name = "distilbert-base-uncased"

In [3]:
def read_imdb_split(split_dir):
    """Read one split of a pos/neg review directory tree.

    Args:
        split_dir: Path (str or Path) to a directory containing a ``pos``
            and a ``neg`` sub-directory, each holding one text file per
            review.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``texts``
        holds the file contents; ``labels`` holds 1 for files found under
        ``pos`` and 0 for files found under ``neg``. All ``pos`` entries
        come before the ``neg`` entries.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []

    for label_dir in ["pos", "neg"]:
        # Sort so the result order does not depend on the file system.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels

In [None]:
# Read the raw train and test reviews from disk.
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

# Hold out 20% of the training reviews for validation. A fixed seed keeps the
# split identical across runs.
# NOTE: this reassigns train_texts/train_labels, so re-running only this cell
# splits the already-reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=42
)

In [None]:
class IMDbDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a sequence indexable by
            sample position (one entry per sample).
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        # The parameter must be named `labels`; the body stores it on the
        # instance and must not pick up an unrelated global of that name.
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

In [None]:
# Load the fast DistilBERT tokenizer for the checkpoint named by `model_name`.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [None]:
# Tokenize each split with identical settings (truncate and pad).
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
# Wrap each split's encodings and labels in an IMDbDataset.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [8]:
# Settings passed to the Trainer, grouped by concern.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Run length and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Optimisation settings
    warmup_steps=500,
    weight_decay=0.01,
)

In [None]:
# Load a DistilBERT sequence-classification model from the `model_name` checkpoint.
model = DistilBertForSequenceClassification.from_pretrained(model_name)

In [None]:
# Bundle the model, its training configuration and the train/validation datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Start training with the configuration and datasets given to the Trainer.
trainer.train()