In [1]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the IMDB reviews CSV; the columns `review` and `sentiment` are used below.
df = pd.read_csv('./IMDB_dataset/IMDB dataset.csv')

# Encode the string sentiment as an integer class id (positive -> 1, negative -> 0).
# The mapped column is assigned back explicitly: `df.sentiment.replace(..., inplace=True)`
# mutates an intermediate Series, which pandas deprecates and which no longer
# updates `df` once Copy-on-Write is enabled.
label_ids = df["sentiment"].map({"positive": 1, "negative": 0})
unmapped = label_ids.isna()
if unmapped.any():
    # Fail loudly instead of letting non-numeric labels reach the trainer.
    unexpected = df.loc[unmapped, "sentiment"].unique().tolist()
    raise ValueError(f"Unexpected sentiment values: {unexpected}")
df["sentiment"] = label_ids.astype("int64")

# Column names expected by the tokenization and training cells below.
df = df.rename(columns={"review": "text", "sentiment": "label"})

# Fixed random_state keeps the 80/20 split reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [3]:
# Wrap each pandas split in a `Dataset`; preserve_index=False is passed so the
# DataFrame index is not carried over as an extra column.
dataset_train, dataset_test = (
    Dataset.from_pandas(frame, preserve_index=False) for frame in (train, test)
)

In [4]:
# Display the training dataset summary (feature names and row count).
dataset_train

Dataset({
    features: ['text', 'label'],
    num_rows: 40000
})

In [5]:
# Inspect one raw training example (the row at index 1).
dataset_train[1]

{'text': "I did not watch the entire movie. I could not watch the entire movie. I stopped the DVD after watching for half an hour and I suggest anyone thinking of watching themselves it stop themselves before taking the disc out of the case.<br /><br />I like Mafia movies both tragic and comic but Corky Romano can only be described as a tragic attempt at a mafia comedy.<br /><br />The problem is Corky Romano simply tries too hard to get the audience to laugh, the plot seems to be an excuse for moving Chris Kattan (Corky) from one scene to another. Corky himself is completely overplayed and lacks subtlety or credulity - all his strange mannerisms come across as contrived - Chris Kattan is clearly 'acting' rather than taking a role - it bounces you right out of the story. Each scene is utterly predictable, the 'comedic event' that will occur on the set is obvious as soon as each scene is introduced. In comedies such as Mr. Bean the disasters caused by the title character are funny becaus

In [6]:
# Load the tokenizer for the same checkpoint name that the model cell below uses.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the notebook's tokenizer.

    ``sample`` is indexed with the key ``"text"``; when called through
    ``Dataset.map(..., batched=True)`` that value is a batch of texts.
    The tokenizer is invoked with ``padding="max_length"`` and
    ``truncation=True``, and its output is returned unchanged.
    """
    texts = sample["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [8]:
# Apply `tokenize` to both splits; batched=True passes batches of rows per call.
dataset_train_tokenized, dataset_test_tokenized = (
    split.map(tokenize, batched=True) for split in (dataset_train, dataset_test)
)

                                                                   

In [9]:
# Display the tokenized training split to confirm the columns added by `tokenize`.
dataset_train_tokenized

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 40000
})

In [10]:
# Display the tokenized test split to confirm the columns added by `tokenize`.
dataset_test_tokenized

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

In [11]:
# Train on a 25,000-example subset drawn from the training split after a
# seeded shuffle; the Trainer below evaluates on the full tokenized test split.
small_train_dataset = (
    dataset_train_tokenized
    .shuffle(seed=42)
    .select(range(25000))
)

In [12]:
# Load the base checkpoint with a sequence-classification head for two labels
# (0 = negative, 1 = positive in the encoding used for the `label` column).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Print the module tree to inspect the architecture.
print(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Trainer does not automatically evaluate model performance during training. You’ll need to pass Trainer a function to compute and report metrics.

Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:

$$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$

where $TP$ = true positives, $TN$ = true negatives, $FP$ = false positives, and $FN$ = false negatives.

accuracy_metric = evaluate.load("accuracy")

results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1])

print(results)

{'accuracy': 1.0}


In [13]:
# Accuracy metric object; `compute_metrics` below calls `metric.compute(...)`.
metric = evaluate.load("accuracy")

axis = -1

Passing `axis=-1` makes `np.argmax` search along the last axis of the logits, so for each example it returns the index of the highest-scoring class.

In [14]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when the arg-max of the logits
        along the last axis is compared with ``labels``.
    """
    logits, labels = eval_pred
    # The position of the largest logit on the last axis is the predicted class id.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

In [15]:
# Hyperparameters for the fine-tuning run; evaluation and checkpointing are
# both configured with the "epoch" strategy.
training_args = TrainingArguments(
    output_dir="bert_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-5,
)


In [16]:
# Assemble the Trainer: fine-tune `model` on the 25,000-example training subset
# and evaluate on the full tokenized test split, reporting `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=dataset_test_tokenized,
    compute_metrics=compute_metrics,
)

In [17]:
# Check whether PyTorch reports an available CUDA device before training.
torch.cuda.is_available()

True

In [18]:
# Run fine-tuning with the arguments configured above (2 epochs, with
# evaluation_strategy="epoch"); this is the long-running cell of the notebook.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.3249,0.36757,0.9132
2,0.1807,0.348561,0.9298


TrainOutput(global_step=12500, training_loss=0.2793598193359375, metrics={'train_runtime': 7366.9472, 'train_samples_per_second': 6.787, 'train_steps_per_second': 1.697, 'total_flos': 6623369932800000.0, 'train_loss': 0.2793598193359375, 'epoch': 2.0})

In [19]:
# Save the fine-tuned model to the directory the inference section loads from.
trainer.save_model("./bert_trainer/")

In [None]:
# Save the tokenizer files to a subdirectory next to the saved model.
tokenizer.save_pretrained('./bert_trainer/tokenizer/')

In [5]:
# Reload the fine-tuned classifier from the directory written by `trainer.save_model`.
model_loaded = AutoModelForSequenceClassification.from_pretrained("./bert_trainer/")

In [6]:
# Reload the tokenizer from the directory it was saved to after training, so the
# inference section uses the saved artifacts instead of fetching from the Hub.
# Requires the `tokenizer.save_pretrained('./bert_trainer/tokenizer/')` cell to have run.
tokenizer2 = AutoTokenizer.from_pretrained('./bert_trainer/tokenizer/')

In [4]:
from transformers import pipeline

In [7]:
# Build a text-classification pipeline around the reloaded model and tokenizer.
# With top_k=None the outputs shown below list a score for every label.
sentiment_model = pipeline("text-classification", model=model_loaded, tokenizer=tokenizer2, top_k=None)


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [25]:
# Negative example used for a quick sanity check of the pipeline.
text1 = "Very BAD movie!"


In [26]:
# Score the example; LABEL_0 corresponds to label id 0 (negative) and LABEL_1 to id 1 (positive).
sentiment_model(text1)

[[{'label': 'LABEL_0', 'score': 0.999428927898407},
  {'label': 'LABEL_1', 'score': 0.0005710835102945566}]]

In [28]:
# Positive example: LABEL_1 should receive the higher score.
text1 = "Very Good movie!"
sentiment_model(text1)

[[{'label': 'LABEL_0', 'score': 0.00046284840209409595},
  {'label': 'LABEL_1', 'score': 0.9995372295379639}]]

In [33]:
# Score another negative example; the pipeline output is nested one level per
# input, so [0] selects the list of label/score dicts for this single text.
text1 = "BAD film!"
res = sentiment_model(text1)[0]

In [34]:
# Display the per-label scores for the example.
res

[{'label': 'LABEL_0', 'score': 0.9995126724243164},
 {'label': 'LABEL_1', 'score': 0.00048729247646406293}]

In [35]:
# Index the scores by label name. A missing label now raises a KeyError here
# instead of leaving `pos`/`neg` undefined or holding a stale value from an
# earlier cell execution.
scores_by_label = {entry['label']: entry['score'] for entry in res}
pos = scores_by_label['LABEL_1']  # label id 1 was assigned to "positive" reviews
neg = scores_by_label['LABEL_0']  # label id 0 was assigned to "negative" reviews

In [36]:
# Score assigned to LABEL_0 (the negative class).
neg

0.9995126724243164

In [37]:
# Score assigned to LABEL_1 (the positive class).
pos

0.00048729247646406293