In [11]:
from google.colab import userdata
key = userdata.get('hugging-Face')

In [4]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import(
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [20]:
model_checkpoint = 'distilbert-base-uncased'

# Defining labels

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative" : 0, "Positive" : 1}

# Generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Loading Dataset
dataset = load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [22]:
import pandas as pd
pd.DataFrame.from_dict(dataset)[:5]

Unnamed: 0,train,validation
0,"{'label': 1, 'text': '. . . or type on a compu...","{'label': 1, 'text': 'Disgused as an Asian Hor..."
1,"{'label': 1, 'text': 'During 1933 this film ha...","{'label': 1, 'text': 'I am from Texas and my f..."
2,"{'label': 0, 'text': 'Let me be clear. I've us...","{'label': 0, 'text': 'Robert Altman's ""Quintet..."
3,"{'label': 1, 'text': 'Carlos Mencia was excell...","{'label': 1, 'text': '** HERE BE SPOILERS ** <..."
4,"{'label': 1, 'text': 'I was initially dubious ...","{'label': 1, 'text': 'I first saw this movie i..."


In [24]:
# Preprocessing the data
# Creating
# Converting the data into numerical encodings for the nueral network
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

    # Create tokenize function
def tokenize_function(examples):
  # Extract Text
  text = examples['text']
  # Tokenize and truncate text
  tokenizer.truncation_side = 'left'
  tokenized_inputs = tokenizer(
      text,
      return_tensors="np",
      truncation=True,
      max_length=512
    )
  return tokenized_inputs

# Add pad token if none exsists
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

# Tokenize traning and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [25]:
# Creae a data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
# Evaluation Metrics
# Importing accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# Defining an evaluation function to pass into trainer
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)

  return {'accuracy': accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [28]:
# Defining a list of parameters:
text_list = [
    'It was a great movie',
    'It was not a great movie',
    'Mediocre at best',
    'Terrible',
    'Out of this world!'
]

In [31]:
print("Pretrained Model")
print("==================")
for text in text_list:
  # convert to numerical encodings
  inputs = tokenizer.encode(text, return_tensors="pt")
  # Compute logits
  logits = model(inputs).logits
  # Convert logits to labels
  predictions = torch.argmax(logits)

  print(text + " : " + id2label[predictions.tolist()])

Pretrained Model
It was a great movie : Positive
It was not a great movie : Positive
Mediocre at best : Positive
Terrible : Positive
Out of this world! : Positive


In [32]:
# Training the model
peft_config = LoraConfig(
    task_type="SEQ_CLS", # Sequence classificaiton
    r=4, # Intrinsic rank of trainable weight matrix
    lora_alpha=32, # this is like a learning rate
    lora_dropout=0.02, # probablity of dropout
    target_modules=["q_lin"] # we apply lora to query layer
)

In [33]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [35]:
# hyperparameters
lr = .001
batch_size = 4
num_epochs = 10

# Define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)



In [39]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.390263,{'accuracy': 0.875}
2,0.434600,0.399643,{'accuracy': 0.881}
3,0.434600,0.628509,{'accuracy': 0.882}
4,0.204700,0.698072,{'accuracy': 0.879}
5,0.204700,0.785604,{'accuracy': 0.871}
6,0.075000,0.751284,{'accuracy': 0.889}
7,0.075000,0.807574,{'accuracy': 0.889}
8,0.037100,0.871158,{'accuracy': 0.886}
9,0.037100,0.915674,{'accuracy': 0.882}
10,0.005500,0.924302,{'accuracy': 0.889}


Trainer is attempting to log a value of "{'accuracy': 0.875}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.881}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.882}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.879}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.871}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.15138279113769532, metrics={'train_runtime': 472.4855, 'train_samples_per_second': 21.165, 'train_steps_per_second': 5.291, 'total_flos': 1112883852759936.0, 'train_loss': 0.15138279113769532, 'epoch': 10.0})

In [46]:
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to('cuda') # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits.to('cuda')
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was a great movie - Positive
It was not a great movie - Positive
Mediocre at best - Negative
Terrible - Negative
Out of this world! - Positive
