In [1]:
#dependencies
#!pip install -q transformers
#!pip install pandas
#!pip install sklearn
#!pip install torch
#!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
#!pip install datasets
#!pip install evaluate

## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string
import torch
from datasets import load_dataset
import evaluate
from evaluate import evaluator

torch.cuda.is_available()

True

### Login Notebook

In [3]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to C:\Users\Admin/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


## Load & Preprocess

In [4]:
#load the dataset and process it for training
datadf = pd.read_csv("..\\..\\scraper\\data\\yelp_review_after_subjectivity_classification.csv")


datadf.drop('content', axis=1, inplace=True)
datadf.drop('predicted_subjectivity', axis=1, inplace=True)
datadf.drop('Tokenized', axis=1, inplace=True)
datadf.rename(columns={"content_clean": "text"}, inplace=True)
datadf = datadf[['text', 'label']]

datadf.to_csv("yelp_review_processed_2L.csv", index=False)

### Test using pre-trained model

In [5]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [7]:
loadDataset = load_dataset('csv', data_files="yelp_review_processed_2L.csv")
dataEval = loadDataset['train']

task_evaluator = evaluator("text-classification")

evalResults = task_evaluator.compute(
    model_or_pipeline=model,
    tokenizer=tokenizer,
    data=dataEval,
    metric=evaluate.combine(["accuracy", "precision", "recall", "f1"]),
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
)

Using custom data configuration default-fb394bbe0e22a1f9


Downloading and preparing dataset csv/default to C:/Users/Admin/.cache/huggingface/datasets/csv/default-fb394bbe0e22a1f9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to C:/Users/Admin/.cache/huggingface/datasets/csv/default-fb394bbe0e22a1f9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
print(evalResults)

{'accuracy': 0.8796913717037007, 'precision': 0.9460922787193974, 'recall': 0.8063157049995987, 'f1': 0.8706295221177591, 'total_time_in_seconds': 212.6748084, 'samples_per_second': 233.40329009084465, 'latency_in_seconds': 0.004284429750800781}


## Fine Tune model with Trainer API

In [9]:
# from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
# traintest = load_dataset('csv', data_files="yelp_review_processed.csv")
# # split into train and test, 80% - 20%
# train_test_split = traintest["train"].train_test_split(train_size=0.8)

In [10]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# def tokenizeFunction(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

In [11]:
# #tokenize the datasets
# tok_train = train_test_split["train"].map(tokenizeFunction, batched=True)
# tok_test = train_test_split["test"].map(tokenizeFunction, batched=True)

In [12]:
# #train using pytorch trainer api, with 2 labels, Positive & Negative
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 3)

In [13]:
# #training arguments
# batch_size = 16
# training_args = TrainingArguments(
#     output_dir = "sentiment-fine-tuned-yelp-3L",
#     overwrite_output_dir = True,
#     evaluation_strategy = "epoch",
#     learning_rate = 2e-5,
#     weight_decay = 0.01,
#     per_device_train_batch_size = batch_size,
#     per_device_eval_batch_size = batch_size,
#     logging_steps = len(tok_train) // batch_size,
#     push_to_hub = True,
# )

# #compute metrics function
# metric = evaluate.load("accuracy")
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# #trainer
# trainer = Trainer(
#    model = model,
#    tokenizer = tokenizer,
#    args = training_args,
#    train_dataset = tok_train,
#    eval_dataset = tok_test,
#    compute_metrics = compute_metrics,
# )

In [14]:
# #perform the fine tuning with trainer
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
# trainer.train()

In [15]:
# trainer.evaluate()

## Fine Tune model using pytorch native method

In [16]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler, AutoTokenizer

traintest = load_dataset('csv', data_files="yelp_review_processed_2L.csv")
# split into train and test, 80% - 20%
train_test_split = traintest["train"].train_test_split(train_size=0.8)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenizeFunction(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

#tokenize the datasets
tok_train = train_test_split["train"].map(tokenizeFunction, batched=True)
tok_test = train_test_split["test"].map(tokenizeFunction, batched=True)

#prepare for torch
tok_train = tok_train.remove_columns(["text"])
tok_test = tok_test.remove_columns(["text"])
tok_train = tok_train.rename_column("label", "labels")
tok_test = tok_test.rename_column("label", "labels")

tok_train.set_format("torch")
tok_test.set_format("torch")

train_dataloader = DataLoader(tok_train, shuffle=True, batch_size=32)
eval_dataloader = DataLoader(tok_test, batch_size=32)

Using custom data configuration default-fb394bbe0e22a1f9
Found cached dataset csv (C:/Users/Admin/.cache/huggingface/datasets/csv/default-fb394bbe0e22a1f9/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [17]:
#train using pytorch, with 2 labels, Positive & Negative
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [18]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [19]:
num_epochs = 3
num_train_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name = "linear", optimizer = optimizer, num_warmup_steps = 0, num_training_steps = num_train_steps
)

In [20]:
device = torch.device("cuda")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [21]:
progress_bar = tqdm(range(num_train_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3723 [00:00<?, ?it/s]

In [22]:
metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.9367445608380338}

In [23]:
model.push_to_hub("sentiment-fine-tuned-yelp-2L")
tokenizer.push_to_hub("sentiment-fine-tuned-yelp-2L")