In [None]:
import numpy as np

import torch
import datasets
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
from datasets import load_metric
from huggingface_hub import notebook_login
from transformers import TrainingArguments, Trainer, pipeline
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, recall_score, precision_score, \
          accuracy_score, fbeta_score, f1_score, \
          roc_auc_score, average_precision_score, \
          log_loss, PrecisionRecallDisplay, RocCurveDisplay

from scikitplot.metrics import plot_roc
import seaborn as sns

from tqdm import tqdm

import joblib

True

# Preprocess data

In [None]:
imdb = datasets.load_dataset("imdb")

# Create a smaller training dataset for faster training times (But finally get the whole data to get accurate results
small_train_dataset = imdb["train"].shuffle(seed=42)
small_test_dataset = imdb["test"].shuffle(seed=42)
print(small_train_dataset[0])
print(small_test_dataset[0])

{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, 

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [None]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Baseline

In [None]:
imdb['train'].info

DatasetInfo(description='Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.', citation='@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n', homepage='http://ai.stanford.edu/~amaas/data/sen

In [None]:
imdb['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
train_df = pd.DataFrame(imdb['train'])
train_df

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [None]:
test_df = pd.DataFrame(imdb['test'])
test_df

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0
...,...,...
24995,Just got around to seeing Monster Man yesterda...,1
24996,I got this as part of a competition prize. I w...,1
24997,I got Monster Man in a box set of three films ...,1
24998,"Five minutes in, i started to feel how naff th...",1


In [None]:
vectorizer = CountVectorizer(ngram_range=(1,2))

In [None]:
vectorized = vectorizer.fit_transform(train_df['text'])
vectorized

<25000x1513832 sparse matrix of type '<class 'numpy.int64'>'
	with 8765469 stored elements in Compressed Sparse Row format>

In [None]:
LOAD_SAVED = True

if not LOAD_SAVED:
  clf = LogisticRegression(
      random_state=0,
      solver="saga",
      penalty="elasticnet",
      l1_ratio=0.2
  )
  clf.fit(vectorized, train_df['label'])

  joblib.dump(clf, "./logistic_reg_model.pkl")

else:
  clf = joblib.load("./logistic_reg_model.pkl")

In [None]:
vectorized = vectorizer.transform(test_df['text'])
preds = clf.predict(vectorized)

preds

array([0, 0, 0, ..., 1, 0, 1])

In [None]:
test_df['baseline_pred'] = preds

In [None]:
y_pred_class = preds
test_labels = test_df['label']

tn, fp, fn, tp = confusion_matrix(test_labels, y_pred_class).ravel()
false_positive_rate = fp / (fp + tn)
false_negative_rate = fn / (tp + fn)
true_negative_rate = tn / (tn + fp)
recall = recall_score(test_labels, y_pred_class) # or optionally tp / (tp + fn)
precision = precision_score(test_labels, y_pred_class)
accuracy = accuracy_score(test_labels, y_pred_class)
f1= f1_score(test_labels, y_pred_class)
f2 = fbeta_score(test_labels, y_pred_class, beta = 2)
roc_auc = roc_auc_score(test_labels, preds)
avg_precision = average_precision_score(test_labels, preds)
loss = log_loss(test_labels, preds)


print("true positive", tp)
print("true negative", tn)
print("false positive", fp)
print("false negative", fn)
print("false positive rate", false_positive_rate)
print("false negetive rate", false_negative_rate)
print("true negetive rate", true_negative_rate)
print("recall", recall)
print("precision", precision)
print("accuracy", accuracy)
print("f1", f1)
print("f2", f2)
print("roc_auc", roc_auc)
print("avg_precision", avg_precision)
print("loss", loss)

true positive 11088
true negative 10895
false positive 1605
false negative 1412
false positive rate 0.1284
false negetive rate 0.11296
true negetive rate 0.8716
recall 0.88704
precision 0.8735523516899079
accuracy 0.87932
f1 0.8802445123645458
f2 0.8843092530266538
roc_auc 0.8793200000000001
avg_precision 0.8313558780430159
loss 4.16819086949284


In [None]:
tn, fp, fn, tp

(10895, 1605, 1412, 11088)

# 3. Training the model

In [None]:
# Define DistilBERT as our base model:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

In [None]:
# Define the evaluation metrics 
def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Log in to your Hugging Face account 
# Get your API token here https://huggingface.co/settings/token

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# Define a new Trainer with all the objects we constructed so far

repo_name = "finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/praptishadmaan/finetuning-sentiment-model-3000-samples into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.47k/255M [00:00<?, ?B/s]

Download file runs/Apr22_15-22-25_87e5a6c2753f/1650640962.9494507/events.out.tfevents.1650640962.87e5a6c2753f.…

Download file runs/Apr24_16-01-04_2687154e0c27/1650816181.247773/events.out.tfevents.1650816181.2687154e0c27.7…

Download file runs/Apr22_15-22-25_87e5a6c2753f/events.out.tfevents.1650646290.87e5a6c2753f.72.2: 100%|########…

Download file runs/Apr22_15-22-25_87e5a6c2753f/events.out.tfevents.1650640962.87e5a6c2753f.72.0:  78%|#######7…

Download file runs/Apr24_16-01-04_2687154e0c27/events.out.tfevents.1650816181.2687154e0c27.73.0:  78%|#######7…

Clean file runs/Apr22_15-22-25_87e5a6c2753f/1650640962.9494507/events.out.tfevents.1650640962.87e5a6c2753f.72.…

Clean file runs/Apr22_15-22-25_87e5a6c2753f/events.out.tfevents.1650646290.87e5a6c2753f.72.2: 100%|##########|…

Download file runs/Apr24_16-01-04_2687154e0c27/events.out.tfevents.1650819153.2687154e0c27.73.2: 100%|########…

Clean file runs/Apr24_16-01-04_2687154e0c27/1650816181.247773/events.out.tfevents.1650816181.2687154e0c27.73.1…

Download file training_args.bin: 100%|##########| 3.05k/3.05k [00:00<?, ?B/s]

Clean file runs/Apr24_16-01-04_2687154e0c27/events.out.tfevents.1650816181.2687154e0c27.73.0:  22%|##2       |…

Clean file runs/Apr22_15-22-25_87e5a6c2753f/events.out.tfevents.1650640962.87e5a6c2753f.72.0:  22%|##2       |…

Clean file runs/Apr24_16-01-04_2687154e0c27/events.out.tfevents.1650819153.2687154e0c27.73.2: 100%|##########|…

Clean file training_args.bin:  33%|###2      | 1.00k/3.05k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [None]:
# Train the model
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3126


Step,Training Loss
500,0.3276
1000,0.2441
1500,0.2206
2000,0.1506
2500,0.142
3000,0.147


Saving model checkpoint to finetuning-sentiment-model-3000-samples/checkpoint-1563
Configuration saved in finetuning-sentiment-model-3000-samples/checkpoint-1563/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/checkpoint-1563/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-3000-samples/checkpoint-1563/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/checkpoint-1563/special_tokens_map.json
tokenizer config file saved in finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/special_tokens_map.json
Saving model checkpoint to finetuning-sentiment-model-3000-samples/checkpoint-3126
Configuration saved in finetuning-sentiment-model-3000-samples/checkpoint-3126/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/checkpoint-3126/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-mo

TrainOutput(global_step=3126, training_loss=0.20298403863790931, metrics={'train_runtime': 2500.6238, 'train_samples_per_second': 19.995, 'train_steps_per_second': 1.25, 'total_flos': 6559860094356864.0, 'train_loss': 0.20298403863790931, 'epoch': 2.0})

In [None]:
# Compute the evaluation metrics
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

{'epoch': 2.0,
 'eval_accuracy': 0.93192,
 'eval_f1': 0.9323583180987203,
 'eval_loss': 0.23451189696788788,
 'eval_runtime': 471.6702,
 'eval_samples_per_second': 53.003,
 'eval_steps_per_second': 3.314}

# 4. Analyzing new data with the model

In [None]:
# Upload the model to the Hub
trainer.push_to_hub()

Saving model checkpoint to finetuning-sentiment-model-3000-samples
Configuration saved in finetuning-sentiment-model-3000-samples/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/255M [00:00<?, ?B/s]

Upload file runs/Apr24_16-01-04_2687154e0c27/events.out.tfevents.1650819153.2687154e0c27.73.2: 100%|##########…

Upload file runs/Apr24_16-01-04_2687154e0c27/events.out.tfevents.1650816181.2687154e0c27.73.0:  75%|#######4  …

To https://huggingface.co/praptishadmaan/finetuning-sentiment-model-3000-samples
   d7e0faa..e7cce29  main -> main

To https://huggingface.co/praptishadmaan/finetuning-sentiment-model-3000-samples
   e7cce29..07c0b81  main -> main



'https://huggingface.co/praptishadmaan/finetuning-sentiment-model-3000-samples/commit/e7cce29d763d67410457a3bb40dc92f1e80b221f'

In [None]:
sentiment_model = pipeline(model="praptishadmaan/finetuning-sentiment-model-3000-samples")
# Dry run on 2 sentences
sentiment_model(["I love this movie", "This movie sucks!"])

Downloading:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

[{'label': 'LABEL_1', 'score': 0.9963579773902893},
 {'label': 'LABEL_0', 'score': 0.994658887386322}]

In [None]:
def convert_to_val(d):
  if d['label'] == 'LABEL_0':
    return 0.5 + d['score']/2
  elif d['label'] == 'LABEL_1':
    return 0.5 - d['score']/2


def data():
    for item in test_df['text'].tolist():
        yield item

pred = []
for out in tqdm(sentiment_model(data(), truncation=True), total=len(test_df['text'])):
  pred.append(convert_to_val(out))

pred = np.array(pred)

pred

100%|██████████| 25000/25000 [3:03:11<00:00,  2.27it/s]


array([0.99861482, 0.99419072, 0.9975678 , ..., 0.04292768, 0.00970873,
       0.00347394])

In [None]:
y_pred_class = pred < 0.5
test_labels = test_df['label']

tn, fp, fn, tp = confusion_matrix(test_labels, y_pred_class).ravel()
false_positive_rate = fp / (fp + tn)
false_negative_rate = fn / (tp + fn)
true_negative_rate = tn / (tn + fp)
recall = recall_score(test_labels, y_pred_class) # or optionally tp / (tp + fn)
precision = precision_score(test_labels, y_pred_class)
accuracy = accuracy_score(test_labels, y_pred_class)
f1= f1_score(test_labels, y_pred_class)
f2 = fbeta_score(test_labels, y_pred_class, beta = 2)
loss = log_loss(test_labels, pred, labels=[1, 0])


print("false positive rate", false_positive_rate)
print("false negetive rate", false_negative_rate)
print("true negetive rate", true_negative_rate)
print("recall", recall)
print("precision", precision)
print("accuracy", accuracy)
print("f1", f1)
print("f2", f2)
print("loss", loss)

false positive rate 0.07456
false negetive rate 0.0616
true negetive rate 0.92544
recall 0.9384
precision 0.926393934607487
accuracy 0.93192
f1 0.9323583180987203
f2 0.9359739555073251
loss 5.527715519242778


In [None]:
test_df['fancy_pred'] = 1 - pred

In [None]:
test_df

Unnamed: 0,text,label,baseline_pred,fancy_pred
0,I love sci-fi and am willing to put up with a ...,0,0,0.998615
1,"Worth the entertainment value of a rental, esp...",0,0,0.994191
2,its a totally average film with a few semi-alr...,0,0,0.997568
3,STAR RATING: ***** Saturday Night **** Friday ...,0,0,0.998674
4,"First off let me say, If you haven't enjoyed a...",0,1,0.003929
...,...,...,...,...
24995,Just got around to seeing Monster Man yesterda...,1,1,0.004169
24996,I got this as part of a competition prize. I w...,1,1,0.053084
24997,I got Monster Man in a box set of three films ...,1,1,0.042928
24998,"Five minutes in, i started to feel how naff th...",1,0,0.009709


In [None]:
test_df.to_csv("./res.csv")

In [None]:
tn

11568

In [None]:
fp

932

In [None]:
fn

770

In [None]:
tp

11730