In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
dataset = load_dataset("imdb")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation= True,
        padding = "max_length",
        max_length=512
    )

tokenized_dataset = dataset.map(tokenize_function,batched = True)

tokenized_dataset.set_format("torch", columns = ["input_ids","attention_mask","label"])

tokenized_train_data = tokenized_dataset["train"]
tokenized_test_data = tokenized_dataset["test"]

Map: 100%|██████████| 25000/25000 [00:04<00:00, 5602.77 examples/s]


In [6]:
from transformers import Trainer, TrainingArguments

In [7]:
training_args = TrainingArguments(
    output_dir = "./finetuned",
    evaluation_strategy = "steps",
    eval_steps = 500,
    num_train_epochs=3,
    learning_rate= 2e-5,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    weight_decay=0.01
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset= tokenized_train_data,
    eval_dataset= tokenized_test_data,
    tokenizer = tokenizer
)
trainer.train()

  trainer = Trainer(
 11%|█         | 500/4689 [03:44<31:39,  2.21it/s]

{'loss': 0.3197, 'grad_norm': 11.426033973693848, 'learning_rate': 1.7867349114949884e-05, 'epoch': 0.32}


                                                  
 11%|█         | 500/4689 [07:24<31:39,  2.21it/s] 

{'eval_loss': 0.24071145057678223, 'eval_runtime': 219.8311, 'eval_samples_per_second': 113.724, 'eval_steps_per_second': 7.11, 'epoch': 0.32}


 21%|██▏       | 1000/4689 [11:12<27:51,  2.21it/s]  

{'loss': 0.2526, 'grad_norm': 9.642156600952148, 'learning_rate': 1.5734698229899766e-05, 'epoch': 0.64}


                                                   
 21%|██▏       | 1000/4689 [14:53<27:51,  2.21it/s]

{'eval_loss': 0.20636381208896637, 'eval_runtime': 220.3785, 'eval_samples_per_second': 113.441, 'eval_steps_per_second': 7.092, 'epoch': 0.64}


 32%|███▏      | 1500/4689 [18:39<23:58,  2.22it/s]   

{'loss': 0.2228, 'grad_norm': 13.268056869506836, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.96}


                                                   
 32%|███▏      | 1500/4689 [22:20<23:58,  2.22it/s]

{'eval_loss': 0.20226521790027618, 'eval_runtime': 220.4291, 'eval_samples_per_second': 113.415, 'eval_steps_per_second': 7.091, 'epoch': 0.96}


 43%|████▎     | 2000/4689 [26:06<20:16,  2.21it/s]   

{'loss': 0.1651, 'grad_norm': 7.693418025970459, 'learning_rate': 1.1469396459799531e-05, 'epoch': 1.28}


                                                   
 43%|████▎     | 2000/4689 [29:45<20:16,  2.21it/s]

{'eval_loss': 0.25927668809890747, 'eval_runtime': 219.2337, 'eval_samples_per_second': 114.034, 'eval_steps_per_second': 7.129, 'epoch': 1.28}


 53%|█████▎    | 2500/4689 [33:29<16:15,  2.24it/s]   

{'loss': 0.1466, 'grad_norm': 21.2023983001709, 'learning_rate': 9.336745574749414e-06, 'epoch': 1.6}


                                                   
 53%|█████▎    | 2500/4689 [37:28<16:15,  2.24it/s]

{'eval_loss': 0.2298092544078827, 'eval_runtime': 238.3126, 'eval_samples_per_second': 104.904, 'eval_steps_per_second': 6.559, 'epoch': 1.6}


 64%|██████▍   | 3000/4689 [41:12<12:32,  2.24it/s]   

{'loss': 0.1492, 'grad_norm': 15.564982414245605, 'learning_rate': 7.204094689699297e-06, 'epoch': 1.92}


                                                   
 64%|██████▍   | 3000/4689 [44:49<12:32,  2.24it/s]

{'eval_loss': 0.24223186075687408, 'eval_runtime': 217.485, 'eval_samples_per_second': 114.95, 'eval_steps_per_second': 7.187, 'epoch': 1.92}


 75%|███████▍  | 3500/4689 [48:33<08:49,  2.25it/s]   

{'loss': 0.0995, 'grad_norm': 25.693227767944336, 'learning_rate': 5.07144380464918e-06, 'epoch': 2.24}


                                                   
 75%|███████▍  | 3500/4689 [53:14<08:49,  2.25it/s]

{'eval_loss': 0.2978869080543518, 'eval_runtime': 281.1337, 'eval_samples_per_second': 88.926, 'eval_steps_per_second': 5.56, 'epoch': 2.24}


 85%|████████▌ | 4000/4689 [56:59<05:07,  2.24it/s]   

{'loss': 0.086, 'grad_norm': 0.04625948891043663, 'learning_rate': 2.9387929195990615e-06, 'epoch': 2.56}


                                                   
 85%|████████▌ | 4000/4689 [1:00:36<05:07,  2.24it/s]

{'eval_loss': 0.28855082392692566, 'eval_runtime': 217.0359, 'eval_samples_per_second': 115.188, 'eval_steps_per_second': 7.202, 'epoch': 2.56}


 96%|█████████▌| 4500/4689 [1:04:20<01:23,  2.25it/s]   

{'loss': 0.0947, 'grad_norm': 0.06217218190431595, 'learning_rate': 8.061420345489445e-07, 'epoch': 2.88}


                                                     
 96%|█████████▌| 4500/4689 [1:07:57<01:23,  2.25it/s]

{'eval_loss': 0.28094425797462463, 'eval_runtime': 217.2379, 'eval_samples_per_second': 115.081, 'eval_steps_per_second': 7.195, 'epoch': 2.88}


100%|██████████| 4689/4689 [1:09:24<00:00,  1.13it/s]  

{'train_runtime': 4164.8377, 'train_samples_per_second': 18.008, 'train_steps_per_second': 1.126, 'train_loss': 0.16769915690781034, 'epoch': 3.0}





TrainOutput(global_step=4689, training_loss=0.16769915690781034, metrics={'train_runtime': 4164.8377, 'train_samples_per_second': 18.008, 'train_steps_per_second': 1.126, 'total_flos': 9935054899200000.0, 'train_loss': 0.16769915690781034, 'epoch': 3.0})

In [14]:
# Test input
input_text = [
    "The ending of the show was absolutely terrible, it pretty much ruined the entire show. It was great until episode 7, but when they made Sang-woo betray Ali, and the main character ruin his friendship with Il-nam it quickly became one of the most horribly directed episodes I have ever watched. Plus, they purposefully showed us the good side or the soft side of some main characters just to kill them off in the most meaningless and uneventful way possible? And the whole secret detective thing on the side was such bullsh1t. At first, you think, damn, he may be onto something what a great guy, then, in the end, he ends up shaking hands with the bad guys, and now suddenly all his good intentions and aim to expose these VIP's and the Game/Front Man (which he risked his life for) are gone??? If you enjoy characters dying then coming back to life, or the winner of the game continuing to spend his life like a homeless bum loser even after winning 45 billion won, this show is for you. In case you happen to want to save your time and energy for a better show that actually makes sense, then I suggest you don't watch this show. It's definitely NOT worth the hype.",
    "So I first started watching this awesome season through my brother suggesting it too me! And what a beautiful suggestion it was. My oh my goodness this season excited me and to be brutally honest first episode I didn’t have much clue to what it was about, I hadn’t even watched a trailer for it and to be honest I don’t really like the name of the show Squid Game. But the a old saying in life don’t judge a book by its cover and I guess that old saying rings true here.",
    "No wonder this series took so long to create and it shows because there's literally not one flaw. It's perfect in every sense and has every major ingredient that makes it one of the best shows of this decade hands down! The acting; absolutely incredible. The leads acting is stunning and beautiful even without him saying anything his facial expressions speaks volumes in VERY dramatic scenes (which scenes aren't?)",
]

# Tokenize the new data
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

device = model.device
inputs = {key: val.to(device) for key, val in inputs.items()}

# Pass the tokenized inputs through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()
label_map = {0: "NEGATIVE", 1: "POSITIVE"}

# Display predictions with confidence scores
import torch.nn.functional as F
probabilities = F.softmax(outputs.logits, dim=-1)

for i, predicted_label in enumerate(predicted_labels):
    churn_label = label_map[predicted_label]
    confidence = probabilities[i][predicted_label].item()
    print(f"\nInput Text {i + 1}: {input_text[i]}")
    print(f"Predicted Label: {churn_label} (Confidence: {confidence:.2f})")


Input Text 1: The ending of the show was absolutely terrible, it pretty much ruined the entire show. It was great until episode 7, but when they made Sang-woo betray Ali, and the main character ruin his friendship with Il-nam it quickly became one of the most horribly directed episodes I have ever watched. Plus, they purposefully showed us the good side or the soft side of some main characters just to kill them off in the most meaningless and uneventful way possible? And the whole secret detective thing on the side was such bullsh1t. At first, you think, damn, he may be onto something what a great guy, then, in the end, he ends up shaking hands with the bad guys, and now suddenly all his good intentions and aim to expose these VIP's and the Game/Front Man (which he risked his life for) are gone??? If you enjoy characters dying then coming back to life, or the winner of the game continuing to spend his life like a homeless bum loser even after winning 45 billion won, this show is for y

In [15]:
import evaluate

In [19]:
metric = evaluate.load("accuracy")

def compute_metrics(batch):
    inputs = {key: torch.tensor(val).to(model.device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    return predictions.cpu().numpy()

tokenized_test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

all_predictions = []
all_labels = []

for batch in torch.utils.data.DataLoader(tokenized_test_data, batch_size=32):
    predictions = compute_metrics(batch)
    all_predictions.extend(predictions)
    all_labels.extend(batch["label"].numpy())

# Compute accuracy
results = metric.compute(predictions=all_predictions, references=all_labels)
print(f"Accuracy: {results['accuracy']:.4f}")


  inputs = {key: torch.tensor(val).to(model.device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}


Accuracy: 0.9286
