In [None]:
!pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset
# Load the dataset named "imdb"; later cells index it by "train" and "test".
imdb = load_dataset("imdb")

In [None]:
# Peek at the first record of the test split.
imdb["test"][0]

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the "distilbert-base-uncased" checkpoint (the same checkpoint
# name is used when the classification model is created).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for that input.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)



In [None]:

# Apply preprocess_function across the dataset; batched=True hands it batches
# of examples instead of one example at a time.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [None]:
# Display the tokenized training split.
tokenized_imdb["train"]

In [None]:
# Display the tokenized test split.
tokenized_imdb[ "test"]


In [None]:
from datasets import Dataset, concatenate_datasets

In [None]:
# Split the tokenized training data by label value: rows with label 1 go to
# positive_samples, rows with label 0 to negative_samples.
positive_samples = tokenized_imdb["train"].filter(
    lambda example: example["label"] == 1
)
negative_samples = tokenized_imdb["train"].filter(
    lambda example: example["label"] == 0
)
negative_samples

In [None]:
# Take the first 1500 rows of each class after a seeded shuffle, so both
# classes contribute the same number of training examples.
balanced_positives = (
    positive_samples
    .shuffle(seed=42)
    .select(range(1500))
)
balanced_negatives = (
    negative_samples
    .shuffle(seed=42)
    .select(range(1500))
)
balanced_negatives

In [None]:
# Stack the two class subsets into a single training set.
# concatenate_datasets joins the datasets row-wise and keeps every column
# together with its feature type, so the columns do not have to be listed by
# hand or rebuilt from concatenated Python lists.
balanced_train_dataset = concatenate_datasets(
    [balanced_positives, balanced_negatives]
)

In [None]:
# Shuffle with a fixed seed so the two classes are interleaved; the name is
# rebound, so re-running this cell shuffles the already-shuffled dataset again.
balanced_train_dataset = balanced_train_dataset.shuffle(seed = 42)
balanced_train_dataset

In [None]:
# Same per-label split as for the training data, applied to the test split.
positive_test_samples = tokenized_imdb["test"].filter(
    lambda example: example["label"] == 1
)
negative_test_samples = tokenized_imdb["test"].filter(
    lambda example: example["label"] == 0
)

In [None]:
# Take the first 1000 rows of each class after a seeded shuffle.
balanced_positives_test = (
    positive_test_samples
    .shuffle(seed=42)
    .select(range(1000))
)
balanced_negatives_test = (
    negative_test_samples
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:
# Stack the two class subsets into a single evaluation set.
# concatenate_datasets joins the datasets row-wise and keeps every column
# together with its feature type, so the columns do not have to be listed by
# hand or rebuilt from concatenated Python lists.
balanced_test_dataset = concatenate_datasets(
    [balanced_positives_test, balanced_negatives_test]
)





In [None]:
# Shuffle with a fixed seed so the two classes are interleaved; the name is
# rebound, so re-running this cell shuffles the already-shuffled dataset again.
balanced_test_dataset = balanced_test_dataset.shuffle(seed=42)

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built from the tokenizer; it is handed to the Trainer as
# data_collator.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [None]:
!pip install evaluate

In [None]:
import evaluate

In [None]:

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into a pair ``(scores, references)``. The
    predicted class for each row is the index of the largest score along
    axis 1; those indices are compared against ``references``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)




In [None]:
# Class id -> display name, plus the inverse lookup derived from it so the two
# mappings cannot drift apart.
id2label = {
    0: "NEGATIVE",
    1: "POSITIVE",
}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments,Trainer


In [None]:
# Sequence-classification model on top of the "distilbert-base-uncased"
# checkpoint, configured for two labels with the id/name mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
import os
from transformers import TrainingArguments, Trainer

In [None]:
# Set before the Trainer is created; the variable name indicates it switches
# off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints are written, and the run's display name.
    output_dir="my_awesome_model",
    run_name="my_training_run",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # No uploads and no external experiment tracking.
    push_to_hub=False,
    report_to="none",
)





In [None]:
# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=balanced_train_dataset,
    eval_dataset=balanced_test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): newer transformers releases appear to deprecate
    # `tokenizer=` in favour of `processing_class=` — confirm against the
    # installed version before changing.
    tokenizer=tokenizer,
)





In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Sample review used to smoke-test the fine-tuned classifier.
text = (
    "This was a masterpiece. "
    "Not completely faithful to the books, but enthralling from beginning to end. "
    "Might be my favorite of the three."
)


In [None]:
from transformers import pipeline

In [None]:
# Load the checkpoint the Trainer recorded as best instead of a hard-coded
# "/content/.../checkpoint-376" path: that step number and directory depend on
# the working directory, dataset size, batch size and device count, and the
# last checkpoint is not necessarily the one load_best_model_at_end selected.
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint is None:
    # Passing model=None would make pipeline() fall back to a default
    # checkpoint instead of the model trained here, so fail loudly.
    raise RuntimeError(
        "No best checkpoint recorded on the trainer; run trainer.train() first."
    )
classifier = pipeline("sentiment-analysis", model=best_checkpoint)
classifier(text)

In [None]:
# Second smoke test: rebind `text` to a short review and classify it.
text = "The movie was too bad"
classifier(text)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import seaborn as sns
# Get model predictions on the balanced test set
predictions = trainer.predict(balanced_test_dataset)

# True labels and raw per-class scores, one row per example
y_true = np.array(predictions.label_ids)
logits = np.asarray(predictions.predictions)

# The model outputs are unnormalised scores (logits), not probabilities.
# Convert them with a softmax so y_prob really is P(class 1); the raw class-1
# logit alone ignores the class-0 score and ranks examples differently, which
# distorts any ROC/AUC computed from it. Subtracting the row maximum first
# keeps np.exp from overflowing.
shifted = logits - logits.max(axis=1, keepdims=True)
exp_scores = np.exp(shifted)
y_prob = exp_scores[:, 1] / exp_scores.sum(axis=1)

# Hard class predictions: index of the largest score in each row
y_pred = np.argmax(logits, axis=1)

# Compute confusion matrix (rows = true label, columns = predicted label)
conf_matrix = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Negative", "Positive"],
    yticklabels=["Negative", "Positive"],
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Compute ROC curve and AUC from the true labels and class-1 scores
fpr, tpr, _ = roc_curve(y_true, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(7, 6))
# ".3f" prints the AUC with three decimals; the previous spec " 3f" meant
# "space sign, width 3, default six decimals".
plt.plot(fpr, tpr, color="blue", lw=2, label=f"ROC curve (AUC = {roc_auc:.3f}) ")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Per-class summary of the test-set predictions, labelled with readable names.
class_names = ["Negative", "Positive"]
report = classification_report(y_true, y_pred, target_names=class_names)
print("Classification Report: \n")
print(report)

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/SrinidhiMaringanti/Text_Classification_With_BERT.git

In [None]:
!mv