In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Checkpoint directory that holds the saved model and tokenizer
model_dir = "../artifacts/model_trainer/checkpoint-8439"

# Pick the GPU when torch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Restore the tokenizer and the sequence-classification model from the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Move the model to the selected device; as the last expression its repr is the cell output
model.to(device)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
from datasets import load_from_disk


# Dataset directory to read for evaluation
test_split_path = "../artifacts/data_transformation/Dataset/test"

# Load the dataset from disk and show its summary as the cell output
test = load_from_disk(test_split_path)
test

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2500
})

In [11]:
import numpy as np
from sklearn.metrics import classification_report

In [17]:
from transformers import pipeline

# Build the inference pipeline from the model and tokenizer that are already loaded,
# so the checkpoint is not read from disk a second time, and use the `device` chosen
# when the model was loaded so the cell also runs on machines without a GPU.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

# Evaluate on test dataset
test_texts = list(test["text"])    # input strings fed to the classifier
test_labels = list(test["label"])  # ground-truth labels compared against the predictions

# Translate predicted label names back to class ids through the model config
# instead of parsing the "LABEL_<n>" naming pattern by hand.
label2id = classifier.model.config.label2id

# Hand the whole list to the pipeline and let it batch the forward passes,
# rather than invoking it once per text from a Python loop.
INFERENCE_BATCH_SIZE = 32  # tune to the available device memory
predictions = classifier(test_texts, truncation=True, batch_size=INFERENCE_BATCH_SIZE)
predicted_labels = [int(label2id[prediction["label"]]) for prediction in predictions]

# Generate classification report (`classification_report` is imported in an earlier cell)
print(classification_report(test_labels, predicted_labels, target_names=["Negative", "Positive"]))

Device set to use cuda:0


              precision    recall  f1-score   support

    Negative       0.93      0.95      0.94      1231
    Positive       0.95      0.93      0.94      1269

    accuracy                           0.94      2500
   macro avg       0.94      0.94      0.94      2500
weighted avg       0.94      0.94      0.94      2500



In [21]:
# Upload the model to the Hugging Face Hub repository; the returned CommitInfo is the cell output
model.push_to_hub(repo_id="tanmay2809/distilBERT_imdb")

model.safetensors: 100%|██████████| 268M/268M [00:10<00:00, 26.0MB/s] 


CommitInfo(commit_url='https://huggingface.co/tanmay2809/distilBERT_imdb/commit/d471b6053f4e3c2508834267d3d551fddd9a3783', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='d471b6053f4e3c2508834267d3d551fddd9a3783', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tanmay2809/distilBERT_imdb', endpoint='https://huggingface.co', repo_type='model', repo_id='tanmay2809/distilBERT_imdb'), pr_revision=None, pr_num=None)

In [22]:
# Upload the tokenizer files to the same Hub repository; the returned CommitInfo is the cell output
tokenizer.push_to_hub(repo_id="tanmay2809/distilBERT_imdb")

CommitInfo(commit_url='https://huggingface.co/tanmay2809/distilBERT_imdb/commit/312e2f0c1aebc4ef50f5c96082fdd080e2b8984c', commit_message='Upload tokenizer', commit_description='', oid='312e2f0c1aebc4ef50f5c96082fdd080e2b8984c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tanmay2809/distilBERT_imdb', endpoint='https://huggingface.co', repo_type='model', repo_id='tanmay2809/distilBERT_imdb'), pr_revision=None, pr_num=None)