In [19]:
import pandas as pd

# Read the labelled feedback CSV into the `custom` DataFrame
custom = pd.read_csv(filepath_or_buffer="Finetune - give as python list (1).csv")

In [20]:
# Preview the first five rows of the loaded frame
custom.head(n=5)

Unnamed: 0,Feedback,Priority
0,"Deposit of ₹10,000 not reflecting in my accoun...",1
1,Mutual fund purchase debited from my bank acco...,1
2,Dividend payout from stock I sold missing from...,1
3,Unable to withdraw funds. Stuck in 'Processing...,1
4,Suspicious activity alert for unauthorized sto...,1


In [21]:
# Dimensions of the loaded frame as a (rows, columns) tuple
custom.shape

(147, 2)

In [23]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd

# `custom` is the DataFrame loaded with pd.read_csv in an earlier cell.
# It must provide a "Feedback" column (text) and a "Priority" column (label).

# Extract text and labels from the DataFrame as plain Python lists
texts = custom["Feedback"].tolist()
labels = custom["Priority"].tolist()

# Load the DistilBERT tokenizer.
# The full corpus is deliberately NOT tokenized here: the training and
# evaluation splits are tokenized separately after the split, so encoding
# every text up front was redundant work whose result was never used.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with their labels.

    Args:
        tokenized_texts: Mapping with "input_ids" and "attention_mask"
            entries, each indexable by sample position.
        labels: Sequence holding one label per sample.

    Indexing returns a dict with "input_ids", "attention_mask" and "labels",
    each converted to a ``torch.long`` tensor.
    """

    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the two encoder features first, then attach the target
        sample = {
            key: torch.tensor(self.tokenized_texts[key][idx], dtype=torch.long)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Split the dataset into training and evaluation sets (80% / 20%).
# random_state is fixed so the same rows land in each split on every run.
# (The previously computed train_size / eval_size values were never used
# and have been dropped; train_test_split derives the sizes from test_size.)
train_texts, eval_texts, train_labels, eval_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize the training and evaluation sets.
# padding=True pads each set to its own longest sequence.
train_tokenized_texts = tokenizer(train_texts, padding=True, truncation=True)
eval_tokenized_texts = tokenizer(eval_texts, padding=True, truncation=True)

# Wrap the encodings and labels so they can be indexed sample by sample
train_dataset = CustomDataset(train_tokenized_texts, train_labels)
eval_dataset = CustomDataset(eval_tokenized_texts, eval_labels)

# Create DataLoader for training and evaluation.
# Only the training batches are shuffled; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

# Load pre-trained DistilBERT model for sequence classification.
# Seed torch's global RNG first: the classification head is newly initialised
# and the training DataLoader shuffles, so an unseeded run is not repeatable.
torch.manual_seed(42)

# Train on a GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per distinct value found in the label list.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(set(labels)))
# Move the model before building the optimizer so it tracks the on-device parameters.
model.to(device)

# Define the optimizer (constant learning rate; no scheduler is used).
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the defaults of the transformers class so the
# update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 6

# Fine-tuning loop: each epoch runs one training pass and one evaluation pass.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1} Training"):
        # Place the batch on the model's device. The label tensor gets its own
        # name so the dataset-level `labels` list is not overwritten.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Supplying labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Training Loss: {average_loss}")

    # Evaluation
    model.eval()
    total_eval_loss = 0

    # No gradients are needed while scoring the held-out set
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc=f"Epoch {epoch + 1} Evaluation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

    average_eval_loss = total_eval_loss / len(eval_dataloader)
    print(f"Epoch {epoch + 1}, Evaluation Loss: {average_eval_loss}")

# Persist the fine-tuned weights and the tokenizer files to separate directories.
# The tokenizer call stays last so the notebook displays the paths it wrote.
model.save_pretrained(save_directory="fine_tune_distilbert_model")
tokenizer.save_pretrained(save_directory="fine_tune_distilbert_tokenizer")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

Epoch 1, Training Loss: 0.6529857665300369


Epoch 1 Evaluation: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]


Epoch 1, Evaluation Loss: 0.5387446284294128


Epoch 2 Training: 100%|██████████| 4/4 [00:09<00:00,  2.28s/it]


Epoch 2, Training Loss: 0.4881639629602432


Epoch 2 Evaluation: 100%|██████████| 1/1 [00:00<00:00,  1.92it/s]


Epoch 2, Evaluation Loss: 0.37635114789009094


Epoch 3 Training: 100%|██████████| 4/4 [00:10<00:00,  2.58s/it]


Epoch 3, Training Loss: 0.38164122402668


Epoch 3 Evaluation: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


Epoch 3, Evaluation Loss: 0.2769585847854614


Epoch 4 Training: 100%|██████████| 4/4 [00:10<00:00,  2.63s/it]


Epoch 4, Training Loss: 0.29586440324783325


Epoch 4 Evaluation: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


Epoch 4, Evaluation Loss: 0.2252878099679947


Epoch 5 Training: 100%|██████████| 4/4 [00:08<00:00,  2.22s/it]


Epoch 5, Training Loss: 0.2478528805077076


Epoch 5 Evaluation: 100%|██████████| 1/1 [00:00<00:00,  2.19it/s]


Epoch 5, Evaluation Loss: 0.18241405487060547


Epoch 6 Training: 100%|██████████| 4/4 [00:09<00:00,  2.30s/it]


Epoch 6, Training Loss: 0.20912672951817513


Epoch 6 Evaluation: 100%|██████████| 1/1 [00:00<00:00,  1.96it/s]


Epoch 6, Evaluation Loss: 0.156773179769516


('fine_tune_distilbert_tokenizer\\tokenizer_config.json',
 'fine_tune_distilbert_tokenizer\\special_tokens_map.json',
 'fine_tune_distilbert_tokenizer\\vocab.txt',
 'fine_tune_distilbert_tokenizer\\added_tokens.json')

In [27]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Restore the tokenizer and the fine-tuned classifier from their saved directories
tokenizer = DistilBertTokenizer.from_pretrained("fine_tune_distilbert_tokenizer")
model = DistilBertForSequenceClassification.from_pretrained("fine_tune_distilbert_model")
model.eval()  # switch to inference mode before predicting

# Feedback sentence to classify
new_input_text = "Buy option is not working cannot place order"

# Encode the sentence as PyTorch tensors
tokenized_input = tokenizer(new_input_text, padding=True, truncation=True, return_tensors="pt")
input_ids = tokenized_input["input_ids"]
attention_mask = tokenized_input["attention_mask"]

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# The predicted class is the index of the largest logit
logits = outputs.logits
predicted_label = logits.argmax(dim=1).item()

print(f"Predicted Label: {predicted_label}")


Predicted Label: 1
