<a href="https://colab.research.google.com/github/Sidhtang/bert-project/blob/main/fine_tuning_distil_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai==0.28



In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.1

In [8]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load and preprocess data
with open('/content/file_updated (1).json', 'r') as f:
    data = json.load(f)

# Convert data format to match the custom dataset
formatted_data = [{'input': item['query'], 'output': item['label']} for item in data]

train_data, val_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

# Create a custom dataset class
class QueryDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer.encode_plus(
            item['input'],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(1 if item['output'] == 'Personalization' else 0, dtype=torch.long)
        }

# Initialize tokenizer and model
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare datasets and dataloaders
train_dataset = QueryDataset(train_data, tokenizer)
val_dataset = QueryDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Set up optimizer and device
optimizer = AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().tolist())
            val_labels.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_loader)

    # Calculate metrics
    accuracy = accuracy_score(val_labels, val_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1-score: {f1:.4f}")
    print("--------------------")

# Save the model
torch.save(model.state_dict(), 'query_relevance_model.pth')
print("Model saved successfully.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/15
Training Loss: 0.6904
Validation Loss: 0.6928
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/15
Training Loss: 0.6840
Validation Loss: 0.6919
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/15
Training Loss: 0.6809
Validation Loss: 0.6910
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/15
Training Loss: 0.6882
Validation Loss: 0.6900
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/15
Training Loss: 0.6768
Validation Loss: 0.6884
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/15
Training Loss: 0.6801
Validation Loss: 0.6857
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/15
Training Loss: 0.6724
Validation Loss: 0.6820
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 8/15
Training Loss: 0.6639
Validation Loss: 0.6763
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 9/15
Training Loss: 0.6420
Validation Loss: 0.6676
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 10/15
Training Loss: 0.6500
Validation Loss: 0.6547
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 11/15
Training Loss: 0.6419
Validation Loss: 0.6355
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 12/15
Training Loss: 0.5970
Validation Loss: 0.6066
Validation Accuracy: 0.5000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------
Epoch 13/15
Training Loss: 0.5917
Validation Loss: 0.5654
Validation Accuracy: 0.6667
Validation Precision: 1.0000
Validation Recall: 0.3333
Validation F1-score: 0.5000
--------------------
Epoch 14/15
Training Loss: 0.5397
Validation Loss: 0.5072
Validation Accuracy: 0.9167
Validation Precision: 1.0000
Validation Recall: 0.8333
Validation F1-score: 0.9091
--------------------
Epoch 15/15
Training Loss: 0.4888
Validation Loss: 0.4339
Validation Accuracy: 0.9167
Validation Precision: 1.0000
Validation Recall: 0.8333
Validation F1-score: 0.9091
--------------------
Model saved successfully.


In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def load_model(model_path, model_name='distilroberta-base'):
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model architecture
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Load the trained weights
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    # Set the model to evaluation mode
    model.eval()

    return model, tokenizer

def predict(model, tokenizer, query):
    # Tokenize the input query
    encoding = tokenizer.encode_plus(
        query,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Make sure we're not tracking gradients
    with torch.no_grad():
        # Get the model's prediction
        outputs = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])
        prediction = torch.argmax(outputs.logits, dim=1)

    # Return the result as a string
    return "Personalization" if prediction.item() == 1 else "Customer_support"

# Example usage
if __name__ == "__main__":
    # Load the model and tokenizer
    model_path = 'query_relevance_model.pth'
    model, tokenizer = load_model(model_path)

    # Example queries
    queries = [
        "I want to find a gift for my sister, but I don’t know what would suit her style. Can you recommend something?",
        "Can you recommend something in your store that looks similar to this jacket I found on another website?",
        "Can you suggest shoes that match this outfit picture I uploaded",
        "What is the return policy for online purchases?",
        "Can you help me figure out how to cancel my order?",
        "I’m looking for products that can be delivered within 3 days. Can you assist with that?",
        "I can’t seem to find this shirt in the size I need. Could you check availability for me?",
        "Is the blue tshirt available in size medium",
         "Can I get some suggestions for professional wear based on my previous purchases?"


    ]

    # Make predictions
    for query in queries:
        result = predict(model, tokenizer, query)
        print(f"Query: {query}")
        print(f"Prediction: {result}")
        print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


Query: I want to find a gift for my sister, but I don’t know what would suit her style. Can you recommend something?
Prediction: Personalization

Query: Can you recommend something in your store that looks similar to this jacket I found on another website?
Prediction: Personalization

Query: Can you suggest shoes that match this outfit picture I uploaded
Prediction: Personalization

Query: What is the return policy for online purchases?
Prediction: Customer_support

Query: Can you help me figure out how to cancel my order?
Prediction: Customer_support

Query: I’m looking for products that can be delivered within 3 days. Can you assist with that?
Prediction: Customer_support

Query: I can’t seem to find this shirt in the size I need. Could you check availability for me?
Prediction: Customer_support

Query: Is the blue tshirt available in size medium
Prediction: Personalization

Query: Can I get some suggestions for professional wear based on my previous purchases?
Prediction: Personaliz