<a href="https://colab.research.google.com/github/Sidhtang/bert-project/blob/main/fine_tuning_distil_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai==0.28



In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.1

In [3]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load and preprocess data
with open('/content/file_updated.json', 'r') as f:
    data = json.load(f)

# Convert data format to match the custom dataset
formatted_data = [{'input': item['query'], 'output': item['label']} for item in data]

train_data, val_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

# Create a custom dataset class
class QueryDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = self.tokenizer.encode_plus(
            item['input'],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(1 if item['output'] == 'Personalization' else 0, dtype=torch.long)
        }

# Initialize tokenizer and model
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare datasets and dataloaders
train_dataset = QueryDataset(train_data, tokenizer)
val_dataset = QueryDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Set up optimizer and device
optimizer = AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().tolist())
            val_labels.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_loader)

    # Calculate metrics
    accuracy = accuracy_score(val_labels, val_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1-score: {f1:.4f}")
    print("--------------------")

# Save the model
torch.save(model.state_dict(), 'query_relevance_model.pth')
print("Model saved successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/15
Training Loss: 0.6878
Validation Loss: 0.6753
Validation Accuracy: 0.6538
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/15
Training Loss: 0.6856
Validation Loss: 0.6608
Validation Accuracy: 0.6538
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1-score: 0.0000
--------------------
Epoch 3/15
Training Loss: 0.6726
Validation Loss: 0.6445
Validation Accuracy: 0.6731
Validation Precision: 1.0000
Validation Recall: 0.0556
Validation F1-score: 0.1053
--------------------
Epoch 4/15
Training Loss: 0.6214
Validation Loss: 0.6136
Validation Accuracy: 0.7308
Validation Precision: 0.5625
Validation Recall: 1.0000
Validation F1-score: 0.7200
--------------------
Epoch 5/15
Training Loss: 0.5369
Validation Loss: 0.5643
Validation Accuracy: 0.7115
Validation Precision: 0.5455
Validation Recall: 1.0000
Validation F1-score: 0.7059
--------------------
Epoch 6/15
Training Loss: 0.3868
Validation Loss: 0.3838
Validation Accuracy: 0.8462
Validation Precision: 0.7083
Validation Recall: 0.9444
Validation F1-score: 0.8095
--------------------
Epoch 7/15
Training Loss: 0.2786
Validation Loss: 0.343

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def load_model(model_path, model_name='distilroberta-base'):
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model architecture
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Load the trained weights
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    # Set the model to evaluation mode
    model.eval()

    return model, tokenizer

def predict(model, tokenizer, query):
    # Tokenize the input query
    encoding = tokenizer.encode_plus(
        query,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Make sure we're not tracking gradients
    with torch.no_grad():
        # Get the model's prediction
        outputs = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])
        prediction = torch.argmax(outputs.logits, dim=1)

    # Return the result as a string
    return "Personalization" if prediction.item() == 1 else "Customer_support"

# Example usage
if __name__ == "__main__":
    # Load the model and tokenizer
    model_path = 'query_relevance_model.pth'
    model, tokenizer = load_model(model_path)

    # Example queries
    queries = [
        "Do you have any floral summer dresses in stock?",
        "What's your return policy for online purchases?",
        "Can you recommend a good restaurant nearby?",
        "Are these jeans available in size 32?",
        "Is the black leather jacket still available in medium?",
        "What's your favorite color?",
        "When will the new winter collection be released?",
        "How do I change my account password?",
        "Is the blue tshirt available in size medium"
    ]

    # Make predictions
    for query in queries:
        result = predict(model, tokenizer, query)
        print(f"Query: {query}")
        print(f"Prediction: {result}")
        print()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


Query: Do you have any floral summer dresses in stock?
Prediction: Personalization

Query: What's your return policy for online purchases?
Prediction: Customer_support

Query: Can you recommend a good restaurant nearby?
Prediction: Customer_support

Query: Are these jeans available in size 32?
Prediction: Personalization

Query: Is the black leather jacket still available in medium?
Prediction: Personalization

Query: What's your favorite color?
Prediction: Customer_support

Query: When will the new winter collection be released?
Prediction: Personalization

Query: How do I change my account password?
Prediction: Customer_support

Query: Is the blue tshirt available in size medium
Prediction: Personalization

