<a href="https://colab.research.google.com/github/Sidhtang/bert-project/blob/main/fine_tuning_distil_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai==0.28



In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [7]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.

In [9]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the labelled queries. JSON text is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform's default codec.
with open('fashion_query_training_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Re-key every record ('query' -> 'input', 'label' -> 'output') to the field
# names that QueryDataset reads.
formatted_data = [{'input': item['query'], 'output': item['label']} for item in data]

# Hold out 20% of the records for validation; the fixed random_state keeps the
# split identical from run to run.
train_data, val_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

# Torch dataset that tokenizes query records on demand
class QueryDataset(Dataset):
    """Map-style dataset over query records, tokenized lazily one item at a time.

    Each record is a mapping with an 'input' text and an 'output' label string.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Keep references to the records, the tokenizer and the sequence length.

        Args:
            data: indexable collection of records with 'input' and 'output' keys.
            tokenizer: object exposing ``encode_plus``.
            max_length: length passed to the tokenizer for padding/truncation.
        """
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its model inputs and label.

        Returns:
            dict with 'input_ids' and 'attention_mask' (tokenizer output flattened
            to 1-D) and 'labels' (0-dim long tensor: 1 for 'Relevant', else 0).
        """
        record = self.data[idx]

        # Pad/truncate every query to the same length and ask for torch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(record['input'], **tokenizer_options)

        # Only the exact string 'Relevant' maps to class 1; anything else is class 0.
        if record['output'] == 'Relevant':
            label_id = 1
        else:
            label_id = 0

        sample = {}
        sample['input_ids'] = torch.flatten(encoded['input_ids'])
        sample['attention_mask'] = torch.flatten(encoded['attention_mask'])
        sample['labels'] = torch.tensor(label_id, dtype=torch.long)
        return sample

# Seed torch's global RNG so the newly initialised classifier head and the
# DataLoader shuffle order repeat across runs. Best effort only: some GPU
# kernels may still be non-deterministic.
torch.manual_seed(42)

# Initialize tokenizer and model (binary classification head on top of the encoder)
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare datasets and dataloaders; only the training set is shuffled
train_dataset = QueryDataset(train_data, tokenizer)
val_dataset = QueryDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Set up optimizer and device.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are spelled out because torch's defaults differ from the
# ones the transformers class used (eps=1e-6, weight_decay=0.0 — TODO confirm
# against the transformers version previously installed).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop: one full pass over train_loader per epoch, then validation
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation: eval mode, no gradient tracking
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().tolist())
            # Labels are only needed for the metrics, so they never leave the CPU
            val_labels.extend(batch['labels'].tolist())

    # Calculate metrics (class 1, i.e. 'Relevant', is the positive class)
    accuracy = accuracy_score(val_labels, val_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1-score: {f1:.4f}")

# Save the model weights (state_dict only).
# NOTE(review): this stores the weights from the final epoch, which is not
# necessarily the epoch with the best validation score.
torch.save(model.state_dict(), 'query_relevance_model.pth')
print("Model saved successfully.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Validation Accuracy: 0.8750
Validation Precision: 0.8077
Validation Recall: 1.0000
Validation F1-score: 0.8936
Epoch 2/5
Validation Accuracy: 0.9000
Validation Precision: 0.8400
Validation Recall: 1.0000
Validation F1-score: 0.9130
Epoch 3/5
Validation Accuracy: 0.9250
Validation Precision: 0.8750
Validation Recall: 1.0000
Validation F1-score: 0.9333
Epoch 4/5
Validation Accuracy: 1.0000
Validation Precision: 1.0000
Validation Recall: 1.0000
Validation F1-score: 1.0000
Epoch 5/5
Validation Accuracy: 0.9000
Validation Precision: 0.8400
Validation Recall: 1.0000
Validation F1-score: 0.9130
Model saved successfully.


In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def load_model(model_path, model_name='distilroberta-base'):
    """Rebuild the two-label classifier and load saved weights into it.

    Args:
        model_path: path to a state_dict file written with ``torch.save``.
        model_name: Hugging Face checkpoint name used for the tokenizer and
            for the model architecture.

    Returns:
        ``(model, tokenizer)``; the model is on CPU and in evaluation mode.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model architecture; its weights are overwritten just below
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Load the trained weights onto CPU. weights_only=True restricts unpickling
    # to tensors and plain containers, so a tampered checkpoint file cannot run
    # arbitrary code; a state_dict needs nothing more than that.
    state_dict = torch.load(model_path, map_location=torch.device('cpu'), weights_only=True)
    model.load_state_dict(state_dict)

    # Set the model to evaluation mode
    model.eval()

    return model, tokenizer

def predict(model, tokenizer, query):
    """Classify a single query as "Relevant" or "Not Relevant".

    Args:
        model: callable taking ``input_ids`` and ``attention_mask`` and returning
            an object with a ``logits`` attribute.
        tokenizer: object exposing ``encode_plus``.
        query: the text to classify.

    Returns:
        "Relevant" when the arg-max class index is 1, otherwise "Not Relevant".
    """
    # Tokenize the input query, padded/truncated to 128 tokens
    encoding = tokenizer.encode_plus(
        query,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Put the inputs on the same device as the model so this also works when the
    # caller has moved the model to a GPU. Callables without parameters are
    # left untouched and receive the tensors as tokenized.
    params = model.parameters() if hasattr(model, 'parameters') else iter(())
    first_param = next(iter(params), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    # Make sure we're not tracking gradients
    with torch.no_grad():
        # Get the model's prediction
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1)

    # Return the result as a string
    return "Relevant" if prediction.item() == 1 else "Not Relevant"

# Demo: classify a handful of sample queries with the saved checkpoint
if __name__ == "__main__":
    # Restore the fine-tuned classifier and its tokenizer from disk
    model_path = 'query_relevance_model.pth'
    model, tokenizer = load_model(model_path)

    # A mix of shopping-related and unrelated questions
    queries = [
        "Do you have any floral summer dresses in stock?",
        "What's your return policy for online purchases?",
        "Can you recommend a good restaurant nearby?",
        "Are these jeans available in size 32?",
        "Is the black leather jacket still available in medium?",
        "What's your favorite color?",
        "When will the new winter collection be released?",
        "How do I change my account password?",
        "Is the blue tshirt available in size medium"
    ]

    # Print each query with its predicted label, followed by a blank line
    for query in queries:
        result = predict(model, tokenizer, query)
        print(f"Query: {query}", f"Prediction: {result}", "", sep="\n")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


Query: Do you have any floral summer dresses in stock?
Prediction: Relevant

Query: What's your return policy for online purchases?
Prediction: Not Relevant

Query: Can you recommend a good restaurant nearby?
Prediction: Not Relevant

Query: Are these jeans available in size 32?
Prediction: Relevant

Query: Is the black leather jacket still available in medium?
Prediction: Relevant

Query: What's your favorite color?
Prediction: Not Relevant

Query: When will the new winter collection be released?
Prediction: Relevant

Query: How do I change my account password?
Prediction: Not Relevant

Query: Is the blue tshirt available in size medium
Prediction: Relevant

