In [1]:
!pip install torch torchvision transformers datasets

Collecting torchvision
  Downloading torchvision-0.22.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Downloading torchvision-0.22.0-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torchvision
Successfully installed torchvision-0.22.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Found existing installation: accelerate 1.6.0
Uninstalling accelerate-1.6.0:
  Successfully uninstalled accelerate-1.6.0
Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Using cached accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate, transformers
Successfully installed accelerate-1.6.0 transformers-4.51.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available

## Data Ingestion

There are 8 unique intent labels in the dataset:
1. Intent_Lease_Abstraction

2. Intent_Comparison_LOI_Lease

3. Intent_Clause_Protect

4. Intent_Company_research

5. Intent_Transaction_Date_navigator

6. Intent_Amendment_Abstraction

7. Intent_Sales_Listings_Comparison

8. Intent_Lease_Listings_Comparison


In [2]:
from datasets import load_dataset

# Load local CSV file
raw_dataset = load_dataset(
    'csv',
    data_files={
        'train': './intent_train_dataset.csv',
        'validation': './intent_test_dataset.csv'
    }
)

# Check the loaded dataset
print(raw_dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'class'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['text', 'class'],
        num_rows: 400
    })
})


## Data preprocesing
(DistilBERT requires tokenized inputs, and Hugging Face makes this straightforward.)

- Tokenization: Use the DistilBertTokenizer to convert text into token IDs.
- Truncation and Padding: Ensure all sequences are the same length by truncating longer texts and padding shorter ones.
- Batching: Group your data into batches for faster processing.


In [10]:
from transformers import DistilBertTokenizer

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Extract unique labels
unique_labels = sorted(set(raw_dataset['train']['class']))
label2id = {label: idx for idx, label in enumerate(unique_labels)}

# Preprocessing function with label mapping
def preprocess_function(examples):
    # Tokenize the text field
    tokenized = tokenizer(examples['text'], padding="max_length", truncation=True)
    
    # Convert class labels to integers
    tokenized["labels"] = [label2id[label] for label in examples["class"]]
    return tokenized

# Apply preprocessing
encoded_dataset = raw_dataset.map(preprocess_function, batched=True)

# Check processed data
print(encoded_dataset['train'][0])


Map: 100%|██████████| 1600/1600 [00:00<00:00, 4507.82 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 4823.27 examples/s]

{'text': 'Can you check the lease and send back the Midtown Center for rent and duration?', 'class': 'Intent_Lease_Abstraction', 'input_ids': [101, 2064, 2017, 4638, 1996, 10084, 1998, 4604, 2067, 1996, 27219, 2415, 2005, 9278, 1998, 9367, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,




## Load base model

In [11]:
from transformers import DistilBertForSequenceClassification

# Load DistilBERT model for classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##  Fine-Tuning DistilBERT

In [12]:
# Training Args
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving results
    eval_strategy="epoch",     # Evaluate at the end of each epoch
    learning_rate=5e-5,              # Initial learning rate
    per_device_train_batch_size=16,  # Batch size per GPU
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Regularization
    logging_dir="./logs",            # Directory for logs
    logging_steps=10                 # Log every 10 steps
)

## Training the Model

In [13]:
# !pip install wandb


In [14]:
# Monitoring Training
import wandb
wandb.login()  # Log in to your account

from transformers import Trainer

trainer = Trainer(
    model=model,                          # The DistilBERT model
    args=training_args,                   # Training arguments
    train_dataset=encoded_dataset['train'],  # Training data
    eval_dataset=encoded_dataset['validation']  # Validation data
)

# Start training
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/tejasgadi/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtvggamermax[0m ([33mtvggamermax-na[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Model Evaluation
- When evaluating a fine-tuned model like DistilBERT, metrics like accuracy and F1-score are essential — but they only tell part of the story.

- Take a deeper dive into classification reports and confusion matrices. These give you a clear picture of where the model excels and where it struggles.

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions
predictions = trainer.predict(encoded_dataset['test'])
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = encoded_dataset['test']['labels']

# Generate a classification report
print(classification_report(true_labels, predicted_labels))

In [None]:
# Error analysis

# Inspect misclassified examples
for i, (true, pred) in enumerate(zip(true_labels, predicted_labels)):
    if true != pred:
        print(f"Example {i}:")
        print(f"Text: {encoded_dataset['test']['sentence'][i]}")
        print(f"True Label: {true}, Predicted Label: {pred}")


In [None]:
# Save the model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [None]:
# Load them back when needed for inference
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model")
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_model")