In [None]:
# !pip install torch torchvision transformers datasets

In [None]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Found existing installation: accelerate 1.6.0
Uninstalling accelerate-1.6.0:
  Successfully uninstalled accelerate-1.6.0
Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Using cached accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-1.6.0 transformers-4.51.3


## Data Ingestion

There are 8 unique intent labels in the dataset:
1. Intent_Lease_Abstraction

2. Intent_Comparison_LOI_Lease

3. Intent_Clause_Protect

4. Intent_Company_research

5. Intent_Transaction_Date_navigator

6. Intent_Amendment_Abstraction

7. Intent_Sales_Listings_Comparison

8. Intent_Lease_Listings_Comparison


In [None]:
# !pip install datasets

In [None]:
from datasets import load_dataset

# Load local CSV file
raw_dataset = load_dataset(
    'csv',
    data_files={
        'train': './intent_train_dataset.csv',
        'validation': './intent_test_dataset.csv'
    }
)

# Check the loaded dataset
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['email_text', 'intent'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['email_text', 'intent'],
        num_rows: 800
    })
})


## Data preprocesing
(DistilBERT requires tokenized inputs, and Hugging Face makes this straightforward.)

- Tokenization: Use the DistilBertTokenizer to convert text into token IDs.
- Truncation and Padding: Ensure all sequences are the same length by truncating longer texts and padding shorter ones.
- Batching: Group your data into batches for faster processing.


In [None]:
from transformers import DistilBertTokenizer

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

label2id = {
    "Intent_Lease_Abstraction": 0,
    "Intent_Comparison_LOI_Lease": 1,
    "Intent_Clause_Protect": 2,
    "Intent_Company_research": 3,
    "Intent_Transaction_Date_navigator": 4,
    "Intent_Amendment_Abstraction": 5,
    "Intent_Sales_Listings_Comparison": 6,
    "Intent_Lease_Listings_Comparison": 7,
}

id2label = {v: k for k, v in label2id.items()}

# Preprocessing function with label mapping
def preprocess_function(examples):
    # Tokenize the text field
    tokenized = tokenizer(examples['email_text'], padding="max_length", truncation=True)

    # Convert class labels to integers
    tokenized["labels"] = [label2id[label] for label in examples["intent"]]
    return tokenized

# Apply preprocessing
encoded_dataset = raw_dataset.map(preprocess_function, batched=True)

# Check processed data
print(encoded_dataset['train'][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'email_text': 'Kindly abstract the lease document (attached) for the Miller Plaza Project location. Focus on rent, term, and landlord info.', 'intent': 'Intent_Lease_Abstraction', 'input_ids': [101, 19045, 10061, 1996, 10084, 6254, 1006, 4987, 1007, 2005, 1996, 4679, 8232, 2622, 3295, 1012, 3579, 2006, 9278, 1010, 2744, 1010, 1998, 18196, 18558, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Load base model

In [None]:
from transformers import DistilBertForSequenceClassification

# Load DistilBERT model for classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=8)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##  Fine-Tuning DistilBERT

In [None]:
# Training Args
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving results
    eval_strategy="epoch",     # Evaluate at the end of each epoch
    learning_rate=5e-5,              # Initial learning rate
    per_device_train_batch_size=16,  # Batch size per GPU
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Regularization
    logging_dir="./logs",            # Directory for logs
    logging_steps=10                 # Log every 10 steps
)

## Training the Model

In [None]:
# !pip install wandb


In [None]:
# Monitoring Training
# import wandb
# wandb.login()  # Log in to your account

from transformers import Trainer

trainer = Trainer(
    model=model,                          # The DistilBERT model
    args=training_args,                   # Training arguments
    train_dataset=encoded_dataset['train'],  # Training data
    eval_dataset=encoded_dataset['validation']  # Validation data
)

# Start training
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtvggamermax[0m ([33mtvggamermax-na[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0013,0.089437
2,0.0005,0.091196
3,0.0004,0.090211


TrainOutput(global_step=1500, training_loss=0.03576590119938677, metrics={'train_runtime': 1143.1293, 'train_samples_per_second': 20.995, 'train_steps_per_second': 1.312, 'total_flos': 3179557748736000.0, 'train_loss': 0.03576590119938677, 'epoch': 3.0})

## Model Evaluation
- When evaluating a fine-tuned model like DistilBERT, metrics like accuracy and F1-score are essential — but they only tell part of the story.

- Take a deeper dive into classification reports and confusion matrices. These give you a clear picture of where the model excels and where it struggles.

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions
predictions = trainer.predict(encoded_dataset['validation'])
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = encoded_dataset['validation']['labels']

# Generate a classification report
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       1.00      0.77      0.87       100
           1       1.00      1.00      1.00       100
           2       0.81      1.00      0.90       100
           3       1.00      1.00      1.00       100
           4       1.00      1.00      1.00       100
           5       1.00      1.00      1.00       100
           6       0.98      1.00      0.99       100
           7       1.00      0.98      0.99       100

    accuracy                           0.97       800
   macro avg       0.97      0.97      0.97       800
weighted avg       0.97      0.97      0.97       800



In [None]:
# Error analysis

# Inspect misclassified examples
for i, (true, pred) in enumerate(zip(true_labels, predicted_labels)):
    if true != pred:
        print(f"Example {i}:")
        print(f"Text: {encoded_dataset['validation']['email_text'][i]}")
        print(f"True Label: {true}, Predicted Label: {pred}")


Example 0:
Text: Provide an abstraction for the attached lease related to Boone Light Project. Key terms and clauses needed.
True Label: 0, Predicted Label: 2
Example 1:
Text: Provide an abstraction for the attached lease related to Johns Drives Project. Key terms and clauses needed.
True Label: 0, Predicted Label: 2
Example 3:
Text: Provide an abstraction for the attached lease related to Reyes Summit Project. Key terms and clauses needed.
True Label: 0, Predicted Label: 2
Example 5:
Text: Provide an abstraction for the attached lease related to Patricia Way Project. Key terms and clauses needed.
True Label: 0, Predicted Label: 2
Example 18:
Text: Provide an abstraction for the attached lease related to Nicholas Ridge Project. Key terms and clauses needed.
True Label: 0, Predicted Label: 2
Example 25:
Text: Provide an abstraction for the attached lease related to Reynolds Pine Project. Key terms and clauses needed.
True Label: 0, Predicted Label: 2
Example 27:
Text: Provide an abstrac

In [None]:
# Save the model and tokenizer
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.txt',
 './finetuned_model/added_tokens.json')

In [None]:
# Load them back when needed for inference
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
model = DistilBertForSequenceClassification.from_pretrained("./finetuned_model")
tokenizer = DistilBertTokenizer.from_pretrained("./finetuned_model")

In [None]:
# # Inference
# from transformers import pipeline

# # Load pipeline
# classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# # Sample email
# email_text = "Please extract the rent and renewal terms from the lease document for 45 Pine St."

# # Run inference
# predictions = classifier(email_text)

# # Show prediction
# print(predictions)

In [None]:
import torch

def predict(text: str):
    # Check if CUDA is available and set the device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the same device
    model.to(device)

    # Tokenize the input text and move to the same device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Ensure input tensors are on the same device

    # Perform prediction
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(axis=1).item()

    return {"text": text, "prediction": predictions}

In [None]:
email_text = "Could you do a background check on Wexford Corp before we proceed? I’m particularly interested in any public disputes or bankruptcies in the past 5 years."
pred = predict(email_text)
pred_class = id2label[pred["prediction"]]
print(pred, "\n" ,pred_class)

{'text': 'Could you do a background check on Wexford Corp before we proceed? I’m particularly interested in any public disputes or bankruptcies in the past 5 years.', 'prediction': 3} 
 Intent_Company_research
