<a href="https://colab.research.google.com/github/Petersonp/FitBot/blob/main/FitBot_NLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab environment setup: each `!` line runs pip in a shell to install a package used by this notebook.
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install rouge_score
# accelerate is pinned to one specific release here.
!pip install accelerate==0.20.1
# NOTE(review): this re-installs transformers (with the torch extra) using -U after the plain
# install above; it may also replace the pinned accelerate with a newer release — confirm the
# resulting versions with `pip list` before relying on the pin.
!pip install transformers[torch] -U

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; the dataset path and the model save path used
# later in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the intent dataset: one utterance per row in `text`, its intent name in `label`.
df = pd.read_csv('/content/drive/MyDrive/23-24/CS 4701/intent.csv')

# Fixed seed so the train/validation split (and therefore every loss reported below)
# is the same after a kernel restart.
SEED = 42

# Stratify on the intent so each class keeps its share of the small 10% validation split.
# scikit-learn needs at least two samples per class to stratify, so fall back to a plain
# random split when some intent is rarer than that.
stratify_labels = df['label'] if df['label'].value_counts().min() >= 2 else None

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=SEED,
    stratify=stratify_labels,
)

# Map every intent name to an integer id (ids follow order of first appearance in the CSV),
# and build the reverse lookup used to turn predictions back into intent names.
label_to_id = {label: i for i, label in enumerate(df['label'].unique())}
id_to_label = {i: label for label, i in label_to_id.items()}
num_intents = len(label_to_id)
print(df)

                                                  text                label
0       How many calories should I eat to lose weight?  CreateNutritionPlan
1              What should my daily protein intake be?  CreateNutritionPlan
2    What's my target calorie intake if I want to g...  CreateNutritionPlan
3    Plan my macros for a weight loss goal of 10 po...  CreateNutritionPlan
4      Need a diet plan that fits 3000 daily calories.  CreateNutritionPlan
..                                                 ...                  ...
410           Give me an exercise that targets biceps.    RecommendExercise
411       Suggest an exercise for improving endurance.    RecommendExercise
412        What are some exercises for a flat stomach?    RecommendExercise
413  Recommend a beginner-friendly exercise for wei...    RecommendExercise
414  Show me an exercise to increase upper body str...    RecommendExercise

[415 rows x 2 columns]


In [None]:
# Show the integer id -> intent name lookup that is used to decode model predictions.
print(id_to_label)

{0: 'CreateNutritionPlan', 1: 'CreateWorkoutPlan', 2: 'RecommendMeal', 3: 'RecommendExercise', 4: 'QueryFood', 5: 'QueryExercise', 6: 'Unrelated'}


In [None]:
# Load the tokenizer that belongs to the bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Options shared by both splits: truncate to at most 128 tokens and pad the encoded texts.
encode_options = dict(truncation=True, padding=True, max_length=128)

# Encode the utterances of each split as plain Python lists of strings.
train_texts = train_df['text'].tolist()
val_texts = val_df['text'].tolist()
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)


In [None]:
class CustomDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their integer labels.

    `encodings` is a mapping from field name to an indexable collection holding one
    entry per example; `labels` is an indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels; nothing is converted until an item is requested."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors: every encoding field plus 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Translate the intent names of each split into their integer ids.
train_labels, val_labels = (
    split['label'].map(label_to_id).tolist() for split in (train_df, val_df)
)

# Wrap encodings and label ids into datasets the Trainer can index.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)


In [None]:
# Load pre-trained BERT with a new classification head sized to the number of intents.
# id2label / label2id are stored in the model config, so the checkpoint saved at the end of
# this notebook carries the intent names with it instead of generic label names; code that
# loads it later no longer has to rebuild the mapping from the CSV row order.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Hyperparameters for fine-tuning.
# The whole run is only ~200 optimizer steps (17 epochs at batch size 32 on this dataset),
# so a fixed 500-step warmup would never finish and the learning rate would never reach its
# target; warmup is therefore expressed as a fraction of the run. Logging once per epoch
# also makes the training loss appear in the progress table (the default step-based logging
# interval is longer than this run, which shows up as "No log").
# NOTE(review): recent transformers releases rename `evaluation_strategy` to `eval_strategy`
# — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # where to save model checkpoints
    num_train_epochs=17,             # number of epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_ratio=0.1,                # warm up over the first 10% of training steps
    weight_decay=0.01,               # weight decay rate
    evaluation_strategy='epoch',     # evaluate after each epoch
    logging_strategy='epoch',        # log training loss after each epoch
    logging_dir='./logs',            # where to store logs
)


In [None]:
# Bundle the model, the hyperparameters and both dataset splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune; the returned training summary is displayed as the cell output.
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,2.002736
2,No log,1.926759
3,No log,1.844813
4,No log,1.808089
5,No log,1.768617
6,No log,1.710552
7,No log,1.600455
8,No log,1.468989
9,No log,1.28576
10,No log,1.08518


TrainOutput(global_step=204, training_loss=1.171887341667624, metrics={'train_runtime': 31.2622, 'train_samples_per_second': 202.832, 'train_steps_per_second': 6.525, 'total_flos': 68433015857850.0, 'train_loss': 1.171887341667624, 'epoch': 17.0})

In [None]:
# Evaluate the fine-tuned model on the eval dataset given to the Trainer (val_dataset)
# and display the returned metrics dict as the cell output.
trainer.evaluate()


{'eval_loss': 0.5746512413024902,
 'eval_runtime': 0.0566,
 'eval_samples_per_second': 741.417,
 'eval_steps_per_second': 17.653,
 'epoch': 17.0}

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model and its inputs have to sit on the same device.
model.to(device)

# Sample utterance to classify.
test_text = "how many calories in 2 lb of chicken"

# Tokenize into PyTorch tensors and place them next to the model.
inputs = tokenizer(
    test_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
).to(device)

# Inference only: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Raw class scores -> probabilities -> most likely class id -> intent name.
logits = outputs.logits
probabilities = logits.softmax(dim=-1)
predicted_label_id = probabilities.argmax(dim=-1).item()
predicted_label = id_to_label[predicted_label_id]

print(f"Input: {test_text}")
print(f"Predicted intent: {predicted_label}")


Input: how many calories in 2 lb of chicken
Predicted intent: QueryFood


In [None]:
# Write the fine-tuned model and the tokenizer files into the same Drive folder.
save_dir = '/content/drive/MyDrive/23-24/CS 4701/Intent Trained Model'

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/23-24/CS 4701/Intent Trained Model/tokenizer_config.json',
 '/content/drive/MyDrive/23-24/CS 4701/Intent Trained Model/special_tokens_map.json',
 '/content/drive/MyDrive/23-24/CS 4701/Intent Trained Model/vocab.txt',
 '/content/drive/MyDrive/23-24/CS 4701/Intent Trained Model/added_tokens.json')