In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import json

# Load the entire JSON file
with open('/content/drive/MyDrive/Dataset/umap2020_IARD_Cai.json', 'r') as f:
    data = json.load(f)  # Load the entire file

# Inspect the structure of the JSON file
print(type(data))  # Check the type of the root element (dict or list)

# Print a sample of the data
if isinstance(data, list):
    sample_data = data[0]  # Get the first element if it's a list
elif isinstance(data, dict):
    sample_data = {k: data[k] for k in list(data)[:2]}  # Get the first 5 key-value pairs if it's a dictionary
else:
    sample_data = data  # In case it's something else (not common)

print(json.dumps(sample_data, indent=4))  # Pretty print the sample data


<class 'dict'>
{
    "474": {
        "accepted_recommendation": [
            13
        ],
        "dialogue_info": {
            "S1": {
                "utterance_pos": 1,
                "worker_id": 1,
                "role": "seeker",
                "utterance_text": "Hi can you help me find a movie to watch",
                "top-level intent/action": [
                    "OTH",
                    "AskForRec"
                ],
                "sub-intent/action": [
                    "OTH",
                    "IQU"
                ]
            },
            "R2": {
                "utterance_pos": 2,
                "worker_id": 16,
                "role": "recommender",
                "utterance_text": "Yes, how about @187028 <It  (2017)>  ?",
                "top-level intent/action": [
                    "Recommend"
                ],
                "sub-intent/action": [
                    "REC-E"
                ]
            },
            "S3": {
            

In [2]:
import pandas as pd
import json

# Load the entire JSON file
with open('/content/drive/MyDrive/Dataset/umap2020_IARD_Cai.json', 'r') as f:
    data = json.load(f)  # Load the entire file

# Initialize lists to store data
session_ids = []
turn_ids = []
roles = []
utterance_texts = []
top_level_intents = []
# sub_intents = []

# Iterate over each session
for session_id, session_data in data.items():
    # Iterate over dialogue turns
    dialogue_info = session_data['dialogue_info']
    for turn_id, turn_data in dialogue_info.items():
        session_ids.append(session_id)
        turn_ids.append(turn_id)
        roles.append(turn_data['role'])
        utterance_texts.append(turn_data['utterance_text'])
        top_level_intents.append(", ".join(turn_data['top-level intent/action']))
        # sub_intents.append(", ".join(turn_data['sub-intent/action']))

# Create a DataFrame
df = pd.DataFrame({
    'Session ID': session_ids,
    'Turn ID': turn_ids,
    'Role': roles,
    'Utterance Text': utterance_texts,
    'Top-level Intent/Action': top_level_intents,
    # 'Sub-intent/Action': sub_intents
})

In [3]:
df.head()

Unnamed: 0,Session ID,Turn ID,Role,Utterance Text,Top-level Intent/Action
0,474,S1,seeker,Hi can you help me find a movie to watch,"OTH, AskForRec"
1,474,R2,recommender,"Yes, how about @187028 <It (2017)> ?",Recommend
2,474,S3,seeker,I don't really like horror movies what about t...,GiveFeedback
3,474,R4,recommender,@203424 <The Silence of the Lambs (1991)>,Recommend
4,474,S5,seeker,I have seen it and enjoyed it...One of my favo...,"GiveFeedback, AddDetails"


In [4]:
seeker_df = df[df['Role'] == 'seeker'].reset_index(drop=True)
seeker_df.head()

Unnamed: 0,Session ID,Turn ID,Role,Utterance Text,Top-level Intent/Action
0,474,S1,seeker,Hi can you help me find a movie to watch,"OTH, AskForRec"
1,474,S3,seeker,I don't really like horror movies what about t...,GiveFeedback
2,474,S5,seeker,I have seen it and enjoyed it...One of my favo...,"GiveFeedback, AddDetails"
3,474,S7,seeker,That is okay I just do not really like the @16...,AddDetails
4,474,S9,seeker,That is a lot of peoples favorite I just had a...,AddDetails


In [15]:
pip install transformers[torch]



In [None]:
import os
os._exit(00)

In [5]:
pip install nlpaug



In [6]:
pip install transformers



In [7]:
pip install sacremoses



In [8]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw
import random

# Ensure that the device is set to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define intents
intent_classes = ['AskForRec', 'GiveFeedback', 'AddDetails', 'OTH']

# Split data
X_train, X_test, y_train, y_test = train_test_split(seeker_df['Utterance Text'], seeker_df['Top-level Intent/Action'], test_size=0.2, random_state=42)

# Augmentation functions
synonym_aug = naw.SynonymAug(aug_src='wordnet')
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en',
    device=device.type
)

def synonym_augment(text):
    return synonym_aug.augment(text)

def back_translation_augment(text):
    return back_translation_aug.augment(text)

augmenters = {
    'synonym': synonym_augment,
    'back_translation': back_translation_augment
}

# Apply augmentations
def augment_text(text, augmenters, num_augs=2):
    augmented_texts = [text]
    for _ in range(num_augs):
        aug_type = random.choice(list(augmenters.keys()))
        augmented_text = augmenters[aug_type](text)
        augmented_texts.append(augmented_text)
    return augmented_texts

# Augment the training data
X_train_augmented = X_train.apply(lambda x: pd.Series(augment_text(x, augmenters)))
X_train_augmented = X_train_augmented.melt().dropna()['value']
y_train_augmented = pd.concat([y_train] * (len(X_train_augmented) // len(X_train)))

# Combine original and augmented data
X_train_combined = pd.concat([X_train, X_train_augmented])
y_train_combined = pd.concat([y_train, y_train_augmented])

# Reset index
X_train_combined.reset_index(drop=True, inplace=True)
y_train_combined.reset_index(drop=True, inplace=True)

print("Original Data Size:", len(X_train))
print("Augmented Data Size:", len(X_train_combined))

# Save the augmented dataset to a CSV file
augmented_df = pd.DataFrame({'Utterance Text': X_train_combined, 'Top-level Intent/Action': y_train_combined})
augmented_df.to_csv('augmented_dataset.csv', index=False)


Using device: cuda


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/849k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/849k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

  self.pid = os.fork()
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original Data Size: 1808
Augmented Data Size: 7232


In [9]:
import pandas as pd
import re

# Read the CSV file
augmented_df = pd.read_csv('augmented_dataset.csv')

def remove_at_numbers(text):
    return re.sub(r'@\d+', '', text)

# Preprocess the text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s\[\]_]', '', text)
    return text

# Apply the function to the 'Utterance Text' column
augmented_df['Utterance Text'] = augmented_df['Utterance Text'].apply(remove_at_numbers)

# Apply normalization
augmented_df['Utterance Text'] = augmented_df['Utterance Text'].apply(preprocess_text)

# Save the cleaned data back to a CSV file
augmented_df.to_csv('cleaned_augmented_dataset.csv', index=False)

print("CSV file cleaned and saved as 'cleaned_augmented_dataset.csv'")

CSV file cleaned and saved as 'cleaned_augmented_dataset.csv'


In [10]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Ensure that the device is set to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define intents
intent_classes = ['AskForRec', 'GiveFeedback', 'AddDetails', 'OTH']

# Load the augmented dataset
augmented_df = pd.read_csv('cleaned_augmented_dataset.csv')

# Split data
X_train_combined = augmented_df['Utterance Text']
y_train_combined = augmented_df['Top-level Intent/Action']
X_train, X_test, y_train, y_test = train_test_split(X_train_combined, y_train_combined, test_size=0.2, random_state=42)

class IntentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

def prepare_dataset_for_intent(intent, X, y):
    labels = [1 if intent in label else 0 for label in y]
    return IntentDataset(X, pd.Series(labels), tokenizer, max_len=128)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Function to compute metrics
def compute_metrics(p):
    preds = p.predictions >= 0.5
    labels = p.label_ids
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }

# Custom Trainer to handle class weights
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to(self.args.device)
        outputs = model(**inputs)
        logits = outputs.logits.squeeze(-1)

        # Calculate weighted loss
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=self.class_weights.to(self.args.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Initialize dictionary to store models
models = {}
class_weights = {}

# Calculate class weights
total_samples = len(y_train_combined)
for intent in intent_classes:
    intent_count = sum([1 for labels in y_train_combined if intent in labels])
    class_weights[intent] = total_samples / (len(intent_classes) * intent_count)

# Train a separate model for each intent
for intent in intent_classes:
    print(f"Training model for intent: {intent}")

    # Prepare dataset for current intent using the combined data
    train_dataset = prepare_dataset_for_intent(intent, X_train, y_train)
    test_dataset = prepare_dataset_for_intent(intent, X_test, y_test)

    # Define model for current intent
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

    # Adjust class weights
    class_weight = torch.tensor([class_weights[intent]])

    # Trainer for current intent
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        class_weights=class_weight  # Use class_weights here instead of class_weight
    )

    # Train the model
    trainer.train()

    # Add trained model to dictionary
    models[intent] = model


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Training model for intent: AskForRec


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2384,0.183721,0.944713,0.944056
2,0.093,0.134865,0.978576,0.978436
3,0.0012,0.132584,0.98065,0.980359
4,0.0925,0.09242,0.984796,0.984796
5,0.0002,0.144841,0.985487,0.985348
6,0.0003,0.087698,0.990325,0.990311
7,0.0001,0.085535,0.990325,0.990325
8,0.0,0.157286,0.984105,0.984001
9,0.0,0.130727,0.986869,0.986763
10,0.0,0.132413,0.986869,0.986763


Training model for intent: GiveFeedback


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1709,0.17266,0.923981,0.923886
2,0.0253,0.135034,0.953697,0.953647
3,0.0009,0.113854,0.970974,0.970966
4,0.0065,0.069903,0.981341,0.981355
5,0.0435,0.098568,0.978576,0.97858
6,0.0006,0.134108,0.977885,0.977843
7,0.0008,0.117518,0.978576,0.97858
8,0.0002,0.098378,0.983414,0.983414
9,0.0001,0.104257,0.98065,0.980655
10,0.0022,0.10281,0.982723,0.982725


Training model for intent: AddDetails


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4507,0.5412,0.850035,0.811659
2,0.3477,0.334471,0.931583,0.931249
3,0.4314,0.23062,0.953006,0.953259
4,0.0804,0.295685,0.965446,0.964568
5,0.0002,0.243748,0.972357,0.972122
6,0.0015,0.290998,0.968901,0.968285
7,0.0001,0.213624,0.977885,0.977566
8,0.0003,0.234907,0.97443,0.974305
9,0.0,0.23525,0.977885,0.977633
10,0.0,0.234603,0.978576,0.978379


Training model for intent: OTH


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1827,0.133415,0.96199,0.96192
2,0.0236,0.100422,0.973048,0.972893
3,0.0023,0.109051,0.973739,0.973843
4,0.0015,0.129373,0.975812,0.975754
5,0.0002,0.09873,0.979959,0.979921
6,0.0006,0.124608,0.979959,0.979855
7,0.0009,0.136764,0.978576,0.97849
8,0.0,0.107092,0.982032,0.981964
9,0.0,0.11878,0.981341,0.981276
10,0.0022,0.120777,0.981341,0.981276


In [11]:
import pickle

# Save the models dictionary to a PKL file
with open('/content/drive/MyDrive/Dataset/Intent_Recognition_BERT.pkl', 'wb') as f:
    pickle.dump(models, f)


In [12]:

# Function to predict intents
def predict_intent(text, threshold=0.5):
    # Tokenize input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Initialize list to store predicted intents
    predicted_intents = []

    # Iterate through each intent
    for intent in intent_classes:
        model = models[intent]

        # Move tensors to device
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # Get model predictions
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)

        # Compute probabilities and classify based on threshold
        prob = torch.sigmoid(logits).item()
        if prob >= threshold:
            predicted_intents.append(intent)

    return predicted_intents

# Example texts to predict intents for
texts = [
    "Can you recommend a good movie to watch tonight?", #AskForRec
    "What movie would you suggest for a fun night in?", #AskForRec
    "I'm looking for a new movie to watch. Any suggestions?", #AskForRec
    "Can you help me find a great action movie?", #AskForRec
    "I need a good movie to watch with my friends. Any ideas?", #AskForRec
    "I'm in the mood for a comedy.", #AddDetails
    "I loved 'Inception'. Any similar movies you can recommend?", #AddDetails
    "What's a good family movie for us to watch together?", #AddDetails
    "I prefer movies with a strong female lead.", #AddDetails
    "Can you suggest a horror movie that isn't too scary?", #AddDetails, #OTH
    "I watched 'The Dark Knight' and it was amazing. Thanks!", #GiveFeedback
    "The movie you suggested was great! Any more like that?", #GiveFeedback
    "I didn't really enjoy 'The Notebook'.", #GiveFeedback
    "That sci-fi movie was too complicated for me.", #GiveFeedback
    "Thanks for recommending 'Toy Story'. The kids loved it.", #GiveFeedback
    "What's your favorite movie?", #OTH
    "Have you seen any good movies lately?", #OTH
    "I'm bored. Any movie suggestions?", #OTH
    "Do you know any movies that are similar to 'Pulp Fiction'?", #OTH
    "What was the last movie you watched?", #OTH
    "Can you recommend a thriller? I loved 'Se7en'.", #AskForRec + AddDetails
    "What’s a good romantic comedy? Something like 'When Harry Met Sally'.", #AskForRec + AddDetails
    "I’m looking for a good animated film. Any suggestions?", #AskForRec + AddDetails
    "I want to watch a classic movie. Any recommendations from the 80s?", #AskForRec + AddDetails
    "Can you suggest a good drama? I really enjoyed 'The Shawshank Redemption'.", #AskForRec + AddDetails
    "Can you recommend another action movie? I loved 'Die Hard'."
    "The horror movie you suggested was perfect. Any more like that?", #AddDetails + GiveFeedback
    "I enjoyed the last comedy you recommended. Can you suggest another one?", #AddDetails + GiveFeedback
    "That drama was great. Any similar movies?", #AddDetails + GiveFeedback
    "The animated film was a hit. Any more suggestions?", #AddDetails + GiveFeedback
    "I loved 'Inception'. Can you recommend another mind-bending thriller?", #AskForRec + AddDetails + GiveFeedback
    "The comedy you suggested was hilarious. Any more funny movies like 'Superbad'?", #AskForRec + AddDetails + GiveFeedback
    "I really enjoyed 'The Matrix'. Can you suggest another sci-fi movie with a lot of action?", #AskForRec + AddDetails + GiveFeedback
    "The drama was touching. Can you recommend another emotional movie like 'A Beautiful Mind'?", #AskForRec + AddDetails + GiveFeedback
    "I liked 'Finding Nemo'. Can you suggest another great animated film?" #AskForRec + AddDetails + GiveFeedback
]


for text in texts:
    intents = predict_intent(text)
    print(f"Text: {text}\nPredicted Intents: {intents}\n")


Text: Can you recommend a good movie to watch tonight?
Predicted Intents: ['AskForRec', 'OTH']

Text: What movie would you suggest for a fun night in?
Predicted Intents: ['AskForRec']

Text: I'm looking for a new movie to watch. Any suggestions?
Predicted Intents: ['AskForRec']

Text: Can you help me find a great action movie?
Predicted Intents: ['AskForRec']

Text: I need a good movie to watch with my friends. Any ideas?
Predicted Intents: ['AskForRec']

Text: I'm in the mood for a comedy.
Predicted Intents: ['AskForRec']

Text: I loved 'Inception'. Any similar movies you can recommend?
Predicted Intents: ['AskForRec']

Text: What's a good family movie for us to watch together?
Predicted Intents: []

Text: I prefer movies with a strong female lead.
Predicted Intents: []

Text: Can you suggest a horror movie that isn't too scary?
Predicted Intents: ['AskForRec']

Text: I watched 'The Dark Knight' and it was amazing. Thanks!
Predicted Intents: ['GiveFeedback', 'OTH']

Text: The movie yo