# Mount Google - Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Inspecting the dataset

In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/train_set.tsv', sep='\t')

# Display the first few rows of the dataset to understand its structure
print(data.head())


                      dialog_id  utt_id      speaker  turn_id  \
0  20191127-210600_875_live.pkl       1  RECOMMENDER        1   
1  20191127-210600_875_live.pkl       2  RECOMMENDER        1   
2  20191127-210600_875_live.pkl       3       SEEKER        1   
3  20191127-210600_875_live.pkl       4       SEEKER        1   
4  20191127-210600_875_live.pkl       5  RECOMMENDER        2   

                                                text  \
0                                          Hi There!   
1         What types of movies do you like to watch?   
2                                             Hello!   
3  I'm more of an action movie or a good romance ...   
4  I just saw the trailer for Knives Out when I w...   

                               text_with_placeholder             movies  \
0                                          Hi There!                NaN   
1         What types of movies do you like to watch?                NaN   
2                                             H

# Cleaning and Bio - Tagging the updated dataset

In [None]:
import pandas as pd
import re
import ast

# Load the updated dataset
data = pd.read_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/updated_train_set.tsv', sep='\t')

# Preprocess the text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s\[\]_]', '', text)
    return text

data['cleaned_text'] = data['text'].fillna('').apply(preprocess_text)
data['cleaned_text_with_placeholder'] = data['text_with_placeholder'].fillna('').apply(preprocess_text)

# Function to safely parse dictionary strings
def safe_literal_eval(value):
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return {}

# Function to replace placeholders with actual values
def replace_placeholders(row):
    text = row['cleaned_text_with_placeholder']

    movie_dict = safe_literal_eval(row['movie_dict']) if pd.notna(row['movie_dict']) else {}
    genre_dict = safe_literal_eval(row['genre_dict']) if pd.notna(row['genre_dict']) else {}
    actor_dict = safe_literal_eval(row['actor_dict']) if pd.notna(row['actor_dict']) else {}
    director_dict = safe_literal_eval(row['director_dict']) if pd.notna(row['director_dict']) else {}

    # Replace movie titles
    for placeholder, actual in movie_dict.items():
        text = text.replace(f'[movie_title_{actual}]', placeholder)
    # Replace genres
    for placeholder, actual in genre_dict.items():
        text = text.replace(f'[movie_genre_{actual}]', placeholder)
    # Replace actors
    for placeholder, actual in actor_dict.items():
        text = text.replace(f'[movie_p_actor_{actual}]', placeholder)
    # Replace directors
    for placeholder, actual in director_dict.items():
        text = text.replace(f'[movie_p_director_{actual}]', placeholder)

    return text, actor_dict, director_dict

# Apply placeholder replacement and extract dictionaries
data[['replaced_text', 'actor_dict', 'director_dict']] = data.apply(lambda row: pd.Series(replace_placeholders(row)), axis=1)

# Function to convert text with replaced placeholders to BIO format
def convert_to_bio(text, movies, genres, people_names, actor_dict, director_dict):
    words = text.split()
    tags = ['O'] * len(words)

    def apply_bio_tags(entity, tag_prefix):
        entity_words = entity.split()
        for i in range(len(words)):
            if words[i:i+len(entity_words)] == entity_words:
                tags[i] = f'B-{tag_prefix}'
                for j in range(1, len(entity_words)):
                    tags[i+j] = f'I-{tag_prefix}'

    # Apply BIO tags for each entity type
    if pd.notna(movies):
        for movie in movies.split(';'):
            movie = movie.strip()
            apply_bio_tags(movie, 'MOVIE')

    if pd.notna(genres):
        for genre in genres.split(';'):
            genre = genre.strip()
            apply_bio_tags(genre, 'GENRE')

    if pd.notna(people_names):
        for person in people_names.split(';'):
            person = person.strip()
            if person in actor_dict:
                apply_bio_tags(person, 'ACTOR')
            elif person in director_dict:
                apply_bio_tags(person, 'DIRECTOR')
            else:
                apply_bio_tags(person, 'PERSON')

    return list(zip(words, tags))

# Apply the function to the dataset
data['bio_tags'] = data.apply(lambda row: convert_to_bio(row['replaced_text'], row['movies'], row['genres'], row['people_names'], row['actor_dict'], row['director_dict']), axis=1)

# Display the first few rows with BIO tags
print(data[['replaced_text', 'bio_tags']].head())

# Save the data to a CSV file
data[['replaced_text', 'bio_tags']].to_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/prepared_data_revised_new.csv', index=False)


  data = pd.read_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/updated_train_set.tsv', sep='\t')


                                       replaced_text  \
0                                           hi there   
1          what types of movies do you like to watch   
2                                              hello   
3  im more of an action movie or a good romance a...   
4  i just saw the trailer for Knives Out (2019) w...   

                                            bio_tags  
0                              [(hi, O), (there, O)]  
1  [(what, O), (types, O), (of, O), (movies, O), ...  
2                                       [(hello, O)]  
3  [(im, O), (more, O), (of, O), (an, O), (action...  
4  [(i, O), (just, O), (saw, O), (the, O), (trail...  


# Preparation of the training dataset

In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import ast

# Load the prepared training data
train_data = pd.read_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/prepared_data_revised_new.csv')

# Prepare the sentences and labels
def prepare_data(data):
    sentences = []
    labels = []
    for _, row in data.iterrows():
        try:
            words, tags = zip(*ast.literal_eval(row['bio_tags']))  # Using ast.literal_eval instead of eval for safety
            sentences.append(list(words))
            labels.append(list(tags))
        except ValueError:
            print(f"Skipping row {row.name} due to incorrect formatting in bio_tags")
            continue
    return sentences, labels

train_sentences, train_labels = prepare_data(train_data)

# Check the lengths of sentences and labels
print(f"Number of sentences: {len(train_sentences)}")
print(f"Number of labels: {len(train_labels)}")

# Create a vocabulary and tag index
words = list(set([word for sentence in train_sentences for word in sentence]))
words.append("ENDPAD")
n_words = len(words)
tags = list(set(tag for label in train_labels for tag in label))
n_tags = len(tags)

# Create word and tag indices
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

# Convert the sentences and labels to sequences
X_train = [[word2idx[w] for w in s] for s in train_sentences]
y_train = [[tag2idx[t] for t in s] for s in train_labels]

# Pad the sequences
max_len = 50
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
y_train = pad_sequences(y_train, maxlen=max_len, padding='post')

# Convert labels to categorical
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

print("Data preparation is complete.")


Skipping row 24 due to incorrect formatting in bio_tags
Skipping row 25 due to incorrect formatting in bio_tags
Skipping row 715 due to incorrect formatting in bio_tags
Skipping row 1084 due to incorrect formatting in bio_tags
Skipping row 1956 due to incorrect formatting in bio_tags
Skipping row 1957 due to incorrect formatting in bio_tags
Skipping row 2135 due to incorrect formatting in bio_tags
Skipping row 3000 due to incorrect formatting in bio_tags
Skipping row 4145 due to incorrect formatting in bio_tags
Skipping row 4146 due to incorrect formatting in bio_tags
Skipping row 4154 due to incorrect formatting in bio_tags
Skipping row 4155 due to incorrect formatting in bio_tags
Skipping row 4156 due to incorrect formatting in bio_tags
Skipping row 4183 due to incorrect formatting in bio_tags
Skipping row 6024 due to incorrect formatting in bio_tags
Skipping row 7174 due to incorrect formatting in bio_tags
Skipping row 7177 due to incorrect formatting in bio_tags
Skipping row 7665 d

# Preparation of testing dataset

In [None]:
import pandas as pd
import re
import ast

# Load the updated dataset
test_data = pd.read_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/test_set.tsv', sep='\t')

# Preprocess the text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s\[\]_]', '', text)
    return text

test_data['cleaned_text'] = test_data['text'].fillna('').apply(preprocess_text)
test_data['cleaned_text_with_placeholder'] = test_data['text_with_placeholder'].fillna('').apply(preprocess_text)

# Function to safely parse dictionary strings
def safe_literal_eval(value):
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return {}

# Function to replace placeholders with actual values
def replace_placeholders(row):
    text = row['cleaned_text_with_placeholder']

    movie_dict = safe_literal_eval(row['movie_dict']) if pd.notna(row['movie_dict']) else {}
    genre_dict = safe_literal_eval(row['genre_dict']) if pd.notna(row['genre_dict']) else {}
    actor_dict = safe_literal_eval(row['actor_dict']) if pd.notna(row['actor_dict']) else {}
    director_dict = safe_literal_eval(row['director_dict']) if pd.notna(row['director_dict']) else {}

    # Replace movie titles
    for placeholder, actual in movie_dict.items():
        text = text.replace(f'[movie_title_{actual}]', placeholder)
    # Replace genres
    for placeholder, actual in genre_dict.items():
        text = text.replace(f'[movie_genre_{actual}]', placeholder)
    # Replace actors
    for placeholder, actual in actor_dict.items():
        text = text.replace(f'[movie_p_actor_{actual}]', placeholder)
    # Replace directors
    for placeholder, actual in director_dict.items():
        text = text.replace(f'[movie_p_director_{actual}]', placeholder)

    return text, actor_dict, director_dict

# Apply placeholder replacement and extract dictionaries
test_data[['replaced_text', 'actor_dict', 'director_dict']] = test_data.apply(lambda row: pd.Series(replace_placeholders(row)), axis=1)

# Function to convert text with replaced placeholders to BIO format
def convert_to_bio(text, movies, genres, people_names, actor_dict, director_dict):
    words = text.split()
    tags = ['O'] * len(words)

    def apply_bio_tags(entity, tag_prefix):
        entity_words = entity.split()
        for i in range(len(words)):
            if words[i:i+len(entity_words)] == entity_words:
                tags[i] = f'B-{tag_prefix}'
                for j in range(1, len(entity_words)):
                    tags[i+j] = f'I-{tag_prefix}'

    # Apply BIO tags for each entity type
    if pd.notna(movies):
        for movie in movies.split(';'):
            movie = movie.strip()
            apply_bio_tags(movie, 'MOVIE')

    if pd.notna(genres):
        for genre in genres.split(';'):
            genre = genre.strip()
            apply_bio_tags(genre, 'GENRE')

    if pd.notna(people_names):
        for person in people_names.split(';'):
            person = person.strip()
            if person in actor_dict:
                apply_bio_tags(person, 'ACTOR')
            elif person in director_dict:
                apply_bio_tags(person, 'DIRECTOR')
            else:
                apply_bio_tags(person, 'PERSON')

    return list(zip(words, tags))

# Apply the function to the dataset
test_data['bio_tags'] = test_data.apply(lambda row: convert_to_bio(row['replaced_text'], row['movies'], row['genres'], row['people_names'], row['actor_dict'], row['director_dict']), axis=1)

# Display the first few rows with BIO tags
print(test_data[['replaced_text', 'bio_tags']].head())

# Save the data to a CSV file
test_data[['replaced_text', 'bio_tags']].to_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/test_prepared_data_revised_new.csv', index=False)


                       replaced_text  \
0                                 hi   
1  im here to help you chose a movie   
2                           terrific   
3      what are some genres you like   
4    what was the last movie you saw   

                                            bio_tags  
0                                          [(hi, O)]  
1  [(im, O), (here, O), (to, O), (help, O), (you,...  
2                                    [(terrific, O)]  
3  [(what, O), (are, O), (some, O), (genres, O), ...  
4  [(what, O), (was, O), (the, O), (last, O), (mo...  


In [None]:
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import ast

# Load the prepared test dataset
test_data = pd.read_csv('/content/drive/MyDrive/CRS/Dataset/INSPIRED2/Dataset/tsv/test_prepared_data_revised_new.csv')

# Function to safely parse BIO tags
def parse_bio_tags(bio_tags):
    try:
        return ast.literal_eval(bio_tags)
    except (ValueError, SyntaxError):
        return []

# Parse the BIO tags
test_data['bio_tags'] = test_data['bio_tags'].apply(parse_bio_tags)


In [None]:
# Function to prepare data for model input
def prepare_data(data):
    sentences = []
    labels = []
    for _, row in data.iterrows():
        try:
            words, tags = zip(*row['bio_tags'])
            sentences.append(list(words))
            labels.append(list(tags))
        except ValueError:
            print(f"Skipping row {row.name} due to incorrect formatting in bio_tags: {row['bio_tags']}")
            continue
    return sentences, labels

# Prepare the test sentences and labels
test_sentences, test_labels = prepare_data(test_data)

# Assuming you have word2idx and tag2idx from your training phase
# Convert the sentences and labels to sequences
X_test = [[word2idx.get(w, word2idx["ENDPAD"]) for w in s] for s in test_sentences]
y_test = [[tag2idx.get(t, tag2idx["O"]) for t in s] for s in test_labels]

# Pad the sequences
max_len = 50  # Replace with your model's max_len
n_tags = len(tag2idx)  # Replace with the number of tags
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')
y_test = pad_sequences(y_test, maxlen=max_len, padding='post')

# Convert labels to categorical
y_test = [to_categorical(i, num_classes=n_tags) for i in y_test]


Skipping row 600 due to incorrect formatting in bio_tags: []
Skipping row 601 due to incorrect formatting in bio_tags: []
Skipping row 662 due to incorrect formatting in bio_tags: []
Skipping row 920 due to incorrect formatting in bio_tags: []
Skipping row 1108 due to incorrect formatting in bio_tags: []
Skipping row 1243 due to incorrect formatting in bio_tags: []
Skipping row 1244 due to incorrect formatting in bio_tags: []
Skipping row 1551 due to incorrect formatting in bio_tags: []
Skipping row 1552 due to incorrect formatting in bio_tags: []
Skipping row 2244 due to incorrect formatting in bio_tags: []
Skipping row 2650 due to incorrect formatting in bio_tags: []
Skipping row 3111 due to incorrect formatting in bio_tags: []


In [None]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
import os
os._exit(00)

In [None]:
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch
from torch.utils.data import Dataset

# Define constants
MAX_LEN = 50
BATCH_SIZE = 16

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=n_tags)

class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(sentence, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt', is_split_into_words=True)
        input_ids = encoding['input_ids'].squeeze(0)  # Remove batch dimension
        attention_mask = encoding['attention_mask'].squeeze(0)  # Remove batch dimension

        # Adjust labels to max_length
        label_ids = [tag2idx.get(label, 0) for label in labels]  # Convert labels to integer indices
        label_ids = label_ids + [0] * (self.max_len - len(label_ids))  # Pad labels
        label_ids = label_ids[:self.max_len]

        # Convert labels to tensor
        labels = torch.tensor(label_ids, dtype=torch.long)

        # Return a dictionary with the required keys
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Create datasets
train_dataset = NERDataset(train_sentences, train_labels, tokenizer, MAX_LEN)
test_dataset = NERDataset(test_sentences, test_labels, tokenizer, MAX_LEN)

# Define data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.2691
1000,0.0222
1500,0.0174
2000,0.0142
2500,0.0114
3000,0.0114
3500,0.0099
4000,0.0091
4500,0.0072
5000,0.0076


TrainOutput(global_step=8950, training_loss=0.02319136634219292, metrics={'train_runtime': 1947.51, 'train_samples_per_second': 73.494, 'train_steps_per_second': 4.596, 'total_flos': 3652582805001000.0, 'train_loss': 0.02319136634219292, 'epoch': 5.0})

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.011763021349906921, 'eval_runtime': 12.6466, 'eval_samples_per_second': 270.902, 'eval_steps_per_second': 17.001, 'epoch': 5.0}


In [1]:
import torch

def preprocess_sentence(sentence, tokenizer, max_len):
    # Tokenize and pad the sentence, convert to tensors
    encoding = tokenizer(sentence, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
    input_ids = encoding['input_ids'].squeeze(0)
    attention_mask = encoding['attention_mask'].squeeze(0)
    return input_ids, attention_mask

def predict_entities(sentence, tokenizer, model, max_len):
    # Move model to the same device as the input
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Prepare input tensors
    input_ids, attention_mask = preprocess_sentence(sentence, tokenizer, max_len)

    # Move tensors to the same device as the model
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    with torch.no_grad():
        model.eval()
        outputs = model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        logits = outputs.logits

    # Move logits back to CPU
    logits = logits.cpu()

    predicted_labels = torch.argmax(logits, dim=2).squeeze().tolist()
    return predicted_labels

def decode_predictions(predicted_labels, tokenizer, sentence, tag2idx, id2tag):
    # Tokenize the sentence again to get the correct tokens
    encoding = tokenizer(sentence, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'].squeeze().tolist())

    # Remove special tokens
    tokens = [token for token in tokens if token not in tokenizer.all_special_tokens]

    # Decode labels
    decoded_labels = [id2tag.get(label, 'O') for label in predicted_labels]
    decoded_labels = [label for token, label in zip(tokens, decoded_labels) if token not in tokenizer.all_special_tokens]

    # Format output
    entities = []
    current_entity = []
    current_label = None

    for token, label in zip(tokens, decoded_labels):
        if label.startswith('B-'):
            if current_entity:
                entities.append((' '.join(current_entity), current_label))
            current_entity = [token]
            current_label = label[2:]
        elif label.startswith('I-') and current_label and label[2:] == current_label:
            current_entity.append(token)
        else:
            if current_entity:
                entities.append((' '.join(current_entity), current_label))
                current_entity = []
                current_label = None

    if current_entity:
        entities.append((' '.join(current_entity), current_label))

    return entities

# Example usage
sentence = "I watched an amazing science fiction movie called Dune last night."
predicted_labels = predict_entities(sentence, tokenizer, model, MAX_LEN)
id2tag = {v: k for k, v in tag2idx.items()}
decoded_predictions = decode_predictions(predicted_labels, tokenizer, sentence, id2tag, id2tag)

print("Sentence:", sentence)
print("Entities:", decoded_predictions)


NameError: name 'tokenizer' is not defined

In [None]:
import pickle

# Save the models dictionary to a PKL file
with open('/content/drive/MyDrive/CRS/Saved models/Entity_Extraction.pkl', 'wb') as f:
    pickle.dump(model, f)
