<a href="https://colab.research.google.com/github/XavierXinchi/Slot-Extraction-from-MultiWOZ/blob/main/cluster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/FYP so files kept on Drive are reachable
# through the local filesystem. The relative path used in main()
# ('FYP/MyDrive/...') resolves against this mount point only when the
# working directory is /content -- NOTE(review): confirm the notebook's cwd.
from google.colab import drive
drive.mount('/content/FYP')

In [None]:
!pip install tqdm
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install nltk
!pip install keybert

In [None]:
from collections import defaultdict, Counter
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import csv
import os
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from keybert import KeyBERT
import re

# Number of utterances tokenized and sent through BERT per forward pass.
batch_size = 16

# Pre-trained BERT encoder and its matching tokenizer (Hugging Face hub).
_BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(_BERT_CHECKPOINT)

# English stopwords from NLTK, used to filter the per-cluster word counts.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def read_tsv(file_name):
    """Read a tab-separated file and return its rows.

    Args:
        file_name: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A list with one entry per record; each entry is the list of string
        fields of that record (blank lines come back as empty lists).

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise a "\r\n" embedded in a quoted field
    # is silently rewritten to "\n" by universal-newline translation.
    with open(file_name, 'r', encoding='utf-8', newline='') as tsv_file:
        return list(csv.reader(tsv_file, delimiter='\t'))

def create_dict(data):
    """Group USER utterances by the domains they are tagged with.

    Args:
        data: Iterable of rows shaped
            ``[dialogue_id, speaker, utterance, domain_list]`` where
            ``domain_list`` is the textual form of a list of names, e.g.
            ``"['hotel', 'taxi']"``.

    Returns:
        A ``defaultdict(list)`` mapping each domain name to the utterances
        (in input order) spoken by ``'USER'`` in that domain. An utterance
        tagged with several domains is listed under each of them.

    Raises:
        ValueError: If a non-empty row does not have exactly four fields.
    """
    domain_dict = defaultdict(list)
    for row in data:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # there is nothing to unpack, so skip instead of crashing.
            continue
        _dialogue_id, speaker, utterance, domain_list = row
        if speaker != 'USER':
            continue
        # Turn the stringified list back into names: drop the brackets,
        # split on commas, then trim surrounding whitespace and quotes.
        for raw_domain in domain_list.strip().strip('][').split(','):
            domain = raw_domain.strip().strip("'\"")
            if domain:
                # An empty list ("[]") yields one empty piece; ignoring it
                # avoids creating a bogus '' domain.
                domain_dict[domain].append(utterance)
    return domain_dict
  
# KeyBERT instances keyed by model name. Loading the underlying sentence
# encoder is expensive and extract_key_phrases() is called once per cluster,
# so each model is loaded at most once per process.
_KEYBERT_MODELS = {}


def _get_key_model(model_name='distilbert-base-nli-mean-tokens'):
    """Return the cached KeyBERT instance for *model_name*, loading it once."""
    if model_name not in _KEYBERT_MODELS:
        _KEYBERT_MODELS[model_name] = KeyBERT(model_name)
    return _KEYBERT_MODELS[model_name]


def extract_key_phrases(embeddings, utterances, num_keywords=2, batch_size=16):
    """Extract the top single-word key phrases of every utterance.

    Args:
        embeddings: Unused; accepted only so existing callers keep working.
        utterances: Sequence of utterance strings.
        num_keywords: Maximum number of keywords requested per utterance.
        batch_size: Number of utterances per progress-bar step. Keywords are
            still extracted one utterance at a time, so this only affects
            how progress is reported.

    Returns:
        A list parallel to ``utterances``; each item is the list of keyword
        strings KeyBERT returned for that utterance.
    """
    num_utterances = len(utterances)
    if num_utterances == 0:
        # Nothing to extract; avoid loading the model for an empty cluster.
        return []

    key_model = _get_key_model()
    key_phrases = []

    # Chunk the utterances so the progress bar advances once per batch.
    utterance_batches = [
        utterances[start:start + batch_size]
        for start in range(0, num_utterances, batch_size)
    ]

    for batch in tqdm(utterance_batches, desc="Extracting key phrases"):
        for utterance in batch:
            keywords = key_model.extract_keywords(
                utterance,
                keyphrase_ngram_range=(1, 1),
                stop_words=None,
                use_maxsum=True,
                nr_candidates=20,
                top_n=num_keywords,
            )
            # Each result is a (keyword, score) pair; keep only the keyword.
            key_phrases.append([kw[0] for kw in keywords])

    return key_phrases

def generate_slot_names(clustered_domains, output_file):
    """Write one line of candidate slot names per domain to *output_file*.

    For every cluster of a domain, key phrases are extracted from its
    utterances and ranked by how often they occur; the single most frequent
    phrase is dropped and the rest are joined with spaces. The per-cluster
    strings are joined with spaces and written as ``<domain>\\t<names>``.

    Args:
        clustered_domains: Mapping of domain -> mapping of cluster label ->
            dict holding at least ``'embeddings'`` and ``'utterances'``.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for domain, clusters in clustered_domains.items():
            per_cluster_names = []
            for info in clusters.values():
                phrases_per_utterance = extract_key_phrases(
                    info['embeddings'], info['utterances'])

                # Tally every phrase across the whole cluster.
                tally = Counter(
                    phrase
                    for phrases in phrases_per_utterance
                    for phrase in phrases
                )

                # most_common() ranks by descending count, ties in
                # first-seen order; skip the top-ranked phrase.
                ranked = tally.most_common()
                per_cluster_names.append(
                    ' '.join(phrase for phrase, _count in ranked[1:]))

            out.write(f"{domain}\t{' '.join(per_cluster_names)}\n")

# Matches runs of ASCII letters; used to pull words out of an utterance.
_WORD_PATTERN = re.compile(r'\b[a-zA-Z]+\b')


def _embed_utterances(utterances, device):
    """Return one embedding row per utterance as a 2-D NumPy array.

    Utterances are tokenized and encoded ``batch_size`` at a time with the
    module-level ``tokenizer`` and ``model``; the hidden state of the first
    token of each sequence (the [CLS] position for BERT) is the embedding.
    """
    rows = []
    for start in range(0, len(utterances), batch_size):
        batch = utterances[start:start + batch_size]
        encoded = tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt")
        # Inputs must live on the same device as the model.
        encoded = {key: value.to(device) for key, value in encoded.items()}
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        rows.extend(hidden[:, 0, :].cpu().numpy())
    return np.array(rows)


def _word_distribution(utterances):
    """Return ``(distribution, total_words)`` for the given utterances.

    ``distribution`` maps each lower-cased non-stopword to its relative
    frequency, ordered from most to least frequent; ``total_words`` is the
    number of non-stopword tokens counted.
    """
    counts = Counter(
        word
        for utterance in utterances
        for word in (match.lower() for match in _WORD_PATTERN.findall(utterance))
        if word not in stop_words
    )
    total_words = sum(counts.values())
    # counts is empty whenever total_words is 0, so no division by zero.
    distribution = {
        word: count / total_words for word, count in counts.most_common()
    }
    return distribution, total_words


def cluster_utterances(domain_dict):
    """Cluster each domain's utterances with K-Means over BERT embeddings.

    Args:
        domain_dict: Mapping of domain name -> list of utterance strings.

    Returns:
        Dict mapping each domain to a ``defaultdict`` keyed by cluster label.
        Every cluster entry is a dict with:
            ``'utterances'``: the utterances assigned to the cluster,
            ``'embeddings'``: their embedding vectors (same order),
            ``'word_distribution'``: word -> relative frequency among the
                cluster's non-stopwords, most frequent first,
            ``'total_words'``: number of non-stopword tokens counted.
        A domain with no utterances maps to an empty ``defaultdict``.
    """
    clustered_domains = {}

    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for domain, utterances in tqdm(domain_dict.items(), desc="Clustering domains"):
        clusters = defaultdict(lambda: {'utterances': [], 'embeddings': []})
        clustered_domains[domain] = clusters

        if not utterances:
            # Nothing to embed or cluster for this domain.
            continue

        embeddings = _embed_utterances(utterances, device)

        # K-Means needs at least as many samples as clusters, so a domain
        # with a single utterance is put into one cluster instead of failing.
        n_clusters = min(2, len(utterances))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(embeddings)

        for utterance, embedding, label in zip(utterances, embeddings, labels):
            clusters[label]['utterances'].append(utterance)
            clusters[label]['embeddings'].append(embedding)

        # Attach the word statistics of every cluster in this domain.
        for cluster_info in clusters.values():
            distribution, total_words = _word_distribution(
                cluster_info['utterances'])
            cluster_info['word_distribution'] = distribution
            cluster_info['total_words'] = total_words

    return clustered_domains


def _print_clustering_results(clustered_domains):
    """Print each cluster's word total, utterances and word distribution."""
    for domain, clusters in clustered_domains.items():
        print(f"Domain: {domain}")
        for label, info in clusters.items():
            print(f"\tCluster {label}:")
            print(f"\tTotal Words: {info['total_words']}")
            print("\tUtterances:")
            for text in info['utterances']:
                print(f"\t\t{text}")
            print("\tWord Distribution:")
            for word, share in info['word_distribution'].items():
                print(f"\t\t{word}: {share}")


def main():
    """Run the pipeline: load the TSV, cluster, write slot names, print."""
    rows = read_tsv('FYP/MyDrive/Colab Notebooks/FYP/all_dialogues_utterances.tsv')
    clustered_domains = cluster_utterances(create_dict(rows))
    generate_slot_names(clustered_domains, "slot_names.tsv")
    _print_clustering_results(clustered_domains)


if __name__ == '__main__':
    main()