In [1]:
""" Importing relevant packages """
import os # For finding pre-processed data
from pathlib import Path

import pandas as pd # For data handling
import numpy as np

import nltk #  For nlp processing
from sklearn.feature_extraction.text import TfidfVectorizer # For obtaining Tf-Idf tokenization

import gensim # For building and fine-tuning Word2Vec model
from gensim.models import Word2Vec
import gensim.downloader as api # Helpful for downloading pre-trained models

""" Loading the pre-processed data """

DirPpath = Path(os.path.abspath('')).parent # Fetching the current directory path
PledgesCsvPath = str(DirPpath.absolute()) + "\OutputFiles\PreprocessedData.csv" 

print("The current location of PreprocessedData.csv is: ", PledgesCsvPath)

PledgesDf = pd.read_csv(PledgesCsvPath, index_col=0) # Loading the preprocessed pledges into a dataframe

print(PledgesDf.head()) # Controlling the data loaded


""" Tokenize the pledges on words """

documents = [i for i in PledgesDf["PreProcessedText"]]
length = max([len(nltk.word_tokenize(i)) for i in documents])

The current location of PreprocessedData.csv is:  c:\Users\ecaudron001\Documents\GitHub\LLM-for-Tourism\Clustering\OutputFiles\PreprocessedData.csv
   Topic                                             Pledge  \
0      1  Actually we as an association are still pretty...   
1      1  EFFAT welcomes the Commission Proposal for a R...   
2      1  HOTREC calls for a level playing field and fai...   
3      1  Estonia sees the need to synchronize and harmo...   
4      1  Sphere Travel Club contributes to a flourishin...   

                                    PreProcessedText  
0  actually we as an association are still pretty...  
1  effat welcomes the commission proposal for a r...  
2  hotrec calls for a level playing field and fai...  
3  estonia sees the need to synchronize and harmo...  
4  sphere travel club contributes to a flourishin...  


In [2]:
import tensorflow
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer, BertTokenizerFast, BertConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import logging

import matplotlib.pyplot as plt

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#tokenizer = AutoTokenizer.from_pretrained("veroman/TourBERT",do_lowercase=True, truncation=True, padding="max_length",model_max_length=128)
tokenizer = BertTokenizerFast.from_pretrained('c:\\Users\\ecaudron001\\Documents\\GitHub\\LLM-for-Tourism\\Clustering\\2.1. WordEmbedding\\model\\', config=BertConfig())
tokenizer.model_max_length = 512
tokenizer.init_kwargs['model_max_length'] = 512

#pre-trained model tokenizer

In [4]:
# Load pre-trained model (weights)
# model = BertModel.from_pretrained('bert-base-uncased',
#                                   output_hidden_states = True, # Whether the model returns all hidden-states.
#                                   )
#model = AutoModel.from_pretrained("veroman/TourBERT")
model = AutoModel.from_pretrained(f'model/',
                                   output_hidden_states = True, # Whether the model returns all hidden-states.
                                   )


# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of BertModel were not initialized from the model checkpoint at model/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [5]:
import torch

def DocEmbedding(tokenized_text):
    
    padded_tokens = tokenized_text + ['[PAD]' for _ in range(512-len(tokenized_text))]

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(padded_tokens)

    # Display the words with their indeces.
    for tup in zip(tokenized_text, indexed_tokens):
            print('{:<12} {:>6,}'.format(tup[0], tup[1]))

    #Attention mask
    attn_mask = [ 1 if token != '[PAD]' else 0 for token in padded_tokens  ]
    
    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(attn_mask)
    print (segments_ids)

    
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    attention_tensors = torch.tensor([attn_mask])

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

            outputs = model(tokens_tensor, attention_mask = attention_tensors, token_type_ids = segments_tensors)

            # Evaluating the model will return a different number of objects based on 
            # how it's  configured in the `from_pretrained` call earlier. In this case, 
            # becase we set `output_hidden_states = True`, the third item will be the 
            # hidden states from all layers. See the documentation for more details:
            # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
            hidden_states = outputs[2]


            # Getting document vectors by averaging the second to last hidden layer of each token
            # `hidden_states` has shape [13 x 1 x 22 x 768]

            # `token_vecs` is a tensor with shape [22 x 768]
            token_vecs = hidden_states[-2][0]

            # Calculate the average of all 22 token vectors.
            document_embedding = torch.mean(token_vecs, dim=0)
            #document_embedding = outputs[1]

    return document_embedding

In [6]:
import torch

pledgeEmbedding = []

for doc in documents:
       text = doc

        # Add the special tokens.
       marked_text = "[CLS] " + text + " [SEP]"
       
       # Split the sentence into tokens.
       tokenized_text = tokenizer.tokenize(marked_text)
       

       if len(tokenized_text) > 512:
              #print(len(tokenized_text))
              print("Too long")

              sentences = nltk.sent_tokenize(text)

              text1 = ""
              sentEmbedding = []

              for sent in sentences: 
                     
                     marked_text = "[CLS] " + text1 + sent + " [SEP]"
       
                     # Split the sentence into tokens.
                     tokenized_text = tokenizer.tokenize(marked_text)
                     
                     if len(tokenized_text) < 512:
                            text1 = text1 + sent
                     else:
                            marked_text = "[CLS] " + text1 + " [SEP]"       
                            # Split the sentence into tokens.
                            tokenized_text = tokenizer.tokenize(marked_text)
                            sentEmbedding.append(DocEmbedding(tokenized_text))
                            print(sentEmbedding)                  
                            
                            text1 = ""
              
              document_embedding = torch.mean(torch.cat(tuple(sentEmbedding)).view(len(sentEmbedding),768), dim = 0)
              print("Doc embedding")
              print(document_embedding)

       else:              
              document_embedding = DocEmbedding(tokenized_text)
              #document_embedding =torch.mean(torch.cat(tuple(document_embedding)).view(len(document_embedding),768), dim = 0) # For tourbert
              print("Doc embedding")
              print(document_embedding)


       pledgeEmbedding.append(document_embedding.tolist())




[CLS]             2
actually      6,795
we              597
as              176
an              213
association   3,365
are             216
still         2,246
pret         13,387
##ty          1,206
much          2,517
at              278
the             114
beginning     7,168
due           1,449
to              128
the             114
pandemic      1,222
which           418
took          5,340
the             114
better        1,109
part            425
of              125
our           1,164
ressources    9,440
.                18
what          1,636
we              597
want          4,697
to              128
provide         777
is              204
a                43
proper        2,408
guideline    20,423
for             146
str             320
how             600
to              128
achieve       1,478
,                16
maintain      2,976
and             122
develop         345
a                43
sustainable     441
business        467
.                18
additionally  5,022


Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


Doc embedding
tensor([-2.1550e-01,  1.3743e-01, -7.9062e-01, -2.1341e-01,  1.5048e+00,
        -3.6407e-01, -1.6008e+00,  9.3511e-02, -5.4953e-01,  9.4383e-02,
        -5.3042e-01, -8.3432e-02,  7.6907e-02, -1.7793e-01,  2.2078e-01,
         1.1948e+00,  9.3693e-02,  4.7846e-01, -6.8313e-01,  3.1196e-01,
         7.5232e-01, -6.6909e-01,  1.7318e-03, -1.6810e+00,  2.5676e-01,
         5.4668e-01, -3.4830e-01, -1.7489e-01, -1.3195e-01,  1.3075e-01,
        -3.6680e-02,  1.2155e+00, -1.1881e-01,  1.5849e-01, -1.2448e+00,
        -2.0284e-01, -1.7046e+00,  1.4456e+00,  5.4782e-02,  3.5956e-01,
         7.6276e-01, -2.8520e-01, -3.2197e-01, -1.1183e+00,  7.6410e-01,
        -2.0910e+00, -1.0633e+00, -1.2839e+00, -1.3242e+00, -3.5295e-01,
         3.4399e-01,  9.9981e-01,  3.6821e-01, -1.1252e+00, -1.3495e-01,
        -1.8182e-01,  5.2518e-01,  3.9303e-01, -1.0080e+00,  9.2909e-01,
        -9.7042e-01, -4.5701e-01,  6.4220e-01, -1.8043e-03, -1.3361e-01,
        -1.2415e-02,  7.4299e-02,  1.

https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

In [7]:
DocIndexV1 = pd.DataFrame(pledgeEmbedding)# Outputting the indexed pledges file

#IndexedPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV1.csv"
#IndexedPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV2.csv"
IndexedPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV3.csv"
DocIndexV1.to_csv(IndexedPath)

In [56]:
len(pledgeEmbedding[1])

768

In [107]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [115]:
pledgeEmbedding = []

for doc in documents:
       text = doc

        # Add the special tokens.
       marked_text = "[CLS] " + text + " [SEP]"
       
       # Split the sentence into tokens.
       tokenized_text = tokenizer.tokenize(marked_text)
       

       if len(tokenized_text) > 512:
              #print(len(tokenized_text))
              print("Too long")

              sentences = nltk.sent_tokenize(text)

              text1 = ""
              sentEmbedding = []

              for sent in sentences: 
                     
                     marked_text = "[CLS] " + text1 + sent + " [SEP]"
       
                     # Split the sentence into tokens.
                     tokenized_text = tokenizer.tokenize(marked_text)
                     
                     if len(tokenized_text) < 512:
                            text1 = text1 + sent
                     else:
                            marked_text = text1       
                            sentEmbedding.append(torch.from_numpy(model.encode(marked_text)))                  
                            
                            text1 = ""
              
              document_embedding = torch.mean(torch.cat(tuple(sentEmbedding)).view(len(sentEmbedding),256), dim = 0)

       else:              
              document_embedding = model.encode(text)

       pledgeEmbedding.append(document_embedding.tolist())

Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long


In [113]:
model.encode("Hello world")

256

In [118]:
DocIndexV1 = pd.DataFrame(pledgeEmbedding)# Outputting the indexed pledges file

IndexedPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV1.csv"
DocIndexV1.to_csv(IndexedPath)

In [169]:
""" Loading the pre-processed data """

DirPpath = Path(os.path.abspath('')).parent # Fetching the current directory path
PledgesCsvPath = str(DirPpath.absolute()) + "\OutputFiles\Clusters.xlsx" 

print("The current location of PreprocessedData.csv is: ", PledgesCsvPath)

PledgesDf = pd.read_excel(PledgesCsvPath, index_col=0) # Loading the preprocessed pledges into a dataframe

print(PledgesDf.head()) # Controlling the data loaded


PledgesCsvPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV1.csv" 

print("The current location of PreprocessedData.csv is: ", PledgesCsvPath)
data = pd.read_csv(PledgesCsvPath, index_col=0)

data["Prediction"] = PledgesDf["Cluster"]



The current location of PreprocessedData.csv is:  c:\Users\ecaudron001\Documents\GitHub\LLM-for-Tourism\Clustering\OutputFiles\Clusters.xlsx
                                              Pledge  \
0  Actually we as an association are still pretty...   
1  EFFAT welcomes the Commission Proposal for a R...   
2  HOTREC calls for a level playing field and fai...   
3  Estonia sees the need to synchronize and harmo...   
4  Sphere Travel Club contributes to a flourishin...   

                                    PreProcessedText  Cluster  Topics  \
0  actually we as an association are still pretty...        5       1   
1  effat welcomes the commission proposal for a r...        2       1   
2  hotrec calls for a level playing field and fai...        2       1   
3  estonia sees the need to synchronize and harmo...        3       1   
4  sphere travel club contributes to a flourishin...        5       1   

                  Area  
0  Policy & regulation  
1  Policy & regulation  
2  Polic

In [170]:
# -*- coding: utf-8 -*-
__author__ = "Joaquim Viegas"

""" JQM_CV - Python implementations of Dunn and Davis Bouldin clustering validity indices

dunn(k_list):
    Slow implementation of Dunn index that depends on numpy
    -- basec.pyx Cython implementation is much faster but flower than dunn_fast()
dunn_fast(points, labels):
    Fast implementation of Dunn index that depends on numpy and sklearn.pairwise
    -- No Cython implementation
davisbouldin(k_list, k_centers):
    Implementation of Davis Boulding index that depends on numpy
    -- basec.pyx Cython implementation is much faster
"""

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def delta(ck, cl):
    values = np.ones([len(ck), len(cl)])*10000
    
    for i in range(0, len(ck)):
        for j in range(0, len(cl)):
            values[i, j] = np.linalg.norm(ck[i]-cl[j])
            if values[i,j] == 0:
                print(i)
                print(j)
            
    return np.max(values)
    
def big_delta(ci):
    values = np.zeros([len(ci), len(ci)])
    
    for i in range(0, len(ci)):
        for j in range(0, len(ci)):
            values[i, j] = np.linalg.norm(ci[i]-ci[j])
            
    return np.max(values)
    
def dunn(k_list):
    """ Dunn index [CVI]
    
    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    """
    deltas = np.ones([len(k_list), len(k_list)])*1000000
    big_deltas = np.zeros([len(k_list), 1])
    l_range = list(range(0, len(k_list)))
    
    for k in l_range:
        for l in (l_range[0:k]+l_range[k+1:]):
            deltas[k, l] = delta(k_list[k], k_list[l])
            #print(deltas[k, l])
        
        big_deltas[k] = big_delta(k_list[k])

    di = np.min(deltas)/np.max(big_deltas)
    return di

def delta_fast(ck, cl, distances):
    values = distances[np.where(ck)][:, np.where(cl)]
    values = values[np.nonzero(values)]

    return np.min(values)
    
def big_delta_fast(ci, distances):
    values = distances[np.where(ci)][:, np.where(ci)]
    #values = values[np.nonzero(values)]
            
    return np.max(values)

def dunn_fast(points, labels):
    """ Dunn index - FAST (using sklearn pairwise euclidean_distance function)
    
    Parameters
    ----------
    points : np.array
        np.array([N, p]) of all points
    labels: np.array
        np.array([N]) labels of all points
    """
    distances = euclidean_distances(points)
    ks = np.sort(np.unique(labels))
    
    deltas = np.ones([len(ks), len(ks)])*1000000
    big_deltas = np.zeros([len(ks), 1])
    
    l_range = list(range(0, len(ks)))
    
    for k in l_range:
        for l in (l_range[0:k]+l_range[k+1:]):
            deltas[k, l] = delta_fast((labels == ks[k]), (labels == ks[l]), distances)
        
        big_deltas[k] = big_delta_fast((labels == ks[k]), distances)

    di = np.min(deltas)/np.max(big_deltas)
    return di
    
    
def  big_s(x, center):
    len_x = len(x)
    total = 0
        
    for i in range(len_x):
        total += np.linalg.norm(x[i]-center)    
    
    return total/len_x

def davisbouldin(k_list, k_centers):
    """ Davis Bouldin Index
    
    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    k_centers : np.array
        The array of the cluster centers (prototypes) of type np.array([K, p])
    """
    len_k_list = len(k_list)
    big_ss = np.zeros([len_k_list], dtype=np.float64)
    d_eucs = np.zeros([len_k_list, len_k_list], dtype=np.float64)
    db = 0    

    for k in range(len_k_list):
        big_ss[k] = big_s(k_list[k], k_centers[k])

    for k in range(len_k_list):
        for l in range(0, len_k_list):
            d_eucs[k, l] = np.linalg.norm(k_centers[k]-k_centers[l])

    for k in range(len_k_list):
        values = np.zeros([len_k_list-1], dtype=np.float64)
        for l in range(0, k):
            values[l] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
        for l in range(k+1, len_k_list):
            values[l-1] = (big_ss[k] + big_ss[l])/d_eucs[k, l]

        db += np.max(values)
    res = db/len_k_list
    return res

In [171]:
C1 = data.loc[data["Prediction"] == 1]
C2 = data.loc[data["Prediction"] == 2]
C3 = data.loc[data["Prediction"] == 3]
C4 = data.loc[data["Prediction"] == 4]
C5 = data.loc[data["Prediction"] == 5]
C6 = data.loc[data["Prediction"] == 6]

cluster_list = [C1.drop(["Prediction"], axis = 1).to_numpy(), C2.drop(["Prediction"], axis = 1).to_numpy(), C3.drop(["Prediction"], axis = 1).to_numpy(), C4.drop(["Prediction"], axis = 1).to_numpy(), 
                C5.drop(["Prediction"], axis = 1).to_numpy(), C6.drop(["Prediction"], axis = 1).to_numpy()]

In [172]:
delta(cluster_list[0], cluster_list[4])

25
2


2.406446431212579

In [173]:
dunn(cluster_list)

25
2
2
25


0.8787975901064272

0.95065 --> BERT
0.8789 --> SBERT
0.7734 --> Word2vec

In [181]:
""" Loading the pre-processed data """

DirPpath = Path(os.path.abspath('')).parent # Fetching the current directory path
PledgesCsvPath =  "c:\\Users\\ecaudron001\\Documents\\GitHub\\semic_pledges\\OutputFiles\\Clusters.xlsx"

print("The current location of PreprocessedData.csv is: ", PledgesCsvPath)

PledgesDf = pd.read_excel(PledgesCsvPath, index_col=0) # Loading the preprocessed pledges into a dataframe

print(PledgesDf.head()) # Controlling the data loaded


PledgesCsvPath = "c:\\Users\\ecaudron001\\Documents\\GitHub\\semic_pledges\\OutputFiles\\IndexedDataV1.csv"

print("The current location of PreprocessedData.csv is: ", PledgesCsvPath)
data = pd.read_csv(PledgesCsvPath, index_col=0)

data["Prediction"] = PledgesDf["Cluster"]

The current location of PreprocessedData.csv is:  c:\Users\ecaudron001\Documents\GitHub\semic_pledges\OutputFiles\Clusters.xlsx
                                              Pledge  \
0  Actually we as an association are still pretty...   
1  EFFAT welcomes the Commission Proposal for a R...   
2  HOTREC calls for a level playing field and fai...   
3  Estonia sees the need to synchronize and harmo...   
4  Sphere Travel Club contributes to a flourishin...   

                                    PreProcessedText  Cluster  Topics  \
0  actually association still pretty much begin d...        5       1   
1  effat welcome commission proposal regulation d...        5       1   
2  hotrec call level play field fair competition ...        4       1   
3  estonia see need synchronize harmonize rule sh...        5       1   
4  sphere travel club contribute flourish transpa...        5       1   

                  Area  
0  Policy & regulation  
1  Policy & regulation  
2  Policy & regulatio

In [182]:
C1 = data.loc[data["Prediction"] == 1]
C2 = data.loc[data["Prediction"] == 2]
C3 = data.loc[data["Prediction"] == 3]
C4 = data.loc[data["Prediction"] == 4]
C5 = data.loc[data["Prediction"] == 5]
C6 = data.loc[data["Prediction"] == 6]

cluster_list = [C1.drop(["Prediction"], axis = 1).to_numpy(), C2.drop(["Prediction"], axis = 1).to_numpy(), C3.drop(["Prediction"], axis = 1).to_numpy(), C4.drop(["Prediction"], axis = 1).to_numpy(), 
                C5.drop(["Prediction"], axis = 1).to_numpy(), C6.drop(["Prediction"], axis = 1).to_numpy()]

In [183]:
dunn(cluster_list)

5
28
28
5


0.773401871142165