In [35]:
""" Importing relevant packages """
import os # For finding pre-processed data
from pathlib import Path

import pandas as pd # For data handling
import numpy as np

import nltk #  For nlp processing
from sklearn.feature_extraction.text import TfidfVectorizer # For obtaining Tf-Idf tokenization

import gensim # For building and fine-tuning Word2Vec model
from gensim.models import Word2Vec
import gensim.downloader as api # Helpful for downloading pre-trained models

""" Loading the pre-processed data """

DirPpath = Path(os.path.abspath('')).parent # Fetching the current directory path
PledgesCsvPath = str(DirPpath.absolute()) + "\OutputFiles\PreprocessedData.csv" 

print("The current location of PreprocessedData.csv is: ", PledgesCsvPath)

PledgesDf = pd.read_csv(PledgesCsvPath, index_col=0) # Loading the preprocessed pledges into a dataframe

print(PledgesDf.head()) # Controlling the data loaded


""" Tokenize the pledges on words """

documents = [i for i in PledgesDf["PreProcessedText"]]
length = max([len(nltk.word_tokenize(i)) for i in documents])

The current location of PreprocessedData.csv is:  c:\Users\ecaudron001\Documents\GitHub\LLM-for-Tourism\Clustering\OutputFiles\PreprocessedData.csv
   Topic                                             Pledge  \
0      1  Actually we as an association are still pretty...   
1      1  EFFAT welcomes the Commission Proposal for a R...   
2      1  HOTREC calls for a level playing field and fai...   
3      1  Estonia sees the need to synchronize and harmo...   
4      1  Sphere Travel Club contributes to a flourishin...   

                                    PreProcessedText  
0  actually we as an association are still pretty...  
1  effat welcomes the commission proposal for a r...  
2  hotrec calls for a level playing field and fai...  
3  estonia sees the need to synchronize and harmo...  
4  sphere travel club contributes to a flourishin...  


In [6]:
import tensorflow
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import logging

import matplotlib.pyplot as plt

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#pre-trained model tokenizer

In [None]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

In [78]:
import torch

def DocEmbedding(tokenized_text):
    
    padded_tokens = tokenized_text + ['[PAD]' for _ in range(512-len(tokenized_text))]

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(padded_tokens)

    # Display the words with their indeces.
    for tup in zip(tokenized_text, indexed_tokens):
            print('{:<12} {:>6,}'.format(tup[0], tup[1]))

    #Attention mask
    attn_mask = [ 1 if token != '[PAD]' else 0 for token in padded_tokens  ]
    
    # Mark each of the tokens as belonging to sentence "1".
    segments_ids = [1] * len(attn_mask)
    print (segments_ids)

    
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    attention_tensors = torch.tensor([attn_mask])

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

            outputs = model(tokens_tensor, attention_mask = attention_tensors, token_type_ids = segments_tensors)

            # Evaluating the model will return a different number of objects based on 
            # how it's  configured in the `from_pretrained` call earlier. In this case, 
            # becase we set `output_hidden_states = True`, the third item will be the 
            # hidden states from all layers. See the documentation for more details:
            # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
            hidden_states = outputs[2]


            # Getting document vectors by averaging the second to last hidden layer of each token
            # `hidden_states` has shape [13 x 1 x 22 x 768]

            # `token_vecs` is a tensor with shape [22 x 768]
            token_vecs = hidden_states[-2][0]

            # Calculate the average of all 22 token vectors.
            document_embedding = torch.mean(token_vecs, dim=0)

    return document_embedding

In [104]:
import torch

pledgeEmbedding = []

for doc in documents:
       text = doc

        # Add the special tokens.
       marked_text = "[CLS] " + text + " [SEP]"
       
       # Split the sentence into tokens.
       tokenized_text = tokenizer.tokenize(marked_text)
       

       if len(tokenized_text) > 512:
              #print(len(tokenized_text))
              print("Too long")

              sentences = nltk.sent_tokenize(text)

              text1 = ""
              sentEmbedding = []

              for sent in sentences: 
                     
                     marked_text = "[CLS] " + text1 + sent + " [SEP]"
       
                     # Split the sentence into tokens.
                     tokenized_text = tokenizer.tokenize(marked_text)
                     
                     if len(tokenized_text) < 512:
                            text1 = text1 + sent
                     else:
                            marked_text = "[CLS] " + text1 + " [SEP]"       
                            # Split the sentence into tokens.
                            tokenized_text = tokenizer.tokenize(marked_text)
                            sentEmbedding.append(DocEmbedding(tokenized_text))                  
                            
                            text1 = ""
              
              document_embedding = torch.mean(torch.cat(tuple(sentEmbedding)).view(len(sentEmbedding),768), dim = 0)

       else:              
              document_embedding = DocEmbedding(tokenized_text)

       pledgeEmbedding.append(document_embedding.tolist())




[CLS]           101
actually      2,941
we            2,057
as            2,004
an            2,019
association   2,523
are           2,024
still         2,145
pretty        3,492
much          2,172
at            2,012
the           1,996
beginning     2,927
due           2,349
to            2,000
the           1,996
pan           6,090
##de          3,207
##mic         7,712
which         2,029
took          2,165
the           1,996
better        2,488
part          2,112
of            1,997
our           2,256
res          24,501
##so          6,499
##ur          3,126
##ces         9,623
.             1,012
what          2,054
we            2,057
want          2,215
to            2,000
provide       3,073
is            2,003
a             1,037
proper        5,372
guide         5,009
##line        4,179
for           2,005
st            2,358
##r           2,099
how           2,129
to            2,000
achieve       6,162
,             1,010
maintain      5,441
and           1,998


https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

In [106]:
DocIndexV1 = pd.DataFrame(pledgeEmbedding)# Outputting the indexed pledges file

IndexedPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV1.csv"
DocIndexV1.to_csv(IndexedPath)

In [105]:
len(pledgeEmbedding)

374

In [107]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [115]:
pledgeEmbedding = []

for doc in documents:
       text = doc

        # Add the special tokens.
       marked_text = "[CLS] " + text + " [SEP]"
       
       # Split the sentence into tokens.
       tokenized_text = tokenizer.tokenize(marked_text)
       

       if len(tokenized_text) > 512:
              #print(len(tokenized_text))
              print("Too long")

              sentences = nltk.sent_tokenize(text)

              text1 = ""
              sentEmbedding = []

              for sent in sentences: 
                     
                     marked_text = "[CLS] " + text1 + sent + " [SEP]"
       
                     # Split the sentence into tokens.
                     tokenized_text = tokenizer.tokenize(marked_text)
                     
                     if len(tokenized_text) < 512:
                            text1 = text1 + sent
                     else:
                            marked_text = text1       
                            sentEmbedding.append(torch.from_numpy(model.encode(marked_text)))                  
                            
                            text1 = ""
              
              document_embedding = torch.mean(torch.cat(tuple(sentEmbedding)).view(len(sentEmbedding),256), dim = 0)

       else:              
              document_embedding = model.encode(text)

       pledgeEmbedding.append(document_embedding.tolist())

Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long
Too long


In [113]:
model.encode("Hello world")

256

In [118]:
DocIndexV1 = pd.DataFrame(pledgeEmbedding)# Outputting the indexed pledges file

IndexedPath = str(DirPpath.absolute()) + "\OutputFiles\IndexedDataV1.csv"
DocIndexV1.to_csv(IndexedPath)