In [1]:
import torch
import torch.nn as nn
import torch
from transformers import BertTokenizer

In [2]:
import tensorflow as tf

class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_size):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size

        self.query = tf.keras.layers.Dense(embed_size)
        self.key = tf.keras.layers.Dense(embed_size)
        self.value = tf.keras.layers.Dense(embed_size)

        self.scale = tf.sqrt(tf.cast(embed_size, dtype=tf.float32))

    def call(self, values, keys, query, mask):
        Q = self.query(query)
        K = self.key(keys)
        V = self.value(values)

        energy = tf.matmul(Q, tf.transpose(K, perm=[0, 2, 1])) / self.scale

        if mask is not None:
            energy = tf.where(mask == 0, -1e20, energy)

        attention = tf.nn.softmax(energy, axis=-1)

        out = tf.matmul(attention, V)

        return out

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model):
        super(EncoderLayer, self).__init__()
        self.attention = SelfAttention(d_model)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.fc_query = tf.keras.layers.Dense(d_model)
        self.fc_key = tf.keras.layers.Dense(d_model)
        self.fc_value = tf.keras.layers.Dense(d_model)
        self.fc_out = tf.keras.layers.Dense(d_model)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, src, mask):
        query = self.fc_query(src)
        key = self.fc_key(src)
        value = self.fc_value(src)

        src2 = self.attention(value, key, query, mask)
        src2 = self.norm1(src2 + src)

        src = self.fc_out(src2)
        src2 = self.norm2(src + src2)

        return src2




class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, d_model, num_encoders):
        super(Encoder, self).__init__()

        self.embed_size = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, self.embed_size)
        self.encoders = [EncoderLayer(self.embed_size) for _ in range(num_encoders)]
        self.fc_out = tf.keras.layers.Dense(vocab_size)

    def call(self, x, mask):
        x = self.embedding(x)
        for layer in self.encoders:
            x = layer(x, mask)
        out = x
        x = self.fc_out(x)

        return tf.nn.softmax(x, axis=-1), out


In [3]:
def preprocess_sentence(sentence, tokenizer,max_length=512):

    # Tokenize the sentence
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_length - 2]
    padded_tokens = tokens + ['[PAD]'] * (max_length - len(tokens))

    # Convert tokens to indices
    input_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

    masked_indices = torch.rand(len(input_ids)) < 0.15
    masked_input_ids = input_ids.copy()
    for i in range(len(masked_indices)):
        if masked_indices[i]:
            masked_input_ids[i] = tokenizer.mask_token_id

    return masked_input_ids, input_ids

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("/content/hotel_reviews.csv")

df.head(10)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
5,love monaco staff husband stayed hotel crazy w...,5
6,"cozy stay rainy city, husband spent 7 nights m...",5
7,"excellent staff, housekeeping quality hotel ch...",4
8,"hotel stayed hotel monaco cruise, rooms genero...",5
9,excellent stayed hotel monaco past w/e delight...,5


In [None]:
import tensorflow as tf
from transformers import BertTokenizer

sentences = list(df['Review'].values)[:100]

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# import tensorflow as tf
# from transformers import BertTokenizer

# sentences = list(df['Review'].values)[:100]

# # Load tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


input_tensors = []
output_tensors = []
for sentence in sentences:
    masked_input_ids, output_ids = preprocess_sentence(sentence, tokenizer)
    input_tensors.append(tf.constant(masked_input_ids))
    output_tensors.append(tf.constant(output_ids))


input_tensors = tf.stack(input_tensors)
output_tensors = tf.stack(output_tensors)


dataset = tf.data.Dataset.from_tensor_slices((input_tensors, output_tensors))


for inputs, outputs in dataset.take(1):
    print("Input:", inputs)
    print("Output:", outputs)




Input: tf.Tensor(
[ 3835  3309  6450  5581  2288  2204  3066   103  3309  5315  1010  3369
  2397  3944  2165  6040  3025  4391  2106 27238  5581   103  4638  4248
   103  1010  2210  9364   103  1011 25953  3193  2282  2282  4550   103
  2946  1010   103   103  8271 10551   103  2152   103  1010   103  2614
   103  2066   103  2189  2282  2305   103  5189   103  4303  3098  5494
  2963  2111  3331  6797  1010  2672  2074   103 10638   103 13642  2850
  7198  3688  3835  1010  2106  2025   103  7529  2994   103  3543  2579
  5056   103   103  1010  3295   103   103  3292  6023   103  3452  3835
  3325  2383  3477  2871  5581  2305  1010     0     0     0   103     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(128,), dtype=int32)
Output: tf.Tensor(
[ 3835  3309  6450  5581  2288  2204  3066  2994  3309  5315  1010  3369
  2397  3944  2165  6040  3025  4391  2106 27238  5581  1010  4638  4248
  3733  1

In [None]:
input_tensors.shape

In [None]:
import tensorflow as tf


dataset = tf.data.Dataset.from_tensor_slices((input_tensors, output_tensors))
custom_dataloader = dataset.shuffle(buffer_size=len(input_tensors)).batch(batch_size=1)


for inputs, outputs in dataset.take(1):
    print("Input:", inputs)
    print("Output:", outputs)


Input: tf.Tensor(
[[  103  3095  1010  2160   103   103  3309 16480 18141  3095  2191  2514
    103  1010  5281 11813  2326  4624  3095  9530   103  3351  2341  2273
  10850  2326  3791   103  1010 10850  3478   103  8697  3329  2793  6013
    103  2098  8697  2109  1010  3095  4248  3051 10395  5800   103   103
   3277  1010  3984   103  7904  2025  9185  2717  3095  1012  2057  2363
   6581  6040  9530 19562   103  4953   103  4648  7666  2181  3407  3178
    103 18767  3835  3543  3095  2253  2126  2191  2514  2188  1012  2307
    103  2066  2485  2204  2833   103  2165  2377  4833  2395  1996  8988
   2121  2092  1012 12694  2015  3006  7156  2675  3229 18847   103  2460
   3788  3292  1010   103     0   103     0   103     0     0     0     0
      0   103     0     0     0     0     0     0]], shape=(1, 128), dtype=int32)
Output: tf.Tensor(
[[ 6581  3095  1010  2160 18321  3737  3309 16480 18141  3095  2191  2514
   2188  1010  5281 11813  2326  4624  3095  9530 19562  3351  2341

In [None]:
vocab_size = len(tokenizer)
embed_size = 512

In [None]:
custom_model = Encoder(vocab_size, embed_size,12)

In [None]:
custom_criterion = nn.CrossEntropyLoss()
custom_optimizer = torch.optim.Adam(custom_model.parameters(), lr=0.01)


num_epochs = 10
tokens = tokenizer.tokenize("hotel")


input_ids = custom_tokenizer.convert_tokens_to_ids(tokens)


input_tensor = torch.tensor([input_ids])

word_embedding = custom_model(input_tensor,None)[1]


for epoch in range(num_epochs):

    custom_model.train()
    total_loss = 0
    for custom_inputs, custom_outputs in custom_dataloader:
        custom_optimizer.zero_grad()
        custom_inputs = custom_inputs.squeeze(0)
        custom_outputs = custom_outputs.squeeze(0)
        custom_mask = (custom_inputs != custom_tokenizer.pad_token_id)
        custom_predictions = custom_model(custom_inputs, custom_mask)[0]
        custom_loss = custom_criterion(custom_predictions.view(-1, vocab_size), custom_outputs.view(-1))
        custom_loss.backward()
        custom_optimizer.step()
        total_loss += custom_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(custom_dataloader)}")


Epoch 1, Loss: 10.311819410324096
Epoch 2, Loss: 10.321818885803223
Epoch 3, Loss: 10.321818885803223
Epoch 4, Loss: 10.321818885803223
Epoch 5, Loss: 10.321818885803223
Epoch 6, Loss: 10.321818885803223
Epoch 7, Loss: 10.321818885803223
Epoch 8, Loss: 10.321818885803223
Epoch 9, Loss: 10.321818885803223
Epoch 10, Loss: 10.321818885803223


In [None]:

tokens = custom_tokenizer.tokenize("hotel")

input_ids = custom_tokenizer.convert_tokens_to_ids(tokens)


input_tensor = torch.tensor([input_ids])

word_embedding = custom_model(input_tensor,None)[1]

print(word_embedding[0][0])

word_embedding[0].shape[0], word_embedding[0].shape[1]

tensor([-2.2622e+00,  9.3887e-02,  5.8753e-02, -4.9394e-01, -1.4677e+00,
        -4.9152e-02,  1.0937e+00, -7.8677e-01, -1.9292e-01, -7.0869e-01,
        -5.7480e-01, -5.7149e-01,  9.1257e-03, -1.0429e+00,  2.1709e+00,
         1.6041e+00,  5.8469e-01,  6.9175e-01, -2.7218e+00, -3.3456e-01,
        -4.8865e-02,  5.2054e-02,  5.3760e-01, -1.9123e+00, -1.1977e+00,
        -1.1849e+00, -1.7030e+00,  2.1966e-01, -6.2985e-01,  1.0435e+00,
        -2.1188e-01,  3.2069e-01,  1.1573e+00, -4.1223e-01,  1.4358e-01,
         1.2860e+00, -2.1283e+00, -1.7207e+00,  6.8427e-01,  3.4669e+00,
        -2.8630e-01, -1.3486e+00, -1.0310e+00,  4.2801e-01,  1.9752e-01,
        -1.7480e-01, -1.7305e+00,  9.1612e-02,  2.8055e+00,  1.0247e-01,
         7.2090e-01, -7.4195e-01, -8.5832e-01, -7.2608e-01,  6.8805e-01,
        -1.9124e+00, -3.8446e-01, -1.5030e+00,  5.9701e-01, -2.6451e-01,
        -7.2376e-01, -4.9049e-02,  3.5458e-03, -7.9115e-01, -1.8790e+00,
        -2.4470e-01,  1.5993e-01,  1.6193e+00,  8.0

(1, 512)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
import tensorflow as tf

def cos_sim_tensor(tensor1, tensor2):
    tensor1_normalized = tf.math.l2_normalize(tensor1, axis=-1)
    tensor2_normalized = tf.math.l2_normalize(tensor2, axis=-1)


    similarity = tf.keras.losses.cosine_similarity(tensor1_normalized, tensor2_normalized, axis=-1)

    return similarity




In [None]:
import numpy as np

In [None]:
import tensorflow as tf

def cos_sim_tensor(tensor1, tensor2):
    tensor1_normalized = tf.math.l2_normalize(tensor1, axis=-1)
    tensor2_normalized = tf.math.l2_normalize(tensor2, axis=-1)


    similarity = tf.keras.losses.cosine_similarity(tensor1_normalized, tensor2_normalized, axis=-1)

    return similarity



def cos_sim(text1, text2):
    tokens_1 = custom_tokenizer.tokenize(text1)
    tokens_2 = custom_tokenizer.tokenize(text2)

    input_ids_1 = custom_tokenizer.convert_tokens_to_ids(tokens_1)
    input_ids_2 = custom_tokenizer.convert_tokens_to_ids(tokens_2)


    input_tensor_1 = torch.tensor([input_ids_1])
    input_tensor_2 = torch.tensor([input_ids_2])

    with torch.no_grad():
        word_embedding_1 = custom_model(input_tensor_1,None)[1]
        word_embedding_2 = custom_model(input_tensor_2,None)[1]

    return round(np.abs(cos_sim_tensor(word_embedding_1, word_embedding_2)[0])[0], 2)

cos_sim("hotel", "restaurant")


0.89


In [None]:
cos_sim("hotel", "room")

0.73


In [None]:
cos_sim("hotel", "car")

0.39
