# Sentence Transformers on AMD

In [None]:
!pip install datasets ipywidgets -U transformers sentence-transformers

# Building semantic search with Sentence-Transformers on AMD

In [None]:
from datasets import load_dataset
from sentence_transformers import InputExample, util
from torch.utils.data import DataLoader
from torch import nn
from sentence_transformers import losses
from sentence_transformers import SentenceTransformer, models

## 1.- Define the custom model to train

In [None]:
# Assemble a custom sentence-embedding model from three stacked modules.

# Stage 1: reuse a pretrained BERT encoder for the token embeddings
# (max_seq_length is set to 256).
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)

# Stage 2: pooling over the token embeddings, sized to the encoder's output width
token_embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(token_embedding_dim)

# Stage 3: dense layer mapping the pooled vector to 256 features through Tanh
sentence_embedding_dim = pooling_model.get_sentence_embedding_dimension()
dense_model = models.Dense(
    in_features=sentence_embedding_dim,
    out_features=256,
    activation_function=nn.Tanh(),
)

# Chain the stages, in order, into the model used in the rest of the notebook
pipeline_modules = [word_embedding_model, pooling_model, dense_model]
model = SentenceTransformer(modules=pipeline_modules)



## 2.- Sentence Compression Dataset for training
* Language: English
* Number of records: 180,000
* Dataset with pairs of equivalent sentences. Large corpus of uncompressed and compressed sentences from news articles.
* Useful for semantic search and sentence similarity.
* Dataset structure:
    * {"set": [sentence_1, sentence_2]}
    * {"set": [sentence_1, sentence_2]}
    * ...
    * {"set": [sentence_1, sentence_2]}



In [None]:
# Identifier of the sentence-pair dataset, and the loaded dataset object
# that the following cells index by split ('train') and column ('set').
dataset_id = "embedding-data/sentence-compression"
dataset = load_dataset(path=dataset_id)

In [None]:
# Explore one sample: each record's 'set' field holds a sentence pair.
# Index the row first and the column second so that only this record is
# read, instead of materialising the whole 'set' column to pick one element.
dataset['train'][1]['set']

### 2.1.- Transform dataset into required format for training

In [None]:
# Convert the dataset into the InputExample format required for training.

# Train on the first half of the dataset only.
n_examples = dataset['train'].num_rows // 2

# Slice the rows first, then pick the column: only the selected half is read,
# rather than loading the entire 'set' column and discarding half of it.
train_data = dataset['train'][:n_examples]['set']

# Each entry is a pair [original_sentence, compressed_sentence].
train_examples = [
    InputExample(texts=[pair[0], pair[1]])
    for pair in train_data
]


In [None]:
# Batch the training examples (16 per batch, shuffling enabled) for the training loop
train_dataloader = DataLoader(
    train_examples,
    batch_size=16,
    shuffle=True,
)

## 3.- Select loss function & Train

In [None]:
# The dataset consists of equivalent sentence pairs, so MultipleNegativesRankingLoss is used
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Train for 5 epochs on the single (dataloader, loss) objective
training_objective = (train_dataloader, train_loss)
model.fit(train_objectives=[training_objective], epochs=5)


## 4.- Test the model for semantic search

In [None]:
import torch

# Sentences (documents/corpus) to encode
sentences = [
    'Paris, which is a city in Europe with traditions and remarkable food, is the capital of France',
    'The capital of France is Paris',
    'Australia is known for its traditions and remarkable food',
    """
        Despite the heavy rains that lasted for most of the week, the outdoor music festival, 
        which featured several renowned international artists, was able to proceed as scheduled, 
        much to the delight of fans who had traveled from all over the country
    """,
    """
        Photosynthesis, a process used by plants and other organisms to convert light into
        chemical energy, plays a crucial role in maintaining the balance of oxygen and carbon
        dioxide in the Earth's atmosphere.
    """
]

# Encode the whole corpus once
sentences_embeddings = model.encode(sentences, convert_to_tensor=True)


# Query sentences:
queries = ['Is Paris located in France?', 'Tell me something about Australia',
           'music festival proceeding despite heavy rains',
           'what is the process that some organisms use to transform light into chemical energy?']

# Encode every query in a single batched call instead of once per loop iteration
query_embeddings = model.encode(queries, convert_to_tensor=True)

# Row i holds the cosine similarity of query i against every corpus sentence
all_cos_scores = util.cos_sim(query_embeddings, sentences_embeddings)

# Rank the corpus sentences for each query, most similar first
for query, cos_scores in zip(queries, all_cos_scores):

    # Convert to plain Python ints/floats so they can index the list and be formatted
    ranking = torch.argsort(cos_scores, descending=True).tolist()
    scores = cos_scores.tolist()

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nSimilar sentences in corpus:")

    for idx in ranking:
        print(sentences[idx], "(Score: {:.4f})".format(scores[idx]))