In [5]:
# Import Libraries
import os
import torch
from transformers import AutoTokenizer, AutoModel


LYRICS_FILES = 'lyrics'

In [6]:
# Get the Bert-base-uncased model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

lyrics_data = []
lyrics_files = os.listdir(LYRICS_FILES)

# Get the lyrics of each file 
for file_path in lyrics_files:
    if str(file_path).__contains__('.txt'):
        with open(os.path.join(LYRICS_FILES, file_path), "r") as f:
            lyrics = f.read()
            lyrics_data.append(lyrics)

# Define the tokenizer model
encoded_data = tokenizer(lyrics_data, padding=True, truncation=True, max_length=512, return_tensors="pt")


# Create an embedding for the lyrics data based on the last hidden layer 
with torch.no_grad():
    model_output = model(**encoded_data)
    embeddings = model_output.last_hidden_state[:, 0, :]


# Save the embeddings
embedding_file = '/Users/stamatiosorphanos/Documents/MultiModal-Deep-Learning/MultiModal/saved_models/embeddings.pt'
torch.save(embeddings, embedding_file)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

### Load the saved model and check for a test song

In [3]:
# Define the text similarity function 
def text_similarity(file_path, embedding_file, top_results):
    saved_embeddings = torch.load(embedding_file)
    test_lyrics = file_path
    
    with open(test_lyrics, "r") as f:
        test_lyrics = f.read()

    encoded_test = tokenizer(test_lyrics, padding=True, truncation=True, max_length=512, return_tensors="pt")

    with torch.no_grad():
        test_output = model(**encoded_test)
        test_embedding = test_output.last_hidden_state[:, 0, :]

    similarity_scores = torch.nn.functional.cosine_similarity(test_embedding, saved_embeddings)
    sorted_scores, sorted_indices = similarity_scores.sort(descending=True)

    k = top_results
    top_k_scores = sorted_scores[:k]
    top_k_indices = sorted_indices[:k]

    print("Top {} similar songs:".format(k))
    for score, index in zip(top_k_scores, top_k_indices):
        print("Song:", lyrics_files[index], "Score:", score.item())


In [4]:
file_path = 'test.txt'
embedding_path = "saved_models/embeddings.pt"
top_results = 10

text_similarity(file_path, embedding_path, top_results)

Top 10 similar songs:
Song: Calvin-Harris-Dua-Lipa-One-Kiss.txt Score: 0.9376112818717957
Song: The-Weeknd-Out-of-Time.csv Score: 0.9279755353927612
Song: Taylor-Swift-Delicate.csv Score: 0.9247044324874878
Song: Taylor-Swift-You-Need-To-Calm-Down.csv Score: 0.9153817296028137
Song: Imagine-Dragons-Thunder.txt Score: 0.9125005006790161
Song: Bruno-Mars-That-s-What-I-Like.txt Score: 0.9101618528366089
Song: Conan-Gray-Heather.txt Score: 0.905781090259552
Song: M-neskin-Beggin.csv Score: 0.9048627614974976
Song: Glass-Animals-Heat-Waves.txt Score: 0.9045715928077698
Song: Kygo-Ellie-Goulding-First-Time.txt Score: 0.9004665613174438
