# KeyBERT Model - Keyword Extraction

In [None]:
!pip install keybert

In [2]:
from keybert import KeyBERT

In [None]:
# Create the KeyBERT extractor with no constructor arguments (library defaults).
kw_model = KeyBERT()

In [None]:
# kw_model.extract_keywords(mytext, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.2)

In [None]:
# kw_model.extract_keywords(mytext, keyphrase_ngram_range=(1, 2))
# kw_model.extract_keywords(mytext, stop_words='english', use_mmr=True, diversity=0.2, top_n=10)

 Speech Text

In [None]:
# Speech sample used as the first keyword-extraction input.
# NOTE(review): the text is kept verbatim (including "remeber"); presumably a
# raw speech transcript — confirm before correcting any wording.
mytext = (
    "we run up the mountain yesterday the night sky was full of stars "
    "I've never seen so many stars in the sky before "
    "it was such a heavenly view "
    "forever remeber it in the bottom of my heart"
)

Song Lyrics Text (Song: NF - The Search)

In [None]:
song_lyrics = '''
I watch 'em all pass by
The moon and the stars
Let me hold you in my arms forevermore
These cold nights, the park is ours
Standing by the side
Let you go, oh to the sea, just for me
Don't ever let me, my love
Keep holding on
Let the modest go
As my mic goes to and fro
Waking up for one more show
We see him in the night
Tell him I'm not afraid of him
I'm not afraid of him
'Cause I won't know
'''

In [None]:
# Extract keyphrases of one or two words from the speech text, with English
# stop words removed and MMR enabled at diversity 0.2.
speech_keywords = kw_model.extract_keywords(
    mytext,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    use_mmr=True,
    diversity=0.2,
)

In [None]:
# Extract the top two keyphrases (one or two words each) from the lyrics, with
# English stop words removed and MMR enabled at diversity 0.2.
lyrics_keywords = kw_model.extract_keywords(
    song_lyrics,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    use_mmr=True,
    diversity=0.2,
    top_n=2,
)

In [None]:
# Display the keyphrase/score pairs extracted from the speech text.
speech_keywords

[('stars sky', 0.64),
 ('seen stars', 0.5786),
 ('night sky', 0.5531),
 ('heavenly view', 0.4849),
 ('mountain yesterday', 0.4686)]

In [None]:
# Display the keyphrase/score pairs extracted from the lyrics.
lyrics_keywords

[('cold nights', 0.3431), ('love holding', 0.31)]

In [None]:
# Hard-coded (keyphrase, score) pairs. These assignments rebind the names that
# the extract_keywords cells set, so every later cell reads the values below
# rather than the freshly extracted ones.
# NOTE(review): presumably pinned from an earlier run — confirm this is intended.
speech_keywords = [
    ('night sky', 0.544),
    ('mountain yesterday', 0.478),
    ('heavenly view', 0.471),
    ('many stars', 0.449),
    ('heart', 0.218),
]

lyrics_keywords = [
    ('cold nights', 0.282),
    ('love', 0.221),
]

# Sentence Transformer - Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
import numpy as np

In [None]:
# Load the pre-trained 'all-MiniLM-L6-v2' SentenceTransformer; `model` is the
# encoder used by the embedding cell to turn keyword strings into vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Embed each keyword list with one batched encode() call instead of one model
# call per phrase. Every keyword entry is a (phrase, score) pair, so element
# [0] is the text to embed.
# NOTE(review): batched encoding may differ from per-phrase encoding in the
# last float digits — confirm that is acceptable for the reported scores.
speech_embeddings = np.asarray(
    model.encode([speech_kw[0] for speech_kw in speech_keywords])
)
lyrics_embeddings = np.asarray(
    model.encode([lyrics_kw[0] for lyrics_kw in lyrics_keywords])
)

# Pairwise cosine similarity: rows follow speech_keywords, columns follow
# lyrics_keywords.
similarity_matrix = cosine_similarity(speech_embeddings, lyrics_embeddings)

# Print a label, then let the notebook render the matrix as the cell output.
print("Similarity matrix:")
similarity_matrix

Similarity matrix:


array([[0.57791287, 0.24088761],
       [0.29231182, 0.14264202],
       [0.2226991 , 0.2615198 ],
       [0.25822335, 0.29238024],
       [0.21378422, 0.5969655 ]], dtype=float32)

In [None]:
# Flatten the similarity matrix into (speech phrase, lyrics phrase, score)
# triples, speech keywords in the outer position and lyrics keywords inner.
similarity_scores = [
    (speech_entry[0], lyrics_entry[0], similarity_matrix[row, col])
    for row, speech_entry in enumerate(speech_keywords)
    for col, lyrics_entry in enumerate(lyrics_keywords)
]

# Report every pair on its own line, scores rounded to four decimals.
print("\nSimilarity scores between speech and lyrics keywords:")
for speech_kw, lyrics_kw, score in similarity_scores:
    print(f"{speech_kw} - {lyrics_kw}: {score:.4f}")


Similarity scores between speech and lyrics keywords:
night sky - cold nights: 0.5779
night sky - love: 0.2409
mountain yesterday - cold nights: 0.2923
mountain yesterday - love: 0.1426
heavenly view - cold nights: 0.2227
heavenly view - love: 0.2615
many stars - cold nights: 0.2582
many stars - love: 0.2924
heart - cold nights: 0.2138
heart - love: 0.5970


In [None]:
# Display the flat list of (speech keyword, lyrics keyword, similarity) tuples.
similarity_scores

[('night sky', 'cold nights', 0.57791287),
 ('night sky', 'love', 0.24088761),
 ('mountain yesterday', 'cold nights', 0.29231182),
 ('mountain yesterday', 'love', 0.14264202),
 ('heavenly view', 'cold nights', 0.2226991),
 ('heavenly view', 'love', 0.2615198),
 ('many stars', 'cold nights', 0.25822335),
 ('many stars', 'love', 0.29238024),
 ('heart', 'cold nights', 0.21378422),
 ('heart', 'love', 0.5969655)]

In [None]:
# Group the score triples by their first element: each speech phrase maps to a
# list of {'x': lyrics phrase, 'y': score} points, in first-seen order.
data_dict = {}
for series_id, point_label, point_value in similarity_scores:
    data_dict.setdefault(series_id, []).append({'x': point_label, 'y': point_value})

# Reshape the mapping into a list of {'id': ..., 'data': [...]} records.
formatted_data = [
    {'id': series_id, 'data': points}
    for series_id, points in data_dict.items()
]

formatted_data

[{'id': 'night sky',
  'data': [{'x': 'cold nights', 'y': 0.57791287},
   {'x': 'love', 'y': 0.24088761}]},
 {'id': 'mountain yesterday',
  'data': [{'x': 'cold nights', 'y': 0.29231182},
   {'x': 'love', 'y': 0.14264202}]},
 {'id': 'heavenly view',
  'data': [{'x': 'cold nights', 'y': 0.2226991},
   {'x': 'love', 'y': 0.2615198}]},
 {'id': 'many stars',
  'data': [{'x': 'cold nights', 'y': 0.25822335},
   {'x': 'love', 'y': 0.29238024}]},
 {'id': 'heart',
  'data': [{'x': 'cold nights', 'y': 0.21378422},
   {'x': 'love', 'y': 0.5969655}]}]

In [None]:
# Recompute the speech-vs-lyrics cosine similarities so this cell only needs
# the two embedding arrays.
similarity_matrix = cosine_similarity(speech_embeddings, lyrics_embeddings)

# One-dimensional copy of every pairwise score.
flat_similarity = similarity_matrix.flatten()

# Mean pairwise similarity, scaled to a percentage.
mean_similarity = np.mean(flat_similarity)
overall_similarity_percentage = mean_similarity * 100

print("Overall similarity percentage:", overall_similarity_percentage)

Overall similarity percentage: 30.99326491355896


# Test

In [None]:
# Map each lyrics keyword to a list of (speech keyword, similarity) pairs,
# reading column j of similarity_matrix for the j-th lyrics keyword.
# Built as a comprehension so the notebook-level `similarity_scores` list of
# triples is not rebound here; rebinding it to 2-tuples makes the cell that
# unpacks three values per entry fail when it is re-run afterwards.
reversed_similarity_dict = {
    lyrics_kw[0]: [
        (speech_kw[0], similarity_matrix[i, j])
        for i, speech_kw in enumerate(speech_keywords)
    ]
    for j, lyrics_kw in enumerate(lyrics_keywords)
}

reversed_similarity_dict

# result_list = list(reversed_similarity_dict.items())

# x_labels = [item[0] for item in result_list]
# y_labels_set = set(kw[0] for _, kw_list in result_list for kw in kw_list)
# y_labels = sorted(list(y_labels_set))
# data = [[next((score for keyword, score in kw_list if keyword == label), 0.0) for label in y_labels] for _, kw_list in result_list]

# result_list_def = {"x_labels": x_labels, "y_labels": y_labels, "data": data}

# result_list_def

{'cold nights': [('night sky', 0.57791287),
  ('mountain yesterday', 0.29231182),
  ('heavenly view', 0.2226991),
  ('many stars', 0.25822335),
  ('heart', 0.21378422)],
 'love': [('night sky', 0.24088761),
  ('mountain yesterday', 0.14264202),
  ('heavenly view', 0.2615198),
  ('many stars', 0.29238024),
  ('heart', 0.5969655)]}