### 0. Pinecone Tutorial
Reference URLs
- https://www.datacamp.com/tutorial/mastering-vector-databases-with-pinecone-tutorial
- https://docs.pinecone.io/docs/query-data

In [15]:
import os

import pinecone

# Credentials are read from the environment so that no secret is stored in the
# notebook. Set PINECONE_API_KEY (and optionally PINECONE_ENVIRONMENT) before
# starting the kernel. Any key that was previously committed in this cell
# should be treated as leaked and rotated.
API_KEY = os.environ["PINECONE_API_KEY"]
ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")
pinecone.init(api_key=API_KEY, environment=ENVIRONMENT)

In [16]:
# CREATE / INSERT
TUTORIAL_INDEX_NAME = "myfirstindex"
TUTORIAL_VECTOR_DIM = 8

# Only create the index when it is missing, so the cell can be re-run on an
# account where a previous run left the index behind.
if TUTORIAL_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(TUTORIAL_INDEX_NAME, dimension=TUTORIAL_VECTOR_DIM, metric="euclidean")
index = pinecone.Index(TUTORIAL_INDEX_NAME)

# Five constant vectors: id "A" is all 0.1, "B" all 0.2, ... "E" all 0.5.
# The upsert stays the last expression so its response is shown as the cell output.
index.upsert([
    (vector_id, [value] * TUTORIAL_VECTOR_DIM)
    for vector_id, value in zip("ABCDE", (0.1, 0.2, 0.3, 0.4, 0.5))
])

{'upserted_count': 5}

In [17]:
# Show the index statistics (dimension, fullness, namespaces, vector count).
# Original note (translated): "Q. only vectors of the same dimension are stored".
# NOTE(review): the recorded output reports total_vector_count 0 right after
# five vectors were upserted - presumably the stats lag behind writes; confirm.
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [19]:
# Retrieve by ID: nearest neighbour of the stored vector whose id is "A".
# include_values=True makes the response carry the vector values of each match.
index.query(id="A", top_k=1, include_values=True)

{'matches': [{'id': 'A',
              'score': 0.0,
              'values': [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]}],
 'namespace': ''}

In [5]:
# Retrieve by vector: the three stored vectors closest to an all-0.3 query.
# include_values=True makes the response carry the vector values of each match.
query_vector = [0.3] * 8
index.query(vector=query_vector, top_k=3, include_values=True)

{'matches': [{'id': 'C',
              'score': 0.0,
              'values': [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]},
             {'id': 'D',
              'score': 0.0799999237,
              'values': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]},
             {'id': 'B',
              'score': 0.0800000429,
              'values': [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]}],
 'namespace': ''}

In [6]:
# Delete the tutorial index that was created in the CREATE / INSERT cell.
pinecone.delete_index("myfirstindex")

### 1. Spotipy API
- Load features for songs

In [19]:
# Load data with the Spotify Web API (requires the `spotipy` package: pip install spotipy)
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Authentication: the client id/secret are read from the environment instead of
# being committed with the notebook. Export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel. Credentials that were
# previously committed in this cell should be treated as leaked and rotated.
SPOTIFY_CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
SPOTIFY_CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]
SPOTIFY_REDIRECT_URI = os.environ.get("SPOTIPY_REDIRECT_URI", "http://localhost:8888/callback")

# Log in with the client id/secret pair.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Track search; the response is a JSON-like dict. The three lists are
# placeholders that this cell creates but does not fill.
artist_name = []
track_name = []
track_popularity = []
track_results = sp.search(q='year:2023', type='track', limit=50)

# Example artist / track used for the lookups below.
ARTIST_ID = '3TVXtAsR1Inumwj472S9r4'
TRACK_URI = 'spotify:track:6pD0ufEQq0xdHSsRbg9LBK'

# Keep the lookup results under names instead of discarding them.
artist_popularity = sp.artist(ARTIST_ID)['popularity']
related_artists = sp.artist_related_artists(ARTIST_ID)

# Audio data for the example track. Each endpoint is requested once; the
# earlier version fetched audio_features twice and threw the first result away.
audio_analysis = sp.audio_analysis(TRACK_URI)
audio_features = sp.audio_features(TRACK_URI)
audio_analysis

{'meta': {'analyzer_version': '4.0.0',
  'platform': 'Linux',
  'detailed_status': 'OK',
  'status_code': 0,
  'timestamp': 1681510246,
  'analysis_time': 10.99544,
  'input_process': 'libvorbisfile L+R 44100->22050'},
 'track': {'num_samples': 4290119,
  'duration': 194.56322,
  'sample_md5': '',
  'offset_seconds': 0,
  'window_seconds': 0,
  'analysis_sample_rate': 22050,
  'analysis_channels': 1,
  'end_of_fade_in': 0.19732,
  'start_of_fade_out': 186.35754,
  'loudness': -4.076,
  'tempo': 83.118,
  'tempo_confidence': 0.025,
  'time_signature': 4,
  'time_signature_confidence': 0.984,
  'key': 6,
  'key_confidence': 0.037,
  'mode': 0,
  'mode_confidence': 0.267,
  'codestring': 'eJxVmtmVxDgOBF2RCbwP_x3biGT1zOxPvyZLEkkciQTAde6qZ935la-OM1tbu371tq_NW0fvtX888o1W9ymr8Vvf67tttjlv21-dfX219HnGKaf5avnumrvUccvX2ii83MYuq96vrTa-tWYrd0_WuLV9c_FzHevrnWUvC83eavn6qnz57j1m3d-o7XyTNfvp9-Nzwx0WtrHaN9zi7KfuNtb9Zhvnu6W0wl73Nzffdc3BFus37xlfr7sXjspu6hjfPmsP5fCtwafW4pS1cpzFsThe261UdvKxbvG4brJ3xq38jffe9

In [14]:
# List the top-level sections of the audio-analysis response.
audio_analysis.keys()

dict_keys(['meta', 'track', 'bars', 'beats', 'sections', 'segments', 'tatums'])

In [22]:
# List the feature names available for the track (first entry of the response list).
# To inspect one analysis segment instead, evaluate: audio_analysis['segments'][0]
audio_features[0].keys()

dict_keys(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature'])

In [41]:
# feature sample
sample_features = []

for key in audio_features[0].keys():
    val = audio_features[0][key]
    if key == 'type':
        break
    if key == 'key':
        val = (val+1)/12
    if key == 'loudness':
        val = (val+60)/60

    sample_features.append(val)

In [34]:
# Feature names of the track, followed by the value ranges noted for the
# features that need special handling when building a vector.
audio_features[0].keys()
# key: [-1,11]
# loudness: [-60,0]
# mode: 0(minor)/1(major)
# tempo: [0,inf)

dict_keys(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature'])

In [42]:
# Keep the first ten features (danceability ... valence) and drop the eleventh,
# tempo - presumably because its range [0, inf) is unbounded.
# A fixed-length slice is used instead of [:-1] so that re-running this cell
# does not remove one more feature each time.
N_VECTOR_FEATURES = 10
sample_features = sample_features[:N_VECTOR_FEATURES]
sample_features

[0.569,
 0.724,
 0.5833333333333334,
 0.9320666666666667,
 0,
 0.0474,
 0.228,
 0,
 0.27,
 0.562]

In [49]:
# Store the vector in the Pinecone database
import os

import pinecone

# The API key comes from the environment so that no secret is stored in the
# notebook; set PINECONE_API_KEY (and optionally PINECONE_ENVIRONMENT) first.
API_KEY = os.environ["PINECONE_API_KEY"]
ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")
pinecone.init(api_key=API_KEY, environment=ENVIRONMENT)

MUSIC_INDEX_NAME = "musicfeatureindex"

# Only create the index when it is missing, so the cell can be re-run.
if MUSIC_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(MUSIC_INDEX_NAME, dimension=len(sample_features), metric='cosine')
index = pinecone.Index(MUSIC_INDEX_NAME)
index.upsert([("A", sample_features)])

{'upserted_count': 1}

### 2. LyricsGenius API
- retrieve highlight lyrics of songs which contain specific words

In [1]:
# LyricsGenius: search song lyrics for a keyword
import os

from lyricsgenius import Genius

# The access token is read from the environment so that no secret is stored in
# the notebook; set GENIUS_ACCESS_TOKEN before starting the kernel. A token
# that was previously committed in this cell should be treated as leaked.
CLIENT_ACCESS_TOKEN = os.environ["GENIUS_ACCESS_TOKEN"]
CLINET_ACCESS_TOKEN = CLIENT_ACCESS_TOKEN  # misspelled original name, kept as an alias
genius = Genius(CLIENT_ACCESS_TOKEN)

# One hit per page, first page, for lyrics containing the word 'calm'.
results = genius.search_lyrics('calm', per_page=1, page=1)

In [2]:
# Take the highlighted lyric snippet of the first hit in the first result section.
first_hit = results['sections'][0]['hits'][0]
lyrics = first_hit['highlights'][0]['value']

In [3]:
# Goal: extract a sentence embedding and try storing it in Pinecone
# Tools: tokenizer, BERT model
# 1. Extract the sentence embedding with pre-trained BERT
# 2. Store the embedding in the Pinecone DB

In [4]:
# Tokenize the lyric snippet with the tokenizer of the pre-trained 'bert-base-uncased' model.
from transformers import AutoTokenizer, BertModel, BertConfig

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer(lyrics)

# Print the raw text next to the three encodings returned by the tokenizer.
print(
    f"lyrics: {lyrics}\n"
    f"input ids: {tokens['input_ids']}\n"
    f"token type ids: {tokens['token_type_ids']}\n"
    f"attention mask: {tokens['attention_mask']}"
)

  from .autonotebook import tqdm as notebook_tqdm


lyrics: And they wishin' and wishin'
And wishin' and wishin', they wishin' on me
Yeah
I been movin' calm, don't
input ids: [101, 1998, 2027, 4299, 2378, 1005, 1998, 4299, 2378, 1005, 1998, 4299, 2378, 1005, 1998, 4299, 2378, 1005, 1010, 2027, 4299, 2378, 1005, 2006, 2033, 3398, 1045, 2042, 9587, 6371, 1005, 5475, 1010, 2123, 1005, 1056, 102]
token type ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [5]:
# Map the input ids back to their token strings to inspect the tokenization.
tokenizer.convert_ids_to_tokens(tokens['input_ids'])

['[CLS]',
 'and',
 'they',
 'wish',
 '##in',
 "'",
 'and',
 'wish',
 '##in',
 "'",
 'and',
 'wish',
 '##in',
 "'",
 'and',
 'wish',
 '##in',
 "'",
 ',',
 'they',
 'wish',
 '##in',
 "'",
 'on',
 'me',
 'yeah',
 'i',
 'been',
 'mo',
 '##vin',
 "'",
 'calm',
 ',',
 'don',
 "'",
 't',
 '[SEP]']

In [7]:
import torch

# Wrap each per-token list in an outer list to form a batch holding one sequence.
tokens_tensor = torch.tensor([tokens['input_ids']])
segments_tensors = torch.tensor([tokens['token_type_ids']])
attention_mask_tensor = torch.tensor([tokens['attention_mask']])

model = BertModel.from_pretrained('bert-base-uncased')

# Pass every input by keyword. The second positional parameter of
# BertModel.forward is attention_mask, so the earlier positional call
# model(tokens_tensor, segments_tensors) fed the all-zero token type ids in as
# the attention mask and masked out every token.
with torch.no_grad():
    outputs = model(
        input_ids=tokens_tensor,
        attention_mask=attention_mask_tensor,
        token_type_ids=segments_tensors,
    )

In [25]:
import os

import pinecone

# Sentence-level vector: the model's pooled output for the single lyric
# snippet, squeezed and converted to a plain Python list for the upsert.
lyrics_features = outputs['pooler_output'].squeeze().tolist()

# The API key comes from the environment so that no secret is stored in the
# notebook; set PINECONE_API_KEY (and optionally PINECONE_ENVIRONMENT) first.
API_KEY = os.environ["PINECONE_API_KEY"]
ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")
pinecone.init(api_key=API_KEY, environment=ENVIRONMENT)

LYRICS_INDEX_NAME = "lyricsfeatureindex"

# Create the index when it is missing so the cell also works on a fresh account;
# earlier the creation call was commented out and the upsert needed a
# pre-existing index.
# NOTE(review): the earlier version also carried a commented-out
# delete_index("musicfeatureindex") - presumably the environment limits the
# number of indexes; delete that index manually if the creation below fails.
if LYRICS_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(LYRICS_INDEX_NAME, dimension=len(lyrics_features), metric='cosine')
index = pinecone.Index(LYRICS_INDEX_NAME)
index.upsert([("A", lyrics_features)])


{'upserted_count': 1}