In [1]:
import os

import numpy as np
import pandas as pd

import spacy
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

from openai import OpenAI
from dotenv import load_dotenv

from pymongo import MongoClient
from monggregate import Pipeline

In [2]:
# Load the raw lyrics dataset and preview its first rows.
# The columns shown in the recorded output are artist, song, link and text.
df = pd.read_csv("data/57k_spotify_songs.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
# (rows, columns) of the raw dataset.
df.shape

(57650, 4)

## Cost Estimation

In [5]:
# spaCy's small English pipeline; count_token runs each lyric through it to
# obtain a token count for the cost estimate.
nlp = spacy.load('en_core_web_sm')

In [6]:
def count_token(text):
    """Return the number of spaCy tokens in ``text``.

    The text is passed through the notebook-level ``nlp`` pipeline and the
    length of the resulting document is returned.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    int or float
        The token count, or ``np.nan`` when ``text`` is a missing value
        (``None`` / float NaN) or the pipeline raises.
    """
    # Missing values cannot be tokenized; answer NaN without invoking the pipeline.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return np.nan
    try:
        return len(nlp(text))
    except Exception:
        # Best effort: one bad row yields NaN instead of aborting the whole
        # ``apply``. Catching ``Exception`` (rather than a bare ``except:``)
        # lets KeyboardInterrupt / SystemExit propagate so the cell can be
        # interrupted.
        return np.nan

In [4]:
# Embedding prices used for the cost estimate. The price columns are computed
# as token_counts * price / 1000, so each value is a price per 1,000 tokens.
# NOTE(review): presumably USD for the cheapest / mid / most expensive OpenAI
# embedding models — confirm which models these map to and that they are current.
openai_price_min = 0.00002
openai_price_med = 0.00010
openai_price_max = 0.00013

In [7]:
# Tokenize every lyric; count_token returns NaN for rows it cannot process.
# NOTE(review): spaCy tokens are not the tokens OpenAI bills by, so the cost
# figures derived from this column are an approximation — confirm acceptable.
df["token_counts"] = df["text"].apply(count_token)
# dropna=False keeps a NaN bucket in the tally so failed rows would be visible.
df["token_counts"].value_counts(dropna=False)

token_counts
210     251
188     249
184     248
183     247
228     247
       ... 
1134      1
1053      1
962       1
1028      1
1070      1
Name: count, Length: 957, dtype: int64

In [8]:
# Write the frame (now including token_counts) to disk without the index
# column — presumably a checkpoint so tokenization need not be repeated.
df.to_csv("data/all.csv", index=False)

In [9]:
# Read the saved file back into a separate frame.
# NOTE(review): df_re is not referenced again in the visible cells; the cost
# columns below are computed on df — confirm whether this reload is still needed.
df_re = pd.read_csv("data/all.csv")

In [10]:
# Estimated embedding cost per song at each price point: the prices are per
# 1,000 tokens, hence the division by 1000.
price_by_column = {
    "price_max": openai_price_max,
    "price_med": openai_price_med,
    "price_min": openai_price_min,
}
for column_name, price_per_1k in price_by_column.items():
    df[column_name] = df["token_counts"] * price_per_1k / 1000

In [11]:
# Total estimated cost for the whole corpus, ordered (max, med, min).
tuple(df[column].sum() for column in ("price_max", "price_med", "price_min"))

(2.1542136199999997, 1.6570874000000002, 0.33141748)

In [3]:
# Embedding settings: the model identifier and the number of dimensions
# requested per vector (passed as ``dimensions=`` by _get_embedding).
model_name = "text-embedding-3-large"
max_dimensions = 2048

# The API key is read from the environment; a missing variable raises KeyError.
# NOTE(review): ``load_dotenv`` is imported at the top of the notebook but is
# not called in the visible cells — if the key lives in a .env file, call
# load_dotenv() before this line.
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def _get_embedding(text: str, model: str = model_name):
    """Request an embedding for ``text`` and return the first response item.

    Newlines in ``text`` are replaced with spaces before the request is made.
    The request goes through the notebook-level ``openai_client`` and asks for
    ``max_dimensions`` dimensions. Errors raised by the client are not handled
    here; they propagate to the caller.

    Parameters
    ----------
    text : str
        The text to embed.
    model : str
        Embedding model identifier; defaults to the notebook-level ``model_name``.

    Returns
    -------
    The first element of the response's ``data`` list (callers read its
    ``.embedding`` attribute).
    """
    single_line = text.replace("\n", " ")
    response = openai_client.embeddings.create(
        input=[single_line],
        model=model,
        dimensions=max_dimensions,
    )
    return response.data[0]

def get_embedding(text: str, model_name: str = model_name):
    """Best-effort wrapper around ``_get_embedding``.

    Parameters
    ----------
    text : str
        The text to embed.
    model_name : str
        Embedding model identifier forwarded to ``_get_embedding``.

    Returns
    -------
    The ``.embedding`` attribute of the item returned by ``_get_embedding``,
    or ``np.nan`` if anything raises along the way (the exception is printed,
    not re-raised, so one failed row does not abort a whole ``apply``).
    """
    try:
        item = _get_embedding(text, model_name)
        vector = item.embedding
    except Exception as err:
        print(err)
        return np.nan
    return vector

In [4]:
# One get_embedding call per row. A row whose call fails gets NaN and its error
# message is printed (the "Connection error." lines in the recorded output).
# NOTE(review): nothing here caches partial results or re-tries the NaN rows;
# a re-run repeats every request — consider guarding this cell behind the
# cached file written below.
df["embedding"] = df["text"].apply(get_embedding)

Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.
Connection error.


In [5]:
# Cache the embedded frame in two formats so the embeddings need not be
# requested again.
# NOTE(review): CSV stores each embedding list as its text representation, so
# reading it back yields strings that must be parsed; the JSON file keeps them
# as arrays. Also assumes data/cache/ already exists — confirm.
df.to_csv("data/cache/57k_spotify_songs_with_embeddings.csv", index=False)
df.to_json("data/cache/57k_spotify_songs_with_embeddings.json")

In [7]:
# Reload the cached CSV and preview it.
# NOTE(review): the embedding column presumably comes back as text (e.g.
# "[0.066, ...]"), not as lists of floats — parse it (json.loads) before any
# numeric use; verify with df_loaded["embedding"].map(type).value_counts().
df_loaded = pd.read_csv("data/cache/57k_spotify_songs_with_embeddings.csv")
df_loaded.head()

Unnamed: 0,artist,song,link,text,embedding
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd...","[0.06642276793718338, 0.029657185077667236, -0..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl...","[-0.0038928603753447533, -0.001233731512911617..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...,"[0.013708599843084812, -0.0019766672048717737,..."
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[-0.01959819346666336, -7.80902337282896e-05, ..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[-0.01218902412801981, -0.0025808592326939106,..."


In [8]:
# Summary of every column, including non-numeric ones.
# In the recorded run the embedding count is 57643 of 57650 rows: seven rows
# have no embedding, matching the seven "Connection error." messages printed
# while the embeddings were being requested.
df_loaded.describe(include="all")

Unnamed: 0,artist,song,link,text,embedding
count,57650,57650,57650,57650,57643
unique,643,44824,57650,57494,57495
top,Donna Summer,Have Yourself A Merry Little Christmas,/a/abba/ahes+my+kind+of+girl_20598417.html,I just came back from a lovely trip along the ...,"[-0.00998433493077755, 0.007366901263594627, -..."
freq,191,35,1,6,6
