In [170]:
import json
import os
import re
import sys

import ollama
import pandas as pd

# Add the root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [171]:
from configurations import AUDIO_AND_LYRICS_TABLE_NAME, AUDIO_LYRICS_AND_FEATURE_SUMAMRIZER  

In [None]:

def extract_tags_content(content, tags_list):
    """
    Extract the text enclosed in the given tags and return it as one formatted string.

    For every tag name in ``tags_list`` (in list order), each non-overlapping
    ``<tag>...</tag>`` occurrence in ``content`` is captured, stripped of
    leading/trailing whitespace and emitted as ``"<tag name>:\\n<text>\\n"``.

    Args:
        content (str): The input text containing tagged content
        tags_list (list): List of tag names to extract

    Returns:
        str: The formatted sections joined with a newline (so a blank line
             separates consecutive sections); an empty string when nothing matches
    """
    result = []
    for tag in tags_list:
        # Escape the tag name so characters such as '.' or '+' are matched
        # literally instead of being interpreted as regex syntax.
        escaped_tag = re.escape(tag)
        # Non-greedy match so repeated tags yield one capture per occurrence;
        # DOTALL lets the captured text span multiple lines.
        pattern = f"<{escaped_tag}>(.*?)</{escaped_tag}>"

        for match in re.findall(pattern, content, re.DOTALL):
            # Clean up the extracted content (remove leading/trailing whitespace)
            cleaned_content = match.strip()
            # Add the tagged content with a header to the result
            result.append(f"{tag}:\n{cleaned_content}\n")

    # Each section already ends with a newline, so joining with "\n" leaves a
    # blank line between sections.
    return "\n".join(result)

In [172]:

def audio_features_to_json(df: pd.DataFrame):
    """
    Convert a DataFrame containing audio features to a JSON format suitable for LLM prompting.

    Only the expected audio-feature columns are exported; any other column in
    ``df`` is ignored. A warning is printed once for each expected column that
    is absent. Missing values (None/NaN) are exported as JSON ``null`` rather
    than the non-standard ``NaN`` token.

    Parameters:
    df (pandas.DataFrame): DataFrame with columns for audio features

    Returns:
    str: JSON string (indented by 2 spaces) holding a list with one object per row
    """
    expected_columns = [
        'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence', 'tempo'
    ]

    # Resolve the available columns up front so a missing column is reported
    # once per call instead of once per row.
    available_columns = []
    for col in expected_columns:
        if col in df.columns:
            available_columns.append(col)
        else:
            print(f"Warning: Column '{col}' not found in DataFrame")

    # Create a dictionary for each row in the DataFrame
    features_list = []
    for _, row in df.iterrows():
        features = {}
        for col in available_columns:
            value = row[col]
            # float(None) raises and float('nan') serialises as invalid JSON,
            # so map missing values to None (JSON null).
            features[col] = None if pd.isna(value) else float(value)
        features_list.append(features)

    # Convert to JSON string with indentation for readability
    return json.dumps(features_list, indent=2)

In [173]:
from credentials import SUPABASE_URL, SUPABASE_KEY
from supabase import create_client, Client
import asyncio
from itertools import islice
import pandas as pd

# Take the API key from the local `credentials` module instead of embedding it
# in the notebook.
# NOTE(review): a Supabase JWT used to be hardcoded on this line; since it has
# been stored in the notebook it should be treated as exposed and rotated.
key = SUPABASE_KEY

# Initialize the client
async def create_supabase_connection():
    """
    Create a Supabase client from the credentials imported above.

    Declared ``async`` so existing callers can keep awaiting it, although
    nothing inside is awaited.

    Returns:
        Client: the object returned by ``create_client``
    """
    # NOTE(review): assumes credentials.SUPABASE_URL points at the same project
    # as the URL that was previously hardcoded here — confirm.
    supabase: Client = create_client(SUPABASE_URL, key)
    print("Supabase connection created: ", supabase)
    return supabase

In [174]:
# Create the Supabase client used by the data-fetching cell below
# (top-level await: requires a kernel that supports awaiting at cell level).
supabase_client = await create_supabase_connection()

Supabase connection created:  <supabase._sync.client.SyncClient object at 0x16b86f640>


In [175]:
async def fetch_all_data_from_table(supabase_client, table_name):
    """
    Select every column of a Supabase table and return the fetched rows.

    Parameters:
    -----------
    supabase_client : Client
        An initialized Supabase client object
    table_name : str
        The name of the table to query

    Returns:
    --------
    The ``data`` attribute of the executed query response
    (the rows returned for the table).

    Raises:
    -------
    Exception
        Any error raised while querying is logged and then re-raised unchanged
    """
    try:
        # Build a select-all query for the table, then run it
        select_all = supabase_client.table(table_name).select('*')
        rows = select_all.execute().data

        print(f"Successfully fetched {len(rows)} rows from table '{table_name}'")
        return rows

    except Exception as err:
        # Log for the notebook reader, then let the caller handle the failure
        print(f"Error fetching data from table '{table_name}': {str(err)}")
        raise

In [176]:
df = await fetch_all_data_from_table(supabase_client,AUDIO_AND_LYRICS_TABLE_NAME)

Successfully fetched 4822 rows from table 'summary_lyrics_plus_features'


In [178]:
def format_prompt(lyrics, audio_features_json):
    """
    Build the model prompt by filling the template placeholders.

    The ``{{LYRICS}}`` placeholder is substituted first, then
    ``{{AUDIO_FEATURES}}``.

    Parameters:
    -----------
    lyrics (str): The song lyrics
    audio_features_json (str): JSON string of audio features

    Returns:
    --------
    str: The template text with both placeholders replaced
    """
    # Step 1: insert the lyrics into the template
    prompt_with_lyrics = AUDIO_LYRICS_AND_FEATURE_SUMAMRIZER.replace("{{LYRICS}}", lyrics)

    # Step 2: insert the audio features into the result of step 1
    return prompt_with_lyrics.replace("{{AUDIO_FEATURES}}", audio_features_json)

In [179]:
async def prompt_ollama_model(lyrics, audio_features, model='gemma3:12b'):
    """
    Send the formatted lyrics/audio-features prompt to an Ollama chat model.

    Parameters:
    -----------
    lyrics (str): The song lyrics
    audio_features (str): JSON string of audio features
    model (str): Name of the Ollama model to query; defaults to 'gemma3:12b',
        the value that was previously hard-coded

    Returns:
    --------
    str: The text content of the model's reply message
    """
    formatted_prompt = format_prompt(lyrics, audio_features)

    print("updated prompt is: ", formatted_prompt)

    # ollama.chat is called without await, so it runs to completion before
    # this coroutine yields control.
    response = ollama.chat(
        model=model,
        messages=[
            {
                'role': 'user',
                'content': formatted_prompt
            }
        ]
    )
    reply_text = response['message']['content']
    print("response is: ", reply_text)
    return reply_text

In [180]:
# Keep only the first 5 rows for a small trial run
# (assumes `df` is a pandas DataFrame at this point — TODO confirm).
sample_df = df.head(5)

In [None]:
import pandas as pd
import json
import ollama
import asyncio
import nest_asyncio

# Apply nest_asyncio to allow nested event loops (important for Jupyter notebooks)
nest_asyncio.apply()

async def process_song_data(df):
    """
    Generate a model description for every row of ``df`` that has lyrics.

    For each row the audio features are serialised with
    ``audio_features_to_json``, sent together with the lyrics to
    ``prompt_ollama_model``, and the ``<song_description>`` section of the
    reply is kept. Rows without lyrics are skipped with a warning; a row whose
    processing raises is reported and skipped so one failure does not abort
    the whole run.

    Parameters:
    df (pandas.DataFrame): rows to process; the 'lyrics' column is read, and
        'spotify_id' / 'name' are copied to the output when present

    Returns:
    list[dict]: one dict per processed row with the keys
        'index', 'spotify_id', 'name' and 'response'
    """
    results = []
    total_rows = len(df)

    # Track the position separately from the index label: the label may be
    # non-contiguous or non-numeric, in which case `index + 1` would be
    # misleading or raise inside the try block and skip every row.
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        try:
            print(f"Processing row {position}/{total_rows}")

            # Extract lyrics
            lyrics = row.get('lyrics', '')
            if not lyrics or pd.isna(lyrics):
                print(f"Warning: No lyrics found for row {index}")
                continue

            # Single-row DataFrame built from this row; audio_features_to_json
            # picks out the feature columns it needs
            audio_features_df = pd.DataFrame([row])
            audio_features_json = audio_features_to_json(audio_features_df)

            # Ask the model for a description of the song
            response = await prompt_ollama_model(lyrics, audio_features_json)

            # Keep only the <song_description> section of the reply
            cleaned_response = extract_tags_content(response, ['song_description'])

            # Store result with index for reference
            results.append({
                'index': index,
                'spotify_id': row.get('spotify_id', f'unknown_{index}'),
                'name': row.get('name', f'unknown_{index}'),
                'response': cleaned_response
            })

            # Optional: add a delay to avoid rate limits
            await asyncio.sleep(0.5)

        except Exception as e:
            # Best effort: report the failure and move on to the next row
            print(f"Error processing row {index}: {str(e)}")
            continue

    return results

def run_analysis(df):
    """
    Run ``process_song_data`` to completion from synchronous code.

    Parameters:
    df (pandas.DataFrame): rows to pass to ``process_song_data``

    Returns:
    pandas.DataFrame: one row per successfully processed song

    Raises:
    RuntimeError: re-raised when running the loop fails for a reason other
        than "This event loop is already running"
    """
    # Use a private loop without installing it as the thread's current loop,
    # so the loop the notebook kernel relies on is left untouched.
    loop = asyncio.new_event_loop()
    try:
        results = loop.run_until_complete(process_song_data(df))
    except RuntimeError as e:
        if "This event loop is already running" in str(e):
            # If we're in a Jupyter notebook or similar environment with an existing event loop
            print("Using nest_asyncio to run with existing event loop")
            results = asyncio.run(process_song_data(df))
        else:
            raise
    finally:
        # Release the loop's resources; previously the loop was never closed
        loop.close()

    # Convert results to DataFrame for easier analysis
    results_df = pd.DataFrame(results)

    print(f"Processed {len(results_df)} songs successfully")
    return results_df

In [182]:
# Run the description pipeline on the sample rows; `results` is the DataFrame
# returned by run_analysis.
results = run_analysis(sample_df)

Processing row 1/5
updated prompt is:  
You are tasked with creating a comprehensive textual representation of a song by analyzing both its lyrics and audio features. This representation will be used in a recommendation system that matches songs to user preferences through cosine similarity.

You will be provided with two inputs:

1. Complete song lyrics:
<lyrics>
Is it easier to stay? Is it easier to go?
I don't wanna know, oh
But I know that I'm never, ever gonna change
And you know you don't want it any other way


Why do we always gotta run away?
And we wind up in the same place
It's like we're looking for the same thing
Same thing, yeah
Yeah, do we really gotta do this now?
Right here with all your friends around
In the morning, we can work it out
Work it out


I love you so much that I hate you
Right now, it's so hard to blame you
'Cause you're so damn beautiful
So damn beautiful


Is it easier to stay? Is it easier to go?
I don't wanna know, oh
But I know that I'm never, ever go

In [183]:
# Save the generated descriptions to 'sample.csv' in the current working directory.
results.to_csv('sample.csv')