In [None]:
import nltk
# Download the NLTK resources this notebook relies on.
# 'stopwords' backs stopwords.words('english') used in preprocess_lyrics_for_topics.
nltk.download('stopwords')
# NOTE(review): 'punkt' is presumably required by the tokenizers behind Rake/TextBlob — confirm.
nltk.download('punkt')



In [None]:
import ast
import json
import os

import gensim.corpora as corpora
import nltk
import pandas as pd
from gensim.models import LdaModel
from nltk.corpus import stopwords
from rake_nltk import Rake
from textblob import TextBlob
from tqdm import tqdm
# Make sure the stopwords corpus read by preprocess_lyrics_for_topics is available locally
nltk.download('stopwords')

# Function to print topics with words
def print_topics(lda_model, id2word, num_words=10):
    """Print every topic of ``lda_model`` together with its word summary.

    Args:
        lda_model: Object exposing ``print_topics(num_topics, num_words)`` that
            yields ``(topic_index, topic_description)`` pairs.
        id2word: Accepted for interface compatibility; not used by this function.
        num_words: Number of words requested per topic.
    """
    # -1 is forwarded as the topic count, exactly as the model call expects it
    topic_summaries = lda_model.print_topics(-1, num_words)
    for topic_index, description in topic_summaries:
        message = f"Topic: {topic_index}\nWords: {description}\n"
        print(message)

# Function to preprocess lyrics for sentiment and keyword extraction
def preprocess_lyrics(lyrics):
    """Flatten a lyrics mapping into a single space-separated string.

    Args:
        lyrics: Mapping whose values are iterables of strings; each value is
            joined with spaces, and the joined values are joined with spaces
            again in the mapping's iteration order.

    Returns:
        The combined lyrics text.
    """
    joined_sections = []
    for section_lines in lyrics.values():
        joined_sections.append(' '.join(section_lines))
    return ' '.join(joined_sections)

# Function to preprocess lyrics for topic modeling
def preprocess_lyrics_for_topics(lyrics):
    """Tokenize lyrics text for topic modeling.

    The text is lower-cased and split on whitespace; a token is kept only when
    it is purely alphabetic and is not an English stop word.

    Args:
        lyrics: Lyrics as a single string.

    Returns:
        List of kept tokens, in their original order.
    """
    english_stops = set(stopwords.words('english'))
    tokens = []
    for token in lyrics.lower().split():
        if token.isalpha() and token not in english_stops:
            tokens.append(token)
    return tokens

# Function to perform sentiment analysis
def sentiment_analysis(lyrics):
    """Return the TextBlob sentiment polarity computed for ``lyrics``."""
    return TextBlob(lyrics).sentiment.polarity

# Function to extract keywords from lyrics
def extract_keywords(lyrics, num_keywords=10):
    """Extract key phrases from lyrics text with RAKE.

    Args:
        lyrics: Lyrics as a single string.
        num_keywords: Maximum number of phrases to return. Defaults to 10,
            which matches the previously hard-coded limit.

    Returns:
        The first ``num_keywords`` entries of ``Rake.get_ranked_phrases()``.
    """
    rake = Rake()
    rake.extract_keywords_from_text(lyrics)
    return rake.get_ranked_phrases()[:num_keywords]

def get_topic_words(lda_model, topic_id, num_words=5):
    """Return the words describing one topic, without their weights.

    Args:
        lda_model: Object exposing ``show_topic(topic_id, topn=...)`` that
            returns ``(word, weight)`` pairs.
        topic_id: Identifier of the topic to describe.
        num_words: Number of pairs requested from the model.

    Returns:
        List of the words, in the order the model returned them.
    """
    words = []
    for term, _weight in lda_model.show_topic(topic_id, topn=num_words):
        words.append(term)
    return words

# Path to your JSON files directory
json_files_dir = 'archive-new/json'
lyrics_corpus = []  # Token lists (one per song) for topic modeling

# One record per song that has lyrics. Records are appended in lockstep with
# lyrics_corpus, so data[i] and lyrics_corpus[i] always describe the same song.
data = []

# os.listdir returns every directory entry in arbitrary order. Keep only
# regular *.json files (skipping things like .ipynb_checkpoints or .DS_Store,
# which would make json.load fail) and sort them so repeated runs process the
# songs in the same order.
json_file_names = sorted(
    name for name in os.listdir(json_files_dir)
    if name.lower().endswith('.json')
    and os.path.isfile(os.path.join(json_files_dir, name))
)

# Process each JSON file
for file in tqdm(json_file_names, desc='Processing JSON files'):
    file_path = os.path.join(json_files_dir, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # The document is fully parsed, so the file handle can be released before
    # the (slower) text analysis below.
    midi_file_path = json_data.get('midi_file_path', '')
    artist_name = json_data.get('artist_band_name', '')
    song_title = json_data.get('song_title', '')
    lyrics = json_data.get('lyrics', {})
    spotify_metadata = json_data.get('spotify_metadata', {})

    # Songs without lyrics contribute neither a record nor a corpus entry
    if not lyrics:
        continue

    lyrics_text = preprocess_lyrics(lyrics)
    sentiment = sentiment_analysis(lyrics_text)
    keywords = extract_keywords(lyrics_text)
    lyrics_corpus.append(preprocess_lyrics_for_topics(lyrics_text))  # Add to topic modeling corpus

    # Add to data list
    data.append({
        'midi_file_path': midi_file_path,
        'artist_name': artist_name,
        'song_title': song_title,
        'sentiment': sentiment,
        'keywords': keywords,
        'spotify_metadata': spotify_metadata
    })

# Topic modeling: build the vocabulary, convert each song's tokens to a
# bag-of-words vector, then fit an LDA model on those vectors.
id2word = corpora.Dictionary(lyrics_corpus)
corpus = [id2word.doc2bow(song_tokens) for song_tokens in lyrics_corpus]
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
)

# print_topics(lda_model, id2word)

# Label every song with the words of its highest-probability topic.
# data and corpus were filled in lockstep, so pairing them positionally is safe.
for row, bow in zip(data, corpus):
    topics_distribution = lda_model.get_document_topics(bow)
    dominant_topic, _probability = max(topics_distribution, key=lambda pair: pair[1])
    row['topics'] = ' '.join(get_topic_words(lda_model, dominant_topic))


# # Assign topics to songs
# topics = [lda_model.get_document_topics(bow) for bow in corpus]
# for i, row in enumerate(data):
#     row['topics'] = topics[i]

# Convert the collected per-song records (a list of dicts) into a DataFrame,
# one row per song and one column per record key
df = pd.DataFrame(data)


In [None]:
# Preview the first 20 rows of the assembled DataFrame
df.head(20)

In [None]:
def _parse_spotify_metadata(spotify_data):
    """Return Spotify metadata as a dict, or ``{}`` when it cannot be read as one."""
    if isinstance(spotify_data, str):
        try:
            spotify_data = ast.literal_eval(spotify_data)  # Safely evaluate string to a Python literal
        except (ValueError, SyntaxError, TypeError):
            # literal_eval raises SyntaxError (not only ValueError) on malformed text
            return {}
    # None, NaN, or a literal that is not a dict has no .get(); treat as "no metadata"
    return spotify_data if isinstance(spotify_data, dict) else {}


def generate_prompt(row):
    """Build a one-sentence natural-language description of a song.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'artist_name', 'song_title', 'sentiment', 'keywords',
            'spotify_metadata' and 'topics'. 'spotify_metadata' may be a dict
            or the string representation of one; anything else is treated as
            empty metadata.

    Returns:
        The prompt string: artist and title, a tone derived from the sign of
        'sentiment', up to three keywords, up to two genres and the tempo when
        the metadata has 'genre' / 'tempo' entries, and the topic words when
        'topics' is non-empty.
    """
    # Extracting data from the row
    sentiment = row['sentiment']
    keywords = row['keywords']
    topics = row['topics']
    spotify_dict = _parse_spotify_metadata(row['spotify_metadata'])

    # Creating a base prompt
    prompt = f"{row['artist_name']}'s song '{row['song_title']}'"

    # Adding sentiment
    if sentiment > 0:
        prompt += " has a positive tone"
    elif sentiment < 0:
        prompt += " has a negative tone"
    else:
        prompt += " has a neutral tone"

    # Adding keywords
    if keywords:
        prompt += f" and includes themes like {', '.join(keywords[:3])}"

    # Adding Spotify data (example: genre, tempo)
    genres = spotify_dict.get('genre')
    if genres:
        if isinstance(genres, str):
            # A single genre given as a plain string must not be sliced per character
            genres = [genres]
        prompt += f" and belongs to genres such as {', '.join(str(genre) for genre in genres[:2])}"
    if spotify_dict.get('tempo'):
        prompt += f" with a tempo of around {spotify_dict['tempo']} BPM"

    # Adding topics
    if topics:
        prompt += f". The lyrics often reflect topics such as {topics}"

    return prompt

In [None]:
# Call generate_prompt once per row (axis=1) and store the returned text
# in a new 'prompt' column
df['prompt'] = df.apply(generate_prompt, axis=1)

In [None]:
# Preview the first 10 rows of the DataFrame
df.head(10)

In [None]:

# Write the DataFrame to CSV in the working directory, omitting the index column.
# NOTE(review): the 'keywords' lists and 'spotify_metadata' dicts will presumably be
# written as their string representations — confirm how downstream readers parse them.
df.to_csv('processed_lyrics_sentiment_keywords.csv', index=False)