In [1]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)




  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


In [2]:
import pandas as pd
from transformers import AutoTokenizer, pipeline

# Load pre-trained tokenizer and classifier (using your specific classifier)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Load the CSV files
lyrics_df = pd.read_csv("lyrics_CSV.csv")
lyrics_sum_df = pd.read_csv("Sum_lyrics.csv")

# Ensure both files have the same number of rows
if len(lyrics_df) != len(lyrics_sum_df):
    raise ValueError("The two CSV files must have the same number of rows.")

# Function to truncate lyrics to 512 tokens
def truncate_to_512_tokens(text):
    encoded_input = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    return tokenizer.decode(encoded_input['input_ids'][0], skip_special_tokens=True)

# Function to classify lyrics and return the emotion with the highest score
def get_top_emotion(text):
    result = classifier(text)[0]
    # Select the label with the highest score from the returned scores
    return max(result, key=lambda x: x['score'])['label']

# Add an Emotion column to both DataFrames
lyrics_df['Emotion'] = lyrics_df['Lyrics'].apply(lambda x: get_top_emotion(truncate_to_512_tokens(x)))
lyrics_sum_df['Emotion'] = lyrics_sum_df['Sum Lyrics'].apply(lambda x: get_top_emotion(x))

# Now compare the Emotion columns line by line
total_songs = len(lyrics_df)
matches = sum(lyrics_df['Emotion'].str.lower() == lyrics_sum_df['Emotion'].str.lower())

# Calculate the accuracy percentage
accuracy = (matches / total_songs) * 100

# Print the accuracy result
print(f"Accuracy of emotion matching between truncated and non-truncated lyrics: {accuracy:.2f}%")

# Save the updated CSVs with Emotion fields
lyrics_df.to_csv("lyrics_with_emotions_truncated.csv", index=False)
lyrics_sum_df.to_csv("lyrics_sum_with_emotions.csv", index=False)


Accuracy of emotion matching between truncated and non-truncated lyrics: 79.61%


In [3]:
import pandas as pd
from transformers import AutoTokenizer, pipeline

# Load pre-trained tokenizer and classifier
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Load the CSV files
lyrics_df = pd.read_csv("lyrics_CSV.csv")
lyrics_sum_df = pd.read_csv("Sum_lyrics.csv")

# Function to split lyrics into chunks of less than 512 tokens
def split_into_chunks(text, max_tokens=512):
    tokens = tokenizer.encode(text)
    return [tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True) 
             for i in range(0, len(tokens), max_tokens) if len(tokens[i:i + max_tokens]) < max_tokens]

# Function to classify and average scores
def classify_and_average(text):
    chunks = split_into_chunks(text)
    total_scores = {}
    num_chunks = len(chunks)

    for chunk in chunks:
        results = classifier(chunk)[0]
        for result in results:
            label = result['label']
            score = result['score']
            if label not in total_scores:
                total_scores[label] = 0
            total_scores[label] += score
    
    # Average scores
    for label in total_scores:
        total_scores[label] /= num_chunks
    
    # Get the emotion with the highest average score
    return max(total_scores, key=total_scores.get)

# Add an Emotion column to the lyrics DataFrame
lyrics_df['Emotion'] = lyrics_df['Lyrics'].apply(classify_and_average)

# For the second file, classify the full lyrics
lyrics_sum_df['Emotion'] = lyrics_sum_df['Sum Lyrics'].apply(lambda x: get_top_emotion(x))

# Compare the Emotion columns line by line
total_songs = len(lyrics_df)
matches = sum(lyrics_df['Emotion'].str.lower() == lyrics_sum_df['Emotion'].str.lower())

# Calculate the accuracy percentage
accuracy = (matches / total_songs) * 100

# Print the accuracy result
print(f"Accuracy of emotion matching between split and non-split lyrics: {accuracy:.2f}%")

# Save the updated CSVs with Emotion fields
lyrics_df.to_csv("lyrics_with_emotions_split.csv", index=False)
lyrics_sum_df.to_csv("lyrics_sum_with_emotions.csv", index=False)


  return torch.load(checkpoint_file, map_location=map_location)
Token indices sequence length is longer than the specified maximum sequence length for this model (1293 > 1024). Running this sequence through the model will result in indexing errors


Accuracy of emotion matching between split and non-split lyrics: 76.34%


In [14]:
import pandas as pd
from transformers import pipeline

# Load the classifier
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Load the lyrics CSV
sum_lyrics_df = pd.read_csv('Sum_lyrics.csv')

# Set the limit for classification
limit = 0.5

# Prepare emotion-specific CSV files
emotions = ['anger', 'disgust', 'fear', 'surprise', 'joy', 'sadness', 'neutral']
for emotion in emotions:
    with open(f"{emotion}.csv", 'w') as f:
        f.write("Artist Name,Song Name,Sum Lyrics\n")

# Prepare a buffer for each artist
artist_buffers = {}
for index, row in sum_lyrics_df.iterrows():
    artist_name = row['Artist Name']
    if artist_name not in artist_buffers:
        artist_buffers[artist_name] = []

    # Classify the song
    try:
        result = classifier(row['Sum Lyrics'])
        # Get the emotion with the maximum score
        max_emotion = max(result[0], key=lambda x: x['score'])
        max_emotion_name = max_emotion['label'].lower()

        # Check against the limit
        if max_emotion['score'] >= limit:
            # Save to the corresponding emotion CSV
            with open(f"{max_emotion_name}.csv", 'a') as f:
                f.write(f"{artist_name},{row['Song Name']},{row['Sum Lyrics']}\n")
            print(f"{artist_name} {row['Song Name']} classified as emotion csv: {max_emotion_name}")
        else:
            # Save to the artist buffer
            artist_buffers[artist_name].append([row['Song Name'], row['Sum Lyrics'], max_emotion_name])
            print(f"{artist_name} {row['Song Name']} classified as buffer csv: {max_emotion_name}")

    except Exception as e:
        print(f"Error processing {row['Song Name']} by {artist_name}: {e}")

# Save the buffer files for each artist
for artist_name, songs in artist_buffers.items():
    if songs:  # Only create a buffer file if there are songs
        with open(f"{artist_name}_buffer.csv", 'w') as f:
            f.write("Song Name,Sum Lyrics,emotion\n")
            for song in songs:
                f.write(f"{song[0]},{song[1]},{song[2]}\n")


  return torch.load(checkpoint_file, map_location=map_location)


Ed Sheeran Legacy classified as emotion csv: neutral
Ed Sheeran Still Don't Give A Fuck classified as buffer csv: anger
Ed Sheeran Asshole classified as emotion csv: anger
Ed Sheeran Cum On Everybody classified as buffer csv: anger
Ed Sheeran W.T.P. classified as emotion csv: neutral
Ed Sheeran Beautiful classified as emotion csv: sadness
Ed Sheeran Medicine Ball classified as buffer csv: disgust
Ed Sheeran My Mom classified as emotion csv: neutral
Ed Sheeran So Far... classified as emotion csv: surprise
Ed Sheeran Yellow Brick Road classified as emotion csv: neutral
Ed Sheeran When I'm Gone classified as emotion csv: sadness
Ed Sheeran Rhyme Or Reason classified as buffer csv: surprise
Ed Sheeran My Fault classified as buffer csv: surprise
Ed Sheeran Hello classified as emotion csv: anger
Ed Sheeran Underground classified as buffer csv: neutral
Ed Sheeran Hell Breaks Loose classified as buffer csv: neutral
Ed Sheeran Stan classified as emotion csv: surprise
Ed Sheeran Cleanin Out My C

In [1]:
# import pandas as pd
# from transformers import pipeline
# import os

# # Initialize the classifier
# classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# # Define the limit for classification
# limit_new = 0.5

# # List of artist buffer CSV files
# buffer_files = [
#     "Ariana Grande_buffer.csv",
#     "Coldplay_buffer.csv",
#     "Ed Sheeran_buffer.csv",
#     "Eminem_buffer.csv",
#     "Lana Del Rey_buffer.csv",
#     "Maroon 5_buffer.csv",
#     "Michael Jackson_buffer.csv",
#     "One Direction_buffer.csv",
#     "Rihanna_buffer.csv",
#     "Taylor Swift_buffer.csv"
# ]

# # List of emotions and corresponding CSV names
# emotions = ['anger', 'disgust', 'fear', 'surprise', 'joy', 'sadness', 'neutral']
# emotion_csv_files = {emotion: f"{emotion}.csv" for emotion in emotions}

# # Function to classify and update the buffer
# def classify_and_update_buffer():
#     for buffer_file in buffer_files:
#         # Load the buffer CSV
#         buffer_df = pd.read_csv(buffer_file)
        
#         # Strip whitespace from column names
#         buffer_df.columns = buffer_df.columns.str.strip()
        
#         # Debugging: print the columns of the DataFrame
#         print(f"Columns in {buffer_file}: {buffer_df.columns.tolist()}")
        
#         # Check if the buffer CSV is empty
#         if buffer_df.empty:
#             print(f"{buffer_file} is empty. Skipping.")
#             continue
        
#         # Process each song in the buffer
#         for index, row in buffer_df.iterrows():
#             try:
#                 song_name = row['Song Name']  # Accessing column names
#                 sum_lyrics = row['Sum Lyrics']
#                 artist_name = buffer_file.split('_')[0]  # Extract artist name from buffer file name
                
#                 # Classify the song
#                 results = classifier(sum_lyrics)
#                 max_score_info = max(results, key=lambda x: x['score'])
#                 emotion = max_score_info['label']
#                 score = max_score_info['score']
                
#                 # Check if the score meets the threshold
#                 if score >= limit_new:
#                     # Save to corresponding emotion CSV
#                     emotion_df = pd.DataFrame([[artist_name, song_name, sum_lyrics]], columns=["Artist Name", "Song Name", "Sum Lyrics"])
#                     emotion_df.to_csv(emotion_csv_files[emotion], mode='a', header=not os.path.exists(emotion_csv_files[emotion]), index=False)

#                     # Print classification message
#                     print(f"{artist_name} {song_name} classified as {emotion}")

#                     # Remove the classified song from the buffer DataFrame
#                     buffer_df = buffer_df.drop(index)

#             except KeyError as e:
#                 print(f"KeyError: {e} in file {buffer_file} for index {index}. Row data: {row}")
#             except Exception as e:
#                 print(f"Error processing {buffer_file}: {e}")

#         # Save the updated buffer CSV back
#         buffer_df.to_csv(buffer_file, index=False)

# # Run the classification and update process
# classify_and_update_buffer()


In [4]:
import pandas as pd
import os

# Define the emotions
emotions = ['anger', 'disgust', 'fear', 'surprise', 'joy', 'sadness', 'neutral']

# Dictionary to hold the counts of emotions for each artist
artist_emotion_counts = {}

# Process each emotion CSV file
for emotion in emotions:
    emotion_file = f"{emotion}.csv"
    if os.path.exists(emotion_file):
        df = pd.read_csv(emotion_file)
        for _, row in df.iterrows():
            artist_name = row['Artist Name']
            if artist_name not in artist_emotion_counts:
                artist_emotion_counts[artist_name] = {emotion: 0 for emotion in emotions}
            # Increment the count of this emotion for the artist
            artist_emotion_counts[artist_name][emotion] += 1

# Print statistics for each artist
for artist_name, emotion_count in artist_emotion_counts.items():
    print(f"Statistics for {artist_name}:")
    for emotion, count in emotion_count.items():
        print(f"  {emotion.capitalize()}: {count}")
    print("\n" + "-"*40 + "\n")


Statistics for Ed Sheeran:
  Anger: 10
  Disgust: 4
  Fear: 5
  Surprise: 3
  Joy: 1
  Sadness: 3
  Neutral: 5

----------------------------------------

Statistics for Taylor Swift:
  Anger: 2
  Disgust: 0
  Fear: 9
  Surprise: 4
  Joy: 0
  Sadness: 11
  Neutral: 2

----------------------------------------

Statistics for Rihanna:
  Anger: 4
  Disgust: 0
  Fear: 12
  Surprise: 3
  Joy: 2
  Sadness: 24
  Neutral: 6

----------------------------------------

Statistics for Ariana Grande:
  Anger: 1
  Disgust: 0
  Fear: 1
  Surprise: 0
  Joy: 1
  Sadness: 4
  Neutral: 2

----------------------------------------

Statistics for Lana Del Rey:
  Anger: 3
  Disgust: 0
  Fear: 6
  Surprise: 3
  Joy: 4
  Sadness: 6
  Neutral: 0

----------------------------------------

Statistics for Maroon 5:
  Anger: 4
  Disgust: 0
  Fear: 22
  Surprise: 4
  Joy: 0
  Sadness: 11
  Neutral: 1

----------------------------------------

Statistics for Michael Jackson:
  Anger: 7
  Disgust: 2
  Fear: 31
  Surpr

In [2]:
import pandas as pd
from transformers import pipeline

# Initialize the emotion classifier
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Artist emotions and factors
artist_factors = {
    "Ariana Grande": {"emotion": "joy", "factor": 1.3},
    "Coldplay": {"emotion": "sadness", "factor": 1.4},
    "Ed Sheeran": {"emotion": "neutral", "factor": 1.1},
    "Eminem": {"emotion": "anger", "factor": 1.5},
    "Lana Del Rey": {"emotion": "sadness", "factor": 1.6},
    "Maroon 5": {"emotion": "joy", "factor": 1.2},
    "Michael Jackson": {"emotion": "surprise", "factor": 1.4},
    "One Direction": {"emotion": "joy", "factor": 1.2},
    "Rihanna": {"emotion": "disgust", "factor": 1.3},
    "Taylor Swift": {"emotion": "fear", "factor": 1.3}
}

# Emotion list for reference
emotions = ['anger', 'disgust', 'fear', 'surprise', 'joy', 'sadness', 'neutral']

# Function to classify the songs for an artist
def classify_artist_songs(artist_name, buffer_file):
    # Read the artist buffer file
    buffer_df = pd.read_csv(buffer_file)
    
    # Prepare to store rows for classified songs
    classified_songs = []

    # Get the specific emotion and factor for the artist
    emotion_info = artist_factors[artist_name]
    artist_emotion = emotion_info["emotion"]
    artist_factor = emotion_info["factor"]

    # Iterate through each song in the buffer file
    for index, row in buffer_df.iterrows():
        song_name = row["Song Name"]
        sum_lyrics = row["Sum Lyrics"]

        # Classify the lyrics using the pipeline
        scores = classifier(sum_lyrics)[0]
        
        # Multiply the designated emotion's score by the factor
        for score_dict in scores:
            if score_dict["label"] == artist_emotion:
                score_dict["score"] *= artist_factor
        
        # Find the emotion with the max score
        max_score_emotion = max(scores, key=lambda x: x["score"])["label"]

        # Print classification details
        print(f"{artist_name} - {song_name} classified as {max_score_emotion}")

        # Append to the classified songs list (for saving later)
        classified_songs.append({
            "Artist Name": artist_name,
            "Song Name": song_name,
            "Sum Lyrics": sum_lyrics,
            "Emotion": max_score_emotion
        })

    # Save classified songs to the respective emotion CSV
    for emotion in emotions:
        emotion_df = pd.DataFrame([song for song in classified_songs if song["Emotion"] == emotion])
        if not emotion_df.empty:
            emotion_df.to_csv(f"{emotion}.csv", mode='a', index=False, header=False)

    # Save updated buffer CSV (removing classified songs)
    buffer_df = buffer_df[~buffer_df["Song Name"].isin([song["Song Name"] for song in classified_songs])]
    buffer_df.to_csv(buffer_file, index=False)

# Loop through all artists and classify their buffer songs
for artist, data in artist_factors.items():
    buffer_file = f"{artist}_buffer.csv"
    classify_artist_songs(artist, buffer_file)


  return torch.load(checkpoint_file, map_location=map_location)


Ed Sheeran - Still Don't Give A Fuck classified as anger
Ed Sheeran - Cum On Everybody classified as sadness
Ed Sheeran - Medicine Ball classified as neutral
Ed Sheeran - Rhyme Or Reason classified as surprise
Ed Sheeran - My Fault classified as surprise
Ed Sheeran - Underground classified as neutral
Ed Sheeran - Hell Breaks Loose classified as neutral
Ed Sheeran - Trapped classified as anger
Ed Sheeran - Drug Ballad classified as disgust
Ed Sheeran - Desperation classified as surprise
Ed Sheeran - Bad Guy classified as sadness
Ed Sheeran - Sing For The Moment classified as disgust
Ed Sheeran - Puke classified as disgust
Ed Sheeran - Marshall Mathers classified as anger
Ed Sheeran - You Don't Know classified as fear
Ed Sheeran - Evil Deeds classified as neutral
Ed Sheeran - Buffalo Bill classified as surprise
Ed Sheeran - Say Goodbye Hollywood classified as neutral
Ed Sheeran - Who Knew classified as surprise
Ed Sheeran - Forever classified as anger
Ed Sheeran - Old Time's Sake classif