In [4]:
import pandas as pd
from transformers import pipeline, AutoTokenizer

# Load the summarization model and tokenizer
model_name = "facebook/bart-large-cnn"
summarizer = pipeline("summarization", model=model_name, framework="pt")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the original CSV file
df = pd.read_csv("lyrics_CSV.csv")  # Replace 'lyrics_CSV.csv' with your CSV file path

# Define the summarization function
def summarize_lyrics(lyrics, song_name):
    if pd.isna(lyrics) or not lyrics.strip():  # Check for NaN or empty lyrics
        print(f"{song_name} lyrics are empty or NaN, added directly.")
        return lyrics
    
    # Tokenize the lyrics
    tokens = tokenizer(lyrics, return_tensors='pt', truncation=False)["input_ids"][0]
    token_count = len(tokens)

    # If initial tokens are less than 512, return the original lyrics
    if token_count < 512:
        print(f"{song_name} lyrics are less than 512 tokens, added directly.")
        return lyrics
    
    # If token count exceeds 1024, truncate before summarization
    if token_count > 1024:
        tokens = tokens[:1024]  # Keep only the first 1024 tokens
        lyrics = tokenizer.decode(tokens, skip_special_tokens=True)
        print(f"{song_name} lyrics truncated to 1024 tokens.")

    # Summarize the lyrics
    try:
        summary = summarizer(lyrics, max_length=512, min_length=0, do_sample=False)
        print(f"{song_name} lyrics summarized")
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing {song_name}: {e}")
        return lyrics  # Return original lyrics if there's an error

# Initialize a counter for song names
for idx, row in df.iterrows():
    df.at[idx, 'Summarized_Lyrics'] = summarize_lyrics(row['Lyrics'], row['Song Name'])

# Save the dataframe with summarized lyrics to a new CSV file
df.to_csv("Lyrics_Sum.csv", index=False)


Legacy lyrics summarized


Token indices sequence length is longer than the specified maximum sequence length for this model (1293 > 1024). Running this sequence through the model will result in indexing errors


Still Don't Give A Fuck lyrics summarized
Asshole lyrics truncated to 1024 tokens.
Asshole lyrics summarized


Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors


Cum On Everybody lyrics summarized
W.T.P. lyrics truncated to 1024 tokens.
Error summarizing W.T.P.: index out of range in self
Beautiful lyrics truncated to 1024 tokens.
Error summarizing Beautiful: index out of range in self


KeyboardInterrupt: 

In [5]:
import pandas as pd
import string
import re
from transformers import pipeline, AutoTokenizer

# Load the summarization model and tokenizer
model_name = "facebook/bart-large-cnn"
summarizer = pipeline("summarization", model=model_name, framework="pt")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to clean text
def clean_text(text):
    # Remove punctuation using string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra spaces
    text = re.sub(' +', ' ', text)  # Replace multiple spaces with a single space
    
    # Remove any remaining symbols (e.g., #, $, %, &)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    
    # Strip leading/trailing spaces
    text = text.strip()
    
    return text

# Load the original CSV file
df = pd.read_csv("lyrics_CSV.csv")  # Replace 'lyrics_CSV.csv' with your CSV file path

# Define the summarization function
def summarize_lyrics(lyrics, song_name):
    if pd.isna(lyrics) or not lyrics.strip():  # Check for NaN or empty lyrics
        print(f"{song_name} lyrics are empty or NaN, added directly.")
        return lyrics
    
    # Clean the lyrics
    cleaned_lyrics = clean_text(lyrics)
    
    # Tokenize the cleaned lyrics
    tokens = tokenizer(cleaned_lyrics, return_tensors='pt', truncation=False)["input_ids"][0]
    token_count = len(tokens)

    # If initial tokens are less than 512, return the original cleaned lyrics
    if token_count < 512:
        print(f"{song_name} lyrics are less than 512 tokens, added directly.")
        return cleaned_lyrics
    
    # If token count exceeds 1024, truncate before summarization
    if token_count > 1024:
        tokens = tokens[:1024]  # Keep only the first 1024 tokens
        cleaned_lyrics = tokenizer.decode(tokens, skip_special_tokens=True)
        print(f"{song_name} lyrics truncated to 1024 tokens.")

    # Summarize the cleaned lyrics
    try:
        summary = summarizer(cleaned_lyrics, max_length=512, min_length=0, do_sample=False)
        print(f"{song_name} lyrics summarized")
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing {song_name}: {e}")
        return cleaned_lyrics  # Return original cleaned lyrics if there's an error

# Initialize a counter for song names
for idx, row in df.iterrows():
    df.at[idx, 'Summarized_Lyrics'] = summarize_lyrics(row['Lyrics'], row['Song Name'])

# Save the dataframe with summarized lyrics to a new CSV file
df.to_csv("Lyrics_Sum.csv", index=False)


Legacy lyrics summarized


Token indices sequence length is longer than the specified maximum sequence length for this model (1174 > 1024). Running this sequence through the model will result in indexing errors


Still Don't Give A Fuck lyrics summarized
Asshole lyrics truncated to 1024 tokens.
Asshole lyrics summarized
Cum On Everybody lyrics summarized


Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors


W.T.P. lyrics summarized
Beautiful lyrics truncated to 1024 tokens.
Error summarizing Beautiful: index out of range in self
Medicine Ball lyrics summarized
My Mom lyrics summarized
So Far... lyrics truncated to 1024 tokens.
Error summarizing So Far...: index out of range in self
Yellow Brick Road lyrics truncated to 1024 tokens.
Error summarizing Yellow Brick Road: index out of range in self
When I'm Gone lyrics summarized
Rhyme Or Reason lyrics summarized
My Fault lyrics summarized
Hello lyrics summarized
Underground lyrics truncated to 1024 tokens.
Error summarizing Underground: index out of range in self
Hell Breaks Loose lyrics summarized
Stan lyrics truncated to 1024 tokens.
Error summarizing Stan: index out of range in self
Cleanin Out My Closet lyrics summarized
We As Americans lyrics summarized
Almost Famous lyrics truncated to 1024 tokens.
Error summarizing Almost Famous: index out of range in self
Just Lose It lyrics summarized
Trapped lyrics are less than 512 tokens, added d

KeyboardInterrupt: 

In [1]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Load BART model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Load the CSV file
df = pd.read_csv('lyrics_CSV.csv')

# Initialize lists to store processed data
artist_names = []
song_names = []
sum_lyrics = []

# Function to get the token count of the lyrics
def get_token_count(text):
    tokens = tokenizer.encode(text, return_tensors="pt")
    return tokens.shape[1]

# Function to summarize lyrics
def summarize_lyrics(lyrics):
    inputs = tokenizer.encode(lyrics, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=512, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    artist = row['Artist Name']
    song = row['Song Name']
    lyrics = row['Lyrics']
    
    # Get token count of the lyrics
    token_count = get_token_count(lyrics)
    
    if token_count < 512:
        # If the lyrics have less than 512 tokens, keep it as is
        summary = lyrics
        method = "as it is"
    elif token_count > 1024:
        # If the lyrics have more than 1024 tokens, truncate to 1024 and then summarize
        truncated_lyrics = tokenizer.decode(tokenizer.encode(lyrics, max_length=1024, truncation=True))
        summary = summarize_lyrics(truncated_lyrics)
        method = "truncate"
    else:
        # If the lyrics have between 512 and 1024 tokens, summarize directly
        summary = summarize_lyrics(lyrics)
        method = "through model"
    
    # Append the processed data to lists
    artist_names.append(artist)
    song_names.append(song)
    sum_lyrics.append(summary)
    
    # Print the processing status
    print(f"{song} summarized with {method}")

# Create a new DataFrame for summarized lyrics
sum_df = pd.DataFrame({
    'Artist Name': artist_names,
    'Song Name': song_names,
    'Sum Lyrics': sum_lyrics
})

# Save the summarized data to a new CSV file
sum_df.to_csv('Sum_lyrics.csv', index=False)

print("Summarization process completed and saved to Sum_lyrics.csv")


  torch.utils._pytree._register_pytree_node(



Legacy summarized with through model


Token indices sequence length is longer than the specified maximum sequence length for this model (1293 > 1024). Running this sequence through the model will result in indexing errors


Still Don't Give A Fuck summarized with through model
Asshole summarized with truncate
Cum On Everybody summarized with through model
W.T.P. summarized with truncate
Beautiful summarized with truncate
Medicine Ball summarized with through model
My Mom summarized with truncate
So Far... summarized with truncate
Yellow Brick Road summarized with truncate
When I'm Gone summarized with truncate
Rhyme Or Reason summarized with truncate
My Fault summarized with through model
Hello summarized with through model
Underground summarized with truncate
Hell Breaks Loose summarized with through model
Stan summarized with truncate
Cleanin Out My Closet summarized with through model
We As Americans summarized with through model
Almost Famous summarized with truncate
Just Lose It summarized with through model
Trapped summarized with as it is
When The Music Stops summarized with truncate
Drug Ballad summarized with truncate
My Dad's Gone Crazy summarized with truncate
Desperation summarized with throug