Libraries Used:

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import lyricsgenius
import os
from dotenv import load_dotenv
import re

Current working directory, or change directory.

In [2]:
# Print the current working directory
print("Current Working Directory:", os.getcwd())

# List all files and folders in the current directory
os.system("ls -a")

# (Optional) If you need to change to your project folder every time:
#os.chdir("your/directory/name/here")  # Uncomment and update path
# print("New Working Directory:", os.getcwd())  # Confirm directory change

Current Working Directory: /home/colton_baker/repos/chatbot


.
..
.env
.gitignore
chatbot.ipynb
genius_lyrics.csv


0

### Preprocessing:

Genius API:

In [3]:
# Load environment variables from .env file
load_dotenv()

# Retrieve API key
GENIUS_ACCESS_TOKEN = os.getenv("GENIUS_ACCESS_TOKEN")
if not GENIUS_ACCESS_TOKEN:
    raise ValueError("Genius Access Token not found! Check your .env file.")

# Initialize Genius API
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)
genius.skip_non_songs = True  # Avoids non-lyrical tracks like interviews
genius.remove_section_headers = True  # Removes [Chorus], [Verse] from lyrics
genius.verbose = False  # Reduces console output clutter

def clean_lyrics(lyrics):
    """Removes special characters and extra whitespace from lyrics."""
    if lyrics:
        lyrics = re.sub(r"\[.*?\]", "", lyrics)  # Remove metadata like [Chorus]
        lyrics = re.sub(r"\n{2,}", "\n", lyrics).strip()  # Remove extra newlines
    return lyrics
# change max songs as needed
def get_available_songs(artist_name):
    """Fetches the total number of songs available for an artist."""
    artist = genius.search_artist(artist_name, max_songs=89, sort="popularity")
    if artist:
        return len(artist.songs)  # Returns total available songs
    return 0

def get_lyrics(artist_name, num_songs=None):
    """Fetches lyrics for an artist with an option to adjust the number of songs."""
    
    total_songs = get_available_songs(artist_name)
    
    if total_songs == 0:
        print(f"Artist '{artist_name}' not found or has no available songs.")
        return None
    
    print(f"Artist '{artist_name}' has {total_songs} available songs.")

    # If num_songs is not specified or exceeds available, fetch all available songs
    if num_songs is None or num_songs > total_songs:
        num_songs = total_songs
        print(f"Fetching all available songs ({num_songs})")
    else:
        print(f"Fetching {num_songs} songs.")

    artist = genius.search_artist(artist_name, max_songs=num_songs, sort="popularity")
    
    if artist:
        lyrics_data = [{"Song Title": song.title, "Lyrics": clean_lyrics(song.lyrics)} for song in artist.songs]
        return lyrics_data
    else:
        print("Failed to retrieve songs from Genius API.")
        return None

# Example usage
artist_name = "Nickelback"
num_songs = None  # Set to None to fetch all available songs

lyrics = get_lyrics(artist_name, num_songs=num_songs)

# Save to CSV only if lyrics were retrieved
if lyrics:
    df = pd.DataFrame(lyrics)
    df.to_csv("genius_lyrics.csv", index=False, encoding="utf-8")
    print(f"Lyrics saved to genius_lyrics.csv ({len(df)} songs).")
else:
    print("No lyrics were saved.")


Artist 'Nickelback' has 89 available songs.
Fetching all available songs (89)
Lyrics saved to genius_lyrics.csv (89 songs).


Web Scraping and HTML Parsing

In [12]:
def fetch_album_page(artist, album):
    """Fetches the HTML content of a specific album page on Genius."""
    artist_formatted = artist.replace(" ", "-")
    album_formatted = album.replace(" ", "-")
    url = f"https://genius.com/albums/{artist_formatted}/{album_formatted}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve page for {artist} - {album}")
        return None
fetch_album_page("Blink-182", "Dude Ranch")



'\n\n<!DOCTYPE html>\n<html class="snarly song_stories_public_launch--enabled react_forums--disabled report_abuse--disabled" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml" lang="en" xml:lang="en">\n  <head>\n    <base target=\'_top\' href="//genius.com/">\n\n    <script type="text/javascript">\n//<![CDATA[\n\n  var _sf_startpt=(new Date()).getTime();\n  if (window.performance && performance.mark) {\n    window.performance.mark(\'parse_start\');\n  }\n\n//]]>\n</script>\n\n<title>blink-182 - Dude Ranch Lyrics and Tracklist | Genius</title>\n\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<meta content=\'width=device-width,initial-scale=1\' name=\'viewport\'>\n\n  <meta name="apple-itunes-app" content="app-id=709482991">\n\n<link href="https://assets.genius.com/images/apple-touch-icon.png?1741965561" rel="apple-touch-icon" />\n\n\n  \n\n  <link href="https://assets.genius.com/images/apple-touch-icon.png?1741965561" rel="apple-to