# HitPredict - Machine Learning Preprocessing
Ein Projekt für 'Grundlagen und Methoden der Informatik für Wirtschaftswissenschaften' an der Universität St.Gallen (2025)

Autoren: Ruben Cardell, Adam Bisharat, Helena Häußler, Colin Wirth

---

- In diesem Notebook wurden die Lyrics für alle Songs im Kaggle Datensatz über die LRCLIB API abgerufen.
- Hinweis: Aufgrund des Umfangs wurde der Code nachträglich kommentiert, teilweise mit Unterstützung von ChatGPT.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive

# Mount Google Drive at an absolute location inside the Colab runtime.
MOUNT_POINT = "/content/drive"
drive.mount(MOUNT_POINT)

# Derive the data directory from the mount point so that it stays valid even
# if the working directory is changed; the trailing slash is required because
# later cells build file names with `path + "<file>"`.
path = MOUNT_POINT + "/MyDrive/CS Gruppenarbeit/data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Kaggle CSV export; its first column becomes the row index
df = pd.read_csv(f"{path}spotify_data.csv", index_col=0)

In [None]:
# Summarise the loaded data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1159764 entries, 0 to 1473395
Data columns (total 19 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159749 non-null  object 
 1   track_name        1159763 non-null  object 
 2   track_id          1159764 non-null  object 
 3   popularity        1159764 non-null  int64  
 4   year              1159764 non-null  int64  
 5   genre             1159764 non-null  object 
 6   danceability      1159764 non-null  float64
 7   energy            1159764 non-null  float64
 8   key               1159764 non-null  int64  
 9   loudness          1159764 non-null  float64
 10  mode              1159764 non-null  int64  
 11  speechiness       1159764 non-null  float64
 12  acousticness      1159764 non-null  float64
 13  instrumentalness  1159764 non-null  float64
 14  liveness          1159764 non-null  float64
 15  valence           1159764 non-null  float64
 16  tempo

In [None]:
# Show the loaded DataFrame as the cell's output
df

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.6940,0.000000,0.1150,0.1390,133.406,240166,3
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.4770,0.000014,0.0974,0.5150,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.3380,0.000050,0.0895,0.1450,139.832,158960,4
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.8070,0.000000,0.0797,0.5080,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.430,0.791,6,-5.419,0,0.0302,0.0726,0.019300,0.1100,0.2170,171.864,244320,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1473391,Nicola Conte,Black Spirits,0m27F0IGHLGAWhqd6ccYst,4,2011,trip-hop,0.373,0.742,10,-6.453,0,0.0736,0.3250,0.000141,0.1590,0.5220,107.951,344013,3
1473392,Nicola Conte,Quiet Dawn,6er9p611eHEcUCU50j7D57,3,2011,trip-hop,0.516,0.675,7,-7.588,0,0.0326,0.7880,0.000129,0.1300,0.2640,119.897,285067,4
1473393,Amon Tobin,Morning Ms Candis,7jsMMqxy1tt0rH5FzYcZTQ,2,2011,trip-hop,0.491,0.440,5,-8.512,1,0.0274,0.4770,0.003130,0.0936,0.0351,100.076,214253,4
1473394,Peace Orchestra,Happy Christmas (War Is Over),77lA1InUaXztuRk2vOzD1S,0,2011,trip-hop,0.480,0.405,0,-13.343,1,0.0276,0.4310,0.000063,0.1250,0.2020,133.885,239133,3


# API Calls

In [None]:
import asyncio
import aiohttp
import pandas as pd
import urllib.parse
import nest_asyncio
from tqdm import tqdm

# Apply the nest_asyncio patch; per the authors, Google Colab would otherwise
# not allow the asynchronous execution used below.
nest_asyncio.apply()

In [None]:
async def fetch_search(session, artist_name, track_name, headers):
    """
    Fallback lookup that calls the LRCLIB search endpoint.

    Sends the artist and track name to ``/api/search`` and uses the first
    entry of the returned list.

    Args:
        session: HTTP session whose ``get(url, headers=...)`` returns an async
            context manager yielding the response (an ``aiohttp.ClientSession``
            in this notebook).
        artist_name: Artist name. ``None``, NaN and blank values count as missing.
        track_name: Track title. ``None``, NaN and blank values count as missing.
        headers: Request headers passed through to ``session.get``.

    Returns:
        Tuple ``(lyrics, album_name)``. ``lyrics`` is the ``plainLyrics`` field
        of the first hit, or ``False`` when that hit is flagged as
        instrumental. ``(None, None)`` is returned when a name is missing, the
        status code is not 200, the result list is empty, or the request fails.
    """
    # Missing CSV cells arrive from pandas as float NaN, which is truthy, so a
    # plain `not value` check would let them through and query the text "nan".
    if any(pd.isna(value) or not str(value).strip() for value in (artist_name, track_name)):
        return None, None

    # Percent-encode artist and title for use in the query string
    artist_enc = urllib.parse.quote(str(artist_name))
    track_enc = urllib.parse.quote(str(track_name))

    # Build the search request URL
    url_search = f"https://lrclib.net/api/search?track_name={track_enc}&artist_name={artist_enc}"

    try:
        async with session.get(url_search, headers=headers) as response:
            # Any non-success status is treated as "nothing found"
            if response.status != 200:
                return None, None

            data = await response.json()

            # The endpoint is expected to answer with a non-empty list of hits
            if isinstance(data, list) and data:
                first_result = data[0]

                # Instrumental tracks are marked with False instead of a text
                if first_result.get("instrumental") is True:
                    lyrics = False
                else:
                    lyrics = first_result.get("plainLyrics")

                albumName = first_result.get("albumName")

                return lyrics, albumName
            else:
                # No search results
                return None, None

    except Exception as e:
        # Best effort: report the failure and carry on with the other tracks
        print(f"Fehler im Such-Endpunkt für {artist_name} - {track_name}: {e}")
        return None, None

# Quelle: Eigene vergangene Projekte und Adaption mit ChatGPT

In [None]:
async def fetch_lyrics(session, artist_name, track_name, duration_ms, sem, progress):
    """
    Fetch lyrics and album name for a single track from LRCLIB.

    First calls ``/api/get`` with artist, title and the duration in seconds.
    Falls back to ``fetch_search`` whenever that call does not produce a
    usable record (non-200 status, a payload carrying ``code == 404``, a
    missing duration, or an exception).

    Args:
        session: HTTP session whose ``get(url, headers=...)`` returns an async
            context manager yielding the response (an ``aiohttp.ClientSession``
            in this notebook).
        artist_name: Artist name. ``None``, NaN and blank values count as missing.
        track_name: Track title. ``None``, NaN and blank values count as missing.
        duration_ms: Track duration in milliseconds.
        sem: Semaphore limiting how many lookups run at the same time.
        progress: Progress bar; ``update(1)`` is called exactly once per call.

    Returns:
        Tuple ``(lyrics, album_name)``. ``lyrics`` is the ``plainLyrics`` text,
        or ``False`` when the record is flagged as instrumental.
        ``(None, None)`` is returned when the artist or title is missing;
        otherwise the fallback's result is returned when the direct lookup fails.
    """
    async with sem:
        try:
            # Missing CSV cells arrive from pandas as float NaN, which is
            # truthy, so a plain `not value` check would let them through.
            if any(pd.isna(value) or not str(value).strip() for value in (artist_name, track_name)):
                return None, None

            # Custom User-Agent identifying this project to the API
            headers = {"User-Agent": "Spotify 1Million v1.0 (ruben@cardell.ch)"}

            # A missing duration cannot be converted to int; skip the direct
            # lookup in that case and go straight to the search fallback.
            if not pd.isna(duration_ms):
                try:
                    # Convert milliseconds to whole seconds
                    duration = int(duration_ms / 1000)

                    # Percent-encode artist and title for the query string
                    artist_enc = urllib.parse.quote(str(artist_name))
                    track_enc = urllib.parse.quote(str(track_name))

                    url_get = f"https://lrclib.net/api/get?artist_name={artist_enc}&track_name={track_enc}&duration={duration}"

                    async with session.get(url_get, headers=headers) as response:
                        if response.status == 200:
                            data = await response.json()

                            # A payload carrying code 404 means "not found"
                            if isinstance(data, dict) and data.get("code") != 404:
                                # Instrumental tracks are marked with False
                                if data.get("instrumental") is True:
                                    lyrics = False
                                else:
                                    lyrics = data.get("plainLyrics")
                                return lyrics, data.get("albumName")
                except Exception as e:
                    # Best effort: report the failure, then try the fallback
                    print(f"Fehler beim Abrufen des Liedtexts für {artist_name} - {track_name}: {e}")

            # No usable record from the direct lookup. The first response
            # context is already closed here, so it is not held open while
            # the search request runs.
            return await fetch_search(session, artist_name, track_name, headers)
        finally:
            # Single place that advances the progress bar, whatever the outcome
            progress.update(1)

# Quelle: Eigene vergangene Projekte und Adaption mit ChatGPT

In [None]:
async def process_dataframe(df):
    """
    Fetch lyrics and album names for every row of ``df`` concurrently.

    Reads the columns ``artist_name``, ``track_name`` and ``duration_ms`` and
    schedules one ``fetch_lyrics`` call per row.

    Args:
        df: DataFrame containing the three columns named above.

    Returns:
        The same DataFrame object, modified in place, with the two added
        columns ``lyrics`` and ``album_name`` in row order.
    """
    # Upper bound on lookups that may be in flight at the same time
    sem = asyncio.Semaphore(1000)

    async with aiohttp.ClientSession() as session:

        # Progress bar over all rows; fetch_lyrics advances it per finished row
        with tqdm(total=len(df), desc="Fetching Lyrics") as progress:

            # Iterate the three needed columns directly instead of iterrows(),
            # which builds a full Series object for every row.
            tasks = [
                fetch_lyrics(session, artist, track, duration_ms, sem, progress)
                for artist, track, duration_ms in zip(
                    df['artist_name'], df['track_name'], df['duration_ms']
                )
            ]

            # gather() returns the results in the order the tasks were passed
            results = await asyncio.gather(*tasks)

    # zip(*results) yields nothing for an empty frame, so unpacking it into two
    # names would raise; fall back to two empty columns in that case.
    if results:
        lyrics, album_names = zip(*results)
    else:
        lyrics, album_names = (), ()

    df['lyrics'] = list(lyrics)
    df['album_name'] = list(album_names)

    return df

# Quelle: Eigene vergangene Projekte und Adaption mit ChatGPT

In [None]:
# Schleife über die vier Teile der CSV-Daten
for i in range(1, 5):
  print(f"Processing part {i}...")

  # Lese den entsprechenden Teil der CSV-Datei ein
  df = pd.read_csv(path + f"chunks/spotify_part{i}.csv", index_col=0)

  # Rufe Songtexte asynchron ab
  updated_df = await process_dataframe(df)

  # Speichere das aktualisierte DataFrame mit Liedtexten als neue CSV-Datei
  updated_df.to_csv(path + f"chunks/spotify_part{i}_lyrics.csv")

  print(f"Finished part {i}!")

Processing part 1...


Fetching Lyrics: 100%|██████████| 231953/231953 [17:59<00:00, 214.96it/s]


Finished part 1!
Processing part 2...


Fetching Lyrics: 100%|██████████| 231953/231953 [20:25<00:00, 189.27it/s]


Finished part 2!
Processing part 3...


Fetching Lyrics: 100%|██████████| 231953/231953 [20:40<00:00, 186.95it/s]


Finished part 3!
Processing part 4...


Fetching Lyrics: 100%|██████████| 231952/231952 [22:11<00:00, 174.16it/s]


Finished part 4!


In [None]:
# Spot check: print the lyrics stored for the first row of updated_df
print(updated_df["lyrics"].iloc[0])

Sleep will carry us
Have to mention heavy on the REM
It's the least discerning place
We could dive too

We both shouldn't last another day
Another failed invention straight to nowhere
One more you say
What causes these
It's all the better, your greed seems to carry you well

We couldn't dive too
Too
Too

We tossed our failures at the earth
Connected to a cause
Stripped away another fool
Blame it all because
We tossed our failures at the earth
Stripped away from under

So shy, crossed my heart, I sinned
With images too far apart to tell
What don't we know?
What should we know?
Slipping under is just the beginning of thirst

We couldn't dive too
Too
Too

We tossed our failures at the earth
Connected to a cause
Stripped away another fool
Blame it all because
We tossed our failures at the earth
Stripped away from under

We so forgot to mention
A waste upon the hours
We so forgot to mention
The hook that is the chorus
We so forgot to mention
A waste upon the hours
We so forgot to mention
Th

In [None]:
# Reassemble the dataset from the five per-chunk result files (parts 0-4)
df0, df1, df2, df3, df4 = (
    pd.read_csv(f"{path}chunks/spotify_part{part}_lyrics.csv", index_col=0)
    for part in range(5)
)

# Stack the chunks row-wise in their original order
merged_df = pd.concat([df0, df1, df2, df3, df4], axis=0)

# Export the combined table as a single CSV
merged_df.to_csv(f"{path}spotify_data_lyrics.csv")

In [None]:
# Summarise the merged data: column names, non-null counts and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1159764 entries, 0 to 1473395
Data columns (total 21 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159749 non-null  object 
 1   track_name        1159763 non-null  object 
 2   track_id          1159764 non-null  object 
 3   popularity        1159764 non-null  int64  
 4   year              1159764 non-null  int64  
 5   genre             1159764 non-null  object 
 6   danceability      1159764 non-null  float64
 7   energy            1159764 non-null  float64
 8   key               1159764 non-null  int64  
 9   loudness          1159764 non-null  float64
 10  mode              1159764 non-null  int64  
 11  speechiness       1159764 non-null  float64
 12  acousticness      1159764 non-null  float64
 13  instrumentalness  1159764 non-null  float64
 14  liveness          1159764 non-null  float64
 15  valence           1159764 non-null  float64
 16  tempo