## General Import and Setup

In [45]:
import pandas as pd
import plotly.express as px
from collections import Counter
import string
import spacy
from tabulate import tabulate
from textblob_de import TextBlobDE
from stopwordsiso import stopwords

In [46]:
# Additional tokens treated as stop words on top of the stopwordsiso German
# list; both are combined inside filter_stopwords().
custom_stops = [
    "'ne",
    "ne",
    "komm",
    "...",
    "lass",
    "yeah",
    "sag",
    "einfach",
    "weiß",
    "ah",
    "ey",
    "all",
    "mach",
    "bleibt",
    "heut",
    "nem",
]

In [47]:
# German spaCy pipeline; used further down to tokenize the lyrics.
nlp = spacy.load("de_core_news_sm")

# The CSV was written together with its row index, which would otherwise come
# back as a meaningless "Unnamed: 0" column. Reading the first column as the
# index keeps the frame limited to the real song attributes.
data = pd.read_csv('../data/raw/songs_complete_final.csv', index_col=0)
data.head()

Unnamed: 0,artist,artist_id,album,album_id,release_date,title,full_title,song_id,lyrics,release_year,weekday,genre,genre_cat,word_count
0,1099,209826,10999,831505,2021-10-29,INTRO (10999),INTRO (10999) by King Khalil,7337100,"Powpow Dicka, das kein Rap mehr, das ist Kind...",2021,Friday,post-rock,Rock,60
1,1099,209826,10999,831505,2021-10-29,GIB IHM,GIB IHM by King Khalil & AK 33,7337102,"Gefährliche, gefährliche KiKiKiKi Gefährliche ...",2021,Friday,post-rock,Rock,92
2,1099,209826,KING KONG,546439,2020-01-31,BUNDESWEIT,BUNDESWEIT by King Khalil (Ft. Fler),4532899,Aus meiner Stadt fliegen Leuchtclips und Bresl...,2020,Friday,post-rock,Rock,113
3,1099,209826,KING KONG,546439,2020-01-31,HOLLANDA,HOLLANDA by King Khalil (Ft. Mert),5189158,Because youre so sweet You lift up my heart An...,2020,Friday,post-rock,Rock,80
4,1099,209826,KING KONG,546439,2020-01-31,MOON,MOON by King Khalil & Lil Lano,5109178,. Liquid Swords GZA Actual . souljaboytellem...,2020,Friday,post-rock,Rock,233


### Checking the Vocabulary of the Artists

In [48]:
# Vocabulary size per artist: concatenate all lyrics of an artist, split on
# whitespace and count the DISTINCT tokens, then compare against the number of
# songs the artist contributes.
grouped_data = data.groupby("artist")

# One flat list of whitespace-separated tokens per artist.
tokenized_words = grouped_data["lyrics"].apply(lambda x: ' '.join(x).split())

# A set collapses repeated tokens; taking len() of the raw list would measure
# total text length instead of vocabulary size.
unique_word_counts = tokenized_words.apply(lambda words: len(set(words)))

num_songs = grouped_data.size()

# Both Series are indexed by artist, so they align row by row.
df = pd.DataFrame({'Interpret': unique_word_counts.index, 'Word Count': unique_word_counts, 'Number of Songs': num_songs})

fig = px.scatter(df, x='Word Count', y='Number of Songs', hover_data=['Interpret'], title='Number of Songs vs. Word Count')

fig.show()

### Checking the 10 most used words for each Genre

Anmerkung (Stand 31.05.): Es muss mit Custom-Stopwords gearbeitet werden, da sowohl spaCy als auch stopwordsiso schlechte Ergebnisse liefern. Das Whitespace-Problem ist dafür behoben.

In [49]:
def filter_stopwords(tokens):
    """Return the lower-cased tokens that survive stop-word and punctuation filtering.

    Args:
        tokens: Iterable of token objects exposing ``lower_``, ``text`` and
            ``is_space`` (e.g. the tokens of a spaCy ``Doc``).

    Returns:
        list[str]: ``token.lower_`` of every kept token, in input order.
    """
    # A set gives O(1) membership tests; the previous list was scanned linearly
    # for every single token. It is rebuilt on each call so that edits to the
    # module-level ``custom_stops`` take effect without redefining the function.
    blocked = set(stopwords(["de"])).union(custom_stops)

    # NOTE: ``in string.punctuation`` is a substring test on the punctuation
    # string, so multi-character tokens such as "..." are not caught by it
    # (which is why "..." is listed in ``custom_stops``).
    return [
        token.lower_
        for token in tokens
        if token.lower_ not in blocked
        and token.text not in string.punctuation
        and not token.is_space
    ]

In [50]:
# filter_stopwords() only reads tokenizer-level attributes (lower_, text,
# is_space), so tokenizing with nlp.make_doc() is enough. Calling nlp(x) would
# additionally run every other pipeline component on every song, with no effect
# on the resulting tokens.
data["filtered_lyrics"] = data["lyrics"].apply(lambda text: filter_stopwords(nlp.make_doc(text)))

In [51]:
# One Counter per song in which every distinct filtered word has the value 1,
# so adding these Counters up later yields "number of songs containing the word".
data["word_counts"] = [Counter(set(song_tokens)) for song_tokens in data["filtered_lyrics"]]

In [52]:
# Regroup the songs by coarse genre category for the per-genre word statistics.
grouped_data = data.groupby(by="genre_cat")

In [54]:
# Per genre category: in how many songs does each word occur?
genre_word_counts = {}
for genre_cat, group in grouped_data:
    # Accumulate into a single Counter. Summing the Series would create a brand
    # new Counter for every song and copy the growing vocabulary each time.
    word_counts = Counter()
    for song_counts in group["word_counts"]:
        word_counts.update(song_counts)
    genre_word_counts[genre_cat] = word_counts

top_10_words_per_genre = {}
for genre, word_counts in genre_word_counts.items():
    # Exclude the empty string from the top words list
    word_counts.pop('', None)
    # Ten words contained in the most songs, as (word, song_count) pairs
    top_10_words_per_genre[genre] = word_counts.most_common(10)

for genre, top_words in top_10_words_per_genre.items():
    print("Genre:", genre)
    df = pd.DataFrame(top_words, columns=["Word", "Song Count"])
    print(df)
    print()

Genre: Pop
    Word  Song Count
0  leben         700
1   welt         582
2  liebe         506
3  nacht         485
4   herz         436
5  augen         352
6    seh         330
7   sehn         324
8   egal         323
9   kopf         320

Genre: Rap
     Word  Song Count
0   leben        2524
1    welt        1594
2    geld        1568
3    fick        1444
4     geh        1437
5    kopf        1417
6  scheiß        1367
7     nem        1318
8    raus        1288
9    egal        1288

Genre: Rock
    Word  Song Count
0  leben         588
1   welt         549
2  nacht         349
3   herz         306
4  liebe         293
5  augen         259
6  sehen         247
7   egal         234
8   raus         223
9  schön         222

Genre: Schlager
     Word  Song Count
0   nacht         441
1   liebe         439
2   leben         386
3    herz         369
4    welt         359
5  himmel         197
6   glück         192
7   augen         192
8    gehn         191
9    sehn         167



In [55]:
# Example of how to present the data as a table; nothing prettier found so far
# (no extensive search was done).
headers = list(top_10_words_per_genre.keys())

# Derive the row count from the data instead of assuming exactly ten entries
# per genre; a genre with fewer words gets empty cells rather than raising
# an IndexError.
row_count = max((len(words) for words in top_10_words_per_genre.values()), default=0)
table_data = [
    [words[rank] if rank < len(words) else "" for words in top_10_words_per_genre.values()]
    for rank in range(row_count)
]

table = tabulate(table_data, headers, tablefmt="pretty")
print(table)
# TODO: the numbers are song counts, not word counts - label them accordingly
# TODO: highlight words (genre-specific vs. occurring in every genre)

+----------------+------------------+----------------+-----------------+
|      Pop       |       Rap        |      Rock      |    Schlager     |
+----------------+------------------+----------------+-----------------+
| ('leben', 700) | ('leben', 2524)  | ('leben', 588) | ('nacht', 441)  |
| ('welt', 582)  |  ('welt', 1594)  | ('welt', 549)  | ('liebe', 439)  |
| ('liebe', 506) |  ('geld', 1568)  | ('nacht', 349) | ('leben', 386)  |
| ('nacht', 485) |  ('fick', 1444)  | ('herz', 306)  |  ('herz', 369)  |
| ('herz', 436)  |  ('geh', 1437)   | ('liebe', 293) |  ('welt', 359)  |
| ('augen', 352) |  ('kopf', 1417)  | ('augen', 259) | ('himmel', 197) |
|  ('seh', 330)  | ('scheiß', 1367) | ('sehen', 247) | ('glück', 192)  |
| ('sehn', 324)  |  ('nem', 1318)   | ('egal', 234)  | ('augen', 192)  |
| ('egal', 323)  |  ('raus', 1288)  | ('raus', 223)  |  ('gehn', 191)  |
| ('kopf', 320)  |  ('egal', 1288)  | ('schön', 222) |  ('sehn', 167)  |
+----------------+------------------+--------------

### Sentiment Analysis of a Genre

In [44]:
# Polarity and subjectivity for every Rap song.
# The coarse categories (Pop / Rap / Rock / Schlager) are stored in "genre_cat";
# "genre" holds fine-grained labels such as "post-rock", so filtering it for
# "Rap" does not select the Rap category.
genre_data = data[data["genre_cat"] == "Rap"]

sentiments = []
for lyric in genre_data["lyrics"]:
    sentiment = TextBlobDE(lyric).sentiment
    sentiments.append((lyric, sentiment.polarity, sentiment.subjectivity))

sentiments_df = pd.DataFrame(sentiments, columns=["Lyric", "Polarity", "Subjectivity"])
print(sentiments_df.head())

                                               Lyric  Polarity  Subjectivity
0   Okay, Ansage, Freundchen, ich warne dich Ey, ...  0.067500      0.100000
1    Ey Leute, passt doch auf, ihr lebt in Gefahr...  0.115972      0.006944
2   Okay, Ellen lang und elegant Keule Mutterfick... -0.031250      0.015625
3   Hallihallöchen, ist lange her Schatz Ich vers...  0.048352      0.000000
4    Ich steh auf am Kleister riechen und gleiche... -0.054167      0.144444


In [29]:
def analyze_sentiment(data, genre, genre_col="genre"):
    """Return the mean TextBlobDE polarity of all lyrics belonging to ``genre``.

    Args:
        data: DataFrame with a ``"lyrics"`` column and the column named by
            ``genre_col``.
        genre: Value of ``genre_col`` that selects the songs to analyse.
        genre_col: Name of the column compared against ``genre``. Defaults to
            ``"genre"``, which keeps existing two-argument calls unchanged.

    Returns:
        float: Average polarity of the selected lyrics, or ``nan`` when no
        song matches (instead of raising ``ZeroDivisionError``).
    """
    genre_data = data[data[genre_col] == genre]

    sentiments = [TextBlobDE(lyric).sentiment.polarity for lyric in genre_data["lyrics"]]
    if not sentiments:
        return float("nan")

    # TODO: optionally also return the lyrics with the highest / lowest polarity.
    return sum(sentiments) / len(sentiments)

# The four coarse categories live in "genre_cat"; "genre" contains fine-grained
# labels such as "post-rock".
grouped_data = data.groupby("genre_cat")
for genre, group in grouped_data:
    # Each group already contains only this category's songs, so pass it on
    # instead of re-filtering the complete frame.
    avg_polarity = analyze_sentiment(group, genre, genre_col="genre_cat")
    print("Genre:", genre)
    print("Average Polarity:", avg_polarity)
    print()

# TODO: report more than the average - include percentiles -> box plot

Genre: Pop
Average Polarity: 0.1296460012286485

Genre: Rap
Average Polarity: 0.07332280483746088

Genre: Rock
Average Polarity: 0.0995539542870728

Genre: Schlager
Average Polarity: 0.1397368378330272

