## General Import and Setup

In [13]:
import pandas as pd
import plotly.express as px
import spacy
from collections import Counter
import string
from tabulate import tabulate
from textblob_de import TextBlobDE
from stopwordsiso import stopwords

In [4]:
nlp = spacy.load("de_core_news_sm")
data = pd.read_csv('../data/raw/dataset_cleaned_file_v7.csv')
data.head()

Unnamed: 0,genre,interpret,album,release_year,songtitle,decade,song_id,full_title,release_date_components_raw,release_date_for_display,stats_raw,lyrics,weekday,word_count
0,,Ardian Bujupi,,2023,Skena Koh,0,9142376,Skena Koh by Ardian Bujupi,"{'year': 2023, 'month': 5, 'day': 19}",2023-05-19 00:00:00,"{'unreviewed_annotations': 0, 'hot': False}","Ich weiß genau, wie es anfing Hab' mich noch...",Friday,175
1,,Ardian Bujupi,,2023,Maria,0,9014456,Maria by Ardian Bujupi,"{'year': 2023, 'month': 4, 'day': 14}",2023-04-14 00:00:00,"{'unreviewed_annotations': 0, 'hot': False}",Ey-yeah-yeah-yeah-yeah-yeah Ey-ja-ja-ja-ja-j...,Friday,98
2,,Ardian Bujupi,,2023,3 Panamera,0,8881172,3 Panamera by Ardian Bujupi,"{'year': 2023, 'month': 3, 'day': 3}",2023-03-03 00:00:00,"{'unreviewed_annotations': 0, 'hot': False}","Blick' durch die Cartier, nice Ferrari, Ital...",Friday,214
3,,Ardian Bujupi,,2022,Atlantik,0,8625082,Atlantik by Ardian Bujupi,"{'year': 2022, 'month': 12, 'day': 9}",2022-12-09 00:00:00,"{'unreviewed_annotations': 0, 'hot': False}",Ruf nur einmal an Und ich fliege für dich üb...,Friday,129
4,,Ardian Bujupi,,2022,K.i.m.B,0,8146443,K.i.m.B by Ardian Bujupi,"{'year': 2022, 'month': 7, 'day': 1}",2022-07-01 00:00:00,"{'unreviewed_annotations': 0, 'hot': False}","Komm in mein'n Block (Ey, ey) Aussicht von h...",Friday,195


### Checking the Vocabulary of the Artists

In [5]:
grouped_data = data.groupby("interpret")

tokenized_words = grouped_data["lyrics"].apply(lambda x: ' '.join(x).split())

unique_word_counts = tokenized_words.apply(lambda x: len(set(x)))

num_songs = grouped_data.size()


df = pd.DataFrame({'Interpret': unique_word_counts.index, 'Word Count': unique_word_counts, 'Number of Songs': num_songs})

fig = px.scatter(df, x='Word Count', y='Number of Songs', hover_data=['Interpret'], title='Number of Songs vs. Word Count')

fig.show()

### Checking the 10 most used words for each Genre

Anmerkung Stand 31.05: Muss mit custom Stopwords gearbeitet werden, sowohl sapcy als auch Stopwordsiso liefern schelchte ergebnisse. Whitespace dafür fixed.

In [21]:
def filter_stopwords(tokens):
    german_stopwords = set(stopwords(["de"]))
    filtered_tokens = []
    for token in tokens:
        if token.lower_ not in german_stopwords and token.text not in string.punctuation and not token.is_space:
            filtered_tokens.append(token.lower_)
    return filtered_tokens

In [22]:
data["filtered_lyrics"] = data["lyrics"].apply(lambda x: filter_stopwords(nlp(x)))

In [23]:
data["word_counts"] = data["filtered_lyrics"].apply(lambda x: Counter(set(x)))

In [24]:
grouped_data = data.groupby("genre")

In [25]:
genre_word_counts = {}
for genre, group in grouped_data:
    word_counts = group["word_counts"].sum()
    genre_word_counts[genre] = word_counts

top_10_words_per_genre = {}
for genre, word_counts in genre_word_counts.items():
    # Exclude whitespace from the top words list
    word_counts.pop('', None)
    # Sort the words based on song count only
    top_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    top_10_words_per_genre[genre] = top_words

for genre, top_words in top_10_words_per_genre.items():
    print("Genre:", genre)
    df = pd.DataFrame(top_words, columns=["Word", "Song Count"])
    print(df)
    print()

Genre: Pop
      Word  Song Count
0    leben         274
1     weiß         240
2     welt         226
3    nacht         197
4  einfach         190
5     komm         174
6    liebe         170
7     herz         168
8     lass         162
9    augen         154

Genre: Rap
      Word  Song Count
0    leben         636
1     mach         567
2     komm         561
3     weiß         548
4      'ne         462
5     welt         447
6     yeah         434
7      sag         417
8   scheiß         394
9  einfach         384

Genre: Rock
      Word  Song Count
0     welt         206
1    leben         206
2     weiß         202
3    nacht         151
4    liebe         132
5    schön         129
6  einfach         123
7      'ne         119
8     lass         116
9      ...         114

Genre: Schlager
      Word  Song Count
0    nacht         197
1    liebe         163
2     herz         141
3    leben         130
4     weiß         103
5  einfach         103
6     lass         101
7   

In [26]:
### Example on how to plot the data, couldnt really find anything prettier (didnt search that long tho)
table_data = []
headers = list(top_10_words_per_genre.keys())
for i in range(10):
    row_data = [top_10_words_per_genre[genre][i] for genre in top_10_words_per_genre]
    table_data.append(row_data)

table = tabulate(table_data, headers, tablefmt="pretty")
print(table)
# nicht word count sondern songs

+------------------+------------------+------------------+------------------+
|       Pop        |       Rap        |       Rock       |     Schlager     |
+------------------+------------------+------------------+------------------+
|  ('leben', 274)  |  ('leben', 636)  |  ('welt', 206)   |  ('nacht', 197)  |
|  ('weiß', 240)   |  ('mach', 567)   |  ('leben', 206)  |  ('liebe', 163)  |
|  ('welt', 226)   |  ('komm', 561)   |  ('weiß', 202)   |  ('herz', 141)   |
|  ('nacht', 197)  |  ('weiß', 548)   |  ('nacht', 151)  |  ('leben', 130)  |
| ('einfach', 190) |   ("'ne", 462)   |  ('liebe', 132)  |  ('weiß', 103)   |
|  ('komm', 174)   |  ('welt', 447)   |  ('schön', 129)  | ('einfach', 103) |
|  ('liebe', 170)  |  ('yeah', 434)   | ('einfach', 123) |  ('lass', 101)   |
|  ('herz', 168)   |   ('sag', 417)   |   ("'ne", 119)   |   ('welt', 98)   |
|  ('lass', 162)   | ('scheiß', 394)  |  ('lass', 116)   |   ('komm', 92)   |
|  ('augen', 154)  | ('einfach', 384) |   ('...', 114)   |   ('h

### Sentiment Analysis of a Genre

In [44]:
genre_data = data[data["genre"] == "Rap"]

sentiments = []
for lyric in genre_data["lyrics"]:
    blob = TextBlobDE(lyric)
    sentiment = blob.sentiment
    sentiments.append((lyric, sentiment.polarity, sentiment.subjectivity))

sentiments_df = pd.DataFrame(sentiments, columns=["Lyric", "Polarity", "Subjectivity"])
print(sentiments_df.head())

                                               Lyric  Polarity  Subjectivity
0   Okay, Ansage, Freundchen, ich warne dich Ey, ...  0.067500      0.100000
1    Ey Leute, passt doch auf, ihr lebt in Gefahr...  0.115972      0.006944
2   Okay, Ellen lang und elegant Keule Mutterfick... -0.031250      0.015625
3   Hallihallöchen, ist lange her Schatz Ich vers...  0.048352      0.000000
4    Ich steh auf am Kleister riechen und gleiche... -0.054167      0.144444


In [29]:
def analyze_sentiment(data, genre):
    genre_data = data[data["genre"] == genre]

    sentiments = []
    lyrics_sentiments = []

    for lyric in genre_data["lyrics"]:
        blob = TextBlobDE(lyric)
        sentiment = blob.sentiment.polarity
        sentiments.append(sentiment)
        lyrics_sentiments.append((lyric, sentiment))

    avg_polarity = sum(sentiments) / len(sentiments)
    #lyrics_sentiments.sort(key=lambda x: x[1], reverse=True)
    #best_lyric = lyrics_sentiments[0][0]
    #worst_lyric = lyrics_sentiments[-1][0]

    return avg_polarity#, best_lyric, worst_lyric

grouped_data = data.groupby("genre")
for genre, group in grouped_data:
    avg_polarity = analyze_sentiment(data, genre)
    #avg_polarity, best_lyric, worst_lyric = analyze_sentiment(data, genre)
    print("Genre:", genre)
    print("Average Polarity:", avg_polarity)
    #print("Best Polarity Lyric:\n", best_lyric)
    #print("Worst Polarity Lyric:\n", worst_lyric)
    print()

Genre: Pop
Average Polarity: 0.1296460012286485

Genre: Rap
Average Polarity: 0.07332280483746088

Genre: Rock
Average Polarity: 0.0995539542870728

Genre: Schlager
Average Polarity: 0.1397368378330272

