In [None]:
## https://medium.com/@stasinskipawel/filozofia-disco-polo-na-przyk%C5%82adzie-tw%C3%B3rczo%C5%9Bci-zespo%C5%82u-figo-fagot-podej%C5%9Bcie-analityczne-ef52a8e4896f

In [207]:
import glob
import os

In [208]:
# Collect every lyrics file under ./lyrics. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to make the row order (and therefore
# tie-breaking in the frequency counts computed from it) reproducible.
file_list = sorted(glob.glob(os.path.join(os.getcwd(), "lyrics", "*.txt")))
data = []
for file_path in file_list:
    # Strip only the trailing extension; str.replace(".txt", "") would also
    # remove ".txt" occurring elsewhere in the directory or file name.
    song_name = os.path.splitext(os.path.basename(file_path))[0]
    with open(file_path, "r", encoding='utf-8') as f_input:
        data.append({'song_name': song_name,
                     'song_lyrics': f_input.read()})

In [209]:
import polars as pl

In [210]:
# Build the frame from the loaded records: one row per lyrics file, with the
# "song_name" and "song_lyrics" keys as columns.
df = pl.DataFrame(data)

In [240]:
# Plain-text preview of the loaded songs.
print(df)

shape: (87, 2)
┌───────────────────────────────────┬───────────────────────────────────┐
│ song_name                         ┆ song_lyrics                       │
│ ---                               ┆ ---                               │
│ str                               ┆ str                               │
╞═══════════════════════════════════╪═══════════════════════════════════╡
│ 17_Girls_In_A_Row                 ┆ I fucked 17 girls in a row last … │
│ 1987                              ┆ Appetite For Destruction Blowing… │
│ Ain't_Dead_Yet                    ┆ I'm a little worse for wear And … │
│ All_I_Wanna_Do_Is_Fuck_(Myself_T… ┆ Sexy ahahahaha You're looking go… │
│ …                                 ┆ …                                 │
│ Why_Can't_You_Trust_Me            ┆ I want you to know one thing You… │
│ Wrong_Side_Of_The_Tracks_(Out_In… ┆ My mansion is the size of a city… │
│ You're_Beautiful_When_You_Don't_… ┆ Your beauty is deeper than the s… │
│ Zebraman             

In [259]:
from collections import Counter
import re

In [333]:
def clean_text(text):
    """Strip punctuation from ``text`` while keeping apostrophes.

    Typographic apostrophes (U+2018 / U+2019) and their cp1252 mojibake
    spellings are normalised to a plain ``'`` before punctuation is removed,
    so contractions such as "don\u2019t" survive as "don't" instead of being
    collapsed to "dont".

    Args:
        text: Raw lyrics string.

    Returns:
        The text reduced to word characters, whitespace and ``'``. Any lone
        "\u00e2" still present after stripping is also turned into ``'``.
    """
    # Normalise apostrophe variants first; the punctuation filter below would
    # otherwise delete them. \u00e2\u20ac\u2122 / \u00e2\u20ac\u02dc are U+2019 / U+2018
    # after a UTF-8 -> cp1252 mis-decode.
    text = re.sub(r'\u00e2\u20ac[\u2122\u02dc]|[\u2018\u2019]', '\'', text)
    # Drop everything that is not a word character, whitespace or apostrophe.
    text = re.sub(r'[^\w\s\']', '', text)
    # Fallback: a lone "â" is what remains of a mis-decoded apostrophe whose
    # trailing non-word characters were removed by the filter above.
    text = re.sub(r'â', '\'', text)
    return text

In [334]:
def create_ngrams(text, n=1):
    """Split ``text`` into overlapping word n-grams.

    The text is lower-cased and passed through ``clean_text``; words are then
    extracted, each optionally carrying one apostrophe suffix (e.g. "she's").

    Args:
        text: Lyrics string to tokenise.
        n: Number of consecutive words per n-gram.

    Returns:
        A list of space-joined n-grams in order of appearance; an empty list
        when ``n`` is below 1 or the text has fewer than ``n`` words.
    """
    normalized = clean_text(text.lower())
    tokens = re.findall(r'\b\w+(?:\'\w+)?\b', normalized)
    if n < 1:
        return []

    grams = []
    window_count = len(tokens) - n + 1
    for start in range(window_count):
        window = tokens[start:start + n]
        grams.append(' '.join(window))
    return grams


In [350]:
def get_top_ngrams(df, n=1, top=20, song_names=None):
    """Return the most frequent n-grams found in the lyrics of ``df``.

    Args:
        df: Frame with "song_name" and "song_lyrics" columns.
        n: Number of words per n-gram.
        top: How many of the most common n-grams to return.
        song_names: Optional iterable of song names; when truthy, only rows
            whose "song_name" is in it are counted.

    Returns:
        A list of ``(ngram, count)`` tuples as produced by
        ``Counter.most_common(top)``.
    """
    selected = df
    if song_names:
        wanted = [str(name) for name in song_names]
        selected = df.filter(pl.col("song_name").is_in(wanted))

    # Accumulate counts song by song, in row order.
    counts = Counter()
    for lyrics in selected['song_lyrics'].to_list():
        counts.update(create_ngrams(text=lyrics, n=n))

    return counts.most_common(top)

In [351]:
def ngrams_to_df(df, n: int = 1, top: int = 40, song_names=None):
    """Return the ``top`` most frequent n-grams as a two-column frame.

    Args:
        df: Frame with "song_name" and "song_lyrics" columns.
        n: Number of words per n-gram.
        top: How many of the most common n-grams to include.
        song_names: Optional iterable of song names to restrict the count to.

    Returns:
        A polars DataFrame with columns "ngram" and "count", one row per
        n-gram, ordered from most to least frequent.
    """
    top_ngrams = get_top_ngrams(df, n, top=top, song_names=song_names)
    # Each element is one (ngram, count) row. Say so explicitly: without
    # ``orient`` polars has to infer the orientation from the data's
    # dimensions, which is ambiguous when the result has as many rows as
    # columns (e.g. top=2).
    ngrams_df = pl.DataFrame(top_ngrams, schema=['ngram', 'count'], orient='row')
    return ngrams_df

In [352]:
# Unigrams: the 40 most frequent single words across all songs
print(ngrams_to_df(df, n=1, top=40))

shape: (40, 2)
┌───────┬───────┐
│ ngram ┆ count │
│ ---   ┆ ---   │
│ str   ┆ i64   │
╞═══════╪═══════╡
│ the   ┆ 895   │
│ i     ┆ 686   │
│ you   ┆ 670   │
│ a     ┆ 599   │
│ …     ┆ …     │
│ when  ┆ 105   │
│ was   ┆ 104   │
│ what  ┆ 103   │
│ can   ┆ 102   │
└───────┴───────┘


In [353]:
# Bigrams: the 40 most frequent two-word sequences across all songs
print(ngrams_to_df(df, n=2, top=40))

shape: (40, 2)
┌─────────────┬───────┐
│ ngram       ┆ count │
│ ---         ┆ ---   │
│ str         ┆ i64   │
╞═════════════╪═══════╡
│ in the      ┆ 111   │
│ on the      ┆ 80    │
│ i got       ┆ 62    │
│ of the      ┆ 57    │
│ …           ┆ …     │
│ i like      ┆ 27    │
│ she's a     ┆ 27    │
│ that's what ┆ 27    │
│ me cum      ┆ 27    │
└─────────────┴───────┘


In [354]:
# Trigrams: the 40 most frequent three-word sequences across all songs
print(ngrams_to_df(df, n=3, top=40))

shape: (40, 2)
┌──────────────────────┬───────┐
│ ngram                ┆ count │
│ ---                  ┆ ---   │
│ str                  ┆ i64   │
╞══════════════════════╪═══════╡
│ really really really ┆ 33    │
│ weenie weenie weenie ┆ 30    │
│ let me cum           ┆ 26    │
│ me cum in            ┆ 26    │
│ …                    ┆ …     │
│ out the lights       ┆ 15    │
│ ah yeah beautiful    ┆ 14    │
│ yeah beautiful girls ┆ 14    │
│ when you came        ┆ 14    │
└──────────────────────┴───────┘


In [355]:
# 4-grams: the 40 most frequent four-word sequences across all songs
print(ngrams_to_df(df, n=4, top=40))

shape: (40, 2)
┌─────────────────────────────┬───────┐
│ ngram                       ┆ count │
│ ---                         ┆ ---   │
│ str                         ┆ i64   │
╞═════════════════════════════╪═══════╡
│ let me cum in               ┆ 26    │
│ really really really really ┆ 24    │
│ weenie weenie weenie weenie ┆ 24    │
│ i got what you              ┆ 23    │
│ …                           ┆ …     │
│ when you came in            ┆ 12    │
│ you came in and             ┆ 12    │
│ came in and blew            ┆ 12    │
│ in and blew me              ┆ 12    │
└─────────────────────────────┴───────┘


In [356]:
# 5-grams: the 40 most frequent five-word sequences across all songs
print(ngrams_to_df(df, n=5, top=40))

shape: (40, 2)
┌───────────────────────────────────┬───────┐
│ ngram                             ┆ count │
│ ---                               ┆ ---   │
│ str                               ┆ i64   │
╞═══════════════════════════════════╪═══════╡
│ that's what girls are for         ┆ 23    │
│ i got what you want               ┆ 20    │
│ weenie weenie weenie weenie ween… ┆ 18    │
│ let me cum in let                 ┆ 17    │
│ …                                 ┆ …     │
│ your bitch i'm not your           ┆ 11    │
│ bitch i'm not your bitch          ┆ 11    │
│ why can't you trust me            ┆ 11    │
│ can't you trust me baby           ┆ 11    │
└───────────────────────────────────┴───────┘


In [358]:
# Bigrams (n=2) restricted to two songs: '17_Girls_In_A_Row' and "Ain't_Dead_Yet"
print(ngrams_to_df(df, n=2, top=40, song_names=['17_Girls_In_A_Row', 'Ain\'t_Dead_Yet']))

shape: (40, 2)
┌─────────────────┬───────┐
│ ngram           ┆ count │
│ ---             ┆ ---   │
│ str             ┆ i64   │
╞═════════════════╪═══════╡
│ girls in        ┆ 9     │
│ in a            ┆ 8     │
│ a row           ┆ 8     │
│ seventeen girls ┆ 8     │
│ …               ┆ …     │
│ his dues        ┆ 2     │
│ row last        ┆ 1     │
│ last night      ┆ 1     │
│ night and       ┆ 1     │
└─────────────────┴───────┘
