In [3]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, strip_multiple_whitespaces, strip_punctuation
from gensim import corpora
from collections import defaultdict
import pprint
import re
from gensim import models
from scipy.sparse import lil_matrix, hstack, csr_matrix, vstack
import gensim.downloader as api
from nltk.stem.wordnet import WordNetLemmatizer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [6]:
def remove_specific_words(s):
    """Replace boilerplate tokens in a raw lyrics string with a single space.

    Patterns replaced:
      * ``Lyrics`` when it starts at a word boundary,
      * each ``[...]`` span that sits on one line (presumably section tags
        such as ``[Chorus]`` -- confirm against the data),
      * ``<integer> Contributors``,
      * the literal ``Embed`` anywhere in the text.

    Args:
        s: Raw lyrics text.

    Returns:
        The text with every match replaced by one space. Whitespace is not
        collapsed here.
    """
    s = re.sub(r"\bLyrics", " ", s)
    # Non-greedy on purpose: a greedy ``.+`` spans from the first "[" to the
    # last "]" on a line and would delete any lyrics between two bracketed spans.
    s = re.sub(r"\[.+?\]", " ", s)
    s = re.sub(r"\b\d+\b Contributors", " ", s)
    s = re.sub(r"Embed", " ", s)
    return s

# Load the song table and the per-song "active years" lookup table.
df = pd.read_csv("data/billboard_lyrics_genres.csv")
df_activeyear = pd.read_csv("data/first_active_years.csv")
df_activeyear = df_activeyear.drop_duplicates(subset=["band_singer","title","year"],ignore_index=True)

# Attach active_years with a single left join instead of scanning the whole
# lookup table once per song. The lookup is unique on the key columns (see
# drop_duplicates above), so the join keeps one output row per song, in order.
# Rows with a missing key are dropped from the lookup so that NaN keys never
# match each other; songs without a match get 0.
song_keys = ["band_singer", "title", "year"]
active_lookup = df_activeyear.dropna(subset=song_keys)[song_keys + ["active_years"]]
matched_years = df[song_keys].merge(active_lookup, on=song_keys, how="left")["active_years"]
df["active_years"] = matched_years.fillna(0).astype(int).to_numpy()

# Songs that did get an active_years value; taken before the lyrics are cleaned below.
df_tmp = df.loc[df["active_years"]!=0].reset_index(drop=True)

# Lyrics clean-up: drop boilerplate tokens, then lower-case.
df["lyrics"] = df["lyrics"].map(remove_specific_words)
df["lyrics"] = df["lyrics"].map(str.lower)
lemmatizer = WordNetLemmatizer()
# NOTE(review): lemmatize() is handed the entire lyrics string here, while the
# NLTK API is documented per word, so this step likely leaves the text
# unchanged. Confirm whether per-token lemmatization was intended before
# changing it (doing so would alter the corpus a saved model was fitted on).
df["lyrics"] = df["lyrics"].map(lemmatizer.lemmatize)

# Era label: 0 for years before 1970, 1 for 1970-1979, ..., 5 for 2010 onwards.
# np.digitize returns, for each year, how many of the finite edges are <= year,
# which is exactly the index into `labels`. The trailing np.inf edge is left out
# because nothing can reach it.
bins = [1970,1980,1990,2000,2010,np.inf]
labels = [0,1,2,3,4,5]
df["label"] = np.asarray(labels)[np.digitize(df["year"].to_numpy(), bins[:-1])]

In [7]:
def strip_changerow(l):
    """Return ``l`` with every carriage return and every line feed turned into a space.

    Each ``\\r`` and each ``\\n`` is replaced individually, so a ``\\r\\n`` pair
    becomes two spaces.
    """
    return l.replace("\r", " ").replace("\n", " ")

# corpus: one cleaned text per stanza (lyrics are split on blank lines, "\r\n\r\n").
# year:   the song's year repeated once per stanza, parallel to corpus.
corpus = []
year = []

# Iterate lyrics and years positionally; extend() appends in place instead of
# rebuilding both lists on every song.
for song_lyrics, song_year in zip(df["lyrics"], df["year"].to_numpy()):
    stanzas = map(strip_changerow, song_lyrics.split("\r\n\r\n"))
    # Punctuation is replaced by spaces, so collapse whitespace afterwards;
    # otherwise the runs of spaces left by the punctuation pass survive.
    stanzas = map(strip_punctuation, stanzas)
    stanzas = map(strip_multiple_whitespaces, stanzas)
    stanzas = [text for text in (s.strip(' ') for s in stanzas) if text != ""]
    corpus.extend(stanzas)
    year.extend([song_year] * len(stanzas))

To train the model yourself or adjust its parameters, edit and run the following cell. Otherwise, skip it and load the saved model in the cell after it.

In [49]:
# Encode every stanza once up front; the embeddings are handed to fit_transform
# below so they are not recomputed during fitting.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(corpus)

# UMAP is stochastic; a fixed random_state makes the reduced space, and hence the
# resulting topics, repeatable across runs.
umap_model = UMAP(n_neighbors=10,n_components=10,metric='cosine',low_memory=False,random_state=42)
# prediction_data=True is kept from the original configuration.
hdbscan_model = HDBSCAN(min_cluster_size=10,metric="euclidean",prediction_data=True)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=.4)
# Topic representations use unigrams and bigrams (n_gram_range=(1,2)).
topic_model = BERTopic(embedding_model=sentence_model,verbose=True,n_gram_range=(1,2),
                       umap_model=umap_model,hdbscan_model=hdbscan_model,ctfidf_model=ctfidf_model,
                       representation_model=representation_model)
topics, probs = topic_model.fit_transform(corpus,embeddings)

2023-05-26 23:27:58,869 - BERTopic - Reduced dimensionality
2023-05-26 23:28:10,253 - BERTopic - Clustered reduced embeddings


In [4]:
# Load the model stored under "BERTmodel" instead of retraining. This rebinds
# topic_model, replacing any model fitted earlier in the session.
# NOTE(review): presumably a pickle-based file -- only load model files you trust.
topic_model = BERTopic.load("BERTmodel")

In [8]:
# Compute topics over time from the stanza texts (corpus) and their parallel
# list of years, grouping the timestamps into 20 bins (nr_bins=20).
# The recorded run of this cell took roughly 15 minutes.
topics_over_time = topic_model.topics_over_time(corpus,year,nr_bins=20)

20it [14:48, 44.45s/it]


In [11]:
# Plot the topics-over-time table, limited to 20 topics (top_n_topics=20).
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [52]:
# Write whatever model topic_model currently refers to (freshly trained, or the
# one loaded from disk) to "BERTmodel".
# NOTE(review): this is the same path the load cell reads from -- run it only
# after training if the intent is to persist a new model.
topic_model.save("BERTmodel")


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.

