## Gensim Model

# Inhaltsverzeichnis
1. [Daten einlesen, wählen und vorbereiten](#introduction)
2. [Gensim Modell vorbereiten](#paragraph0)
3. [Topic Modelling aller Daten mit raw text](#paragraph1)
4. [Topic Modelling aller Daten mit POS](#paragraph2)
5. [Topic Modelling der vier besten Genres](#paragraph3)

In [5]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import Phrases

from gensim.models.phrases import Phraser
import pandas as pd

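Referenzen:

- Gensim-Tutorial „LDA Model“: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html
- Machine Learning Plus, „Topic Modeling with Gensim (Python)“: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/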

#### Daten einlesen, wählen und vorbereiten <a name="introduction"></a>

In [12]:
# Load the song dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer='../../datasets/currently_dataset/song_decades_long.csv',
    index_col=[0],
)
# Preview a single row to check the available columns.
df.head(n=1)

Unnamed: 0,artist,decades,text,stemmed_text,POS,Genre1,genre1,number_of_tokens,number_of_types
0,"""Weird Al"" Yankovic",1980s,potato skins potato cakes hash browns and i...,potato skin potato cake hash brown and instan...,french plain enough greasy sour lumpy tough en...,Rock,antiviral pop,12745.0,2264.0


In [13]:
# Restrict the data to four genres. Each genre gets its own frame
# (data1..data4); `genres` stacks them genre by genre, so rows are grouped
# by genre rather than kept in their original order.
selected_genres = ('Jazz', 'Hip Hop', 'Electronic', 'Folk, World, & Country')
data1, data2, data3, data4 = (
    df.loc[df['Genre1'] == genre_name] for genre_name in selected_genres
)
genres = pd.concat([data1, data2, data3, data4], axis=0)
genres.head(n=1)

Unnamed: 0,artist,decades,text,stemmed_text,POS,Genre1,genre1,number_of_tokens,number_of_types
786,Billie Holiday,1950s,heaven i'm in heaven and my heart beats so t...,heaven i 'm in heaven and my heart beat so th...,lucky much much much much heart happiness chee...,Jazz,adult standards,1074.0,266.0


#### Gensim Modell vorbereiten <a name="paragraph0"></a>

In [20]:
def create_docs(df, column):
    """Tokenise one text column of a DataFrame into a list of token lists.

    Every cell of ``df[column]`` is split on whitespace with ``str.split()``;
    no lower-casing, stop-word removal or other normalisation happens here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    column : str
        Name of the column that contains the text to tokenise.

    Returns
    -------
    list of list of str
        One token list per row, in row order. Cells that are not strings
        (e.g. ``NaN``/``None`` for a missing text) yield an empty token list,
        so the result stays aligned with the rows of ``df``.
    """
    # Iterate over the column itself: ``iterrows`` would build a complete
    # Series for every row only to read a single cell from it.
    return [
        cell.split() if isinstance(cell, str) else []
        for cell in df[column]
    ]

In [21]:
def build_dictionary(docs, no_below=20, no_above=0.5):
    """Build a gensim ``Dictionary`` and the matching bag-of-words corpus.

    Parameters
    ----------
    docs : list of list of str
        Tokenised documents.
    no_below : int, optional
        Passed to ``Dictionary.filter_extremes``: drop tokens that occur in
        fewer than this many documents. Defaults to 20.
    no_above : float, optional
        Passed to ``Dictionary.filter_extremes``: drop tokens that occur in
        more than this fraction of all documents. Defaults to 0.5.

    Returns
    -------
    tuple
        ``(dictionary, corpus)`` where ``corpus`` holds one
        ``dictionary.doc2bow(doc)`` result per input document, in input order.
    """
    dictionary = Dictionary(docs)
    # Prune the vocabulary before vectorising so that the corpus only
    # contains ids of the tokens that survived the filter.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus

In [22]:
def build_model(df, column, num_topics=8, chunksize=2000, passes=20, iterations=400, eval_every=None,
                random_state=None):
    """Tokenise a text column, build dictionary and corpus, and train an LDA model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    column : str
        Column of ``df`` whose text is modelled.
    num_topics : int, optional
        Number of topics to fit.
    chunksize : int, optional
        Forwarded to ``LdaModel`` (documents per training chunk).
    passes : int, optional
        Forwarded to ``LdaModel`` (passes over the corpus).
    iterations : int, optional
        Forwarded to ``LdaModel`` (iteration cap per document).
    eval_every : int or None, optional
        Forwarded to ``LdaModel``.
    random_state : int or numpy.random.RandomState or None, optional
        Forwarded to ``LdaModel``. Pass a fixed integer to make the fitted
        topics reproducible across runs; ``None`` keeps gensim's default
        (unseeded) behaviour.

    Returns
    -------
    tuple
        ``(model, dictionary, corpus)``.
    """
    docs = create_docs(df, column)
    dictionary, corpus = build_dictionary(docs)

    model = LdaModel(
        corpus=corpus,
        # The Dictionary itself is a valid id -> token mapping for LdaModel,
        # so there is no need to force-load ``dictionary.id2token`` first.
        id2word=dictionary,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
        random_state=random_state,
    )
    return model, dictionary, corpus

In [23]:
# top_topics = model.top_topics(corpus) #, num_words=20)

# # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
# print('Average topic coherence: %.4f.' % avg_topic_coherence)

# pprint(top_topics)

In [24]:
def visualise_LDA(model, corpus, dictionary):
    """Prepare the pyLDAvis visualisation for a trained gensim LDA model.

    Calls ``pyLDAvis.enable_notebook()`` first and then returns the object
    produced by ``pyLDAvis.gensim.prepare``. Note the argument order:
    model, then corpus, then dictionary.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim.prepare(model, corpus, dictionary)

In [25]:
def bi_trigramm(df, column):
    """Train bigram and trigram phrase models on one text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one document per row.
    column : str
        Column of ``df`` whose text is tokenised and used for training.

    Returns
    -------
    tuple
        ``(bigram_mod, trigram_mod, bigram_text)``: the two ``Phraser``
        objects and the whitespace-tokenised documents they were trained on.
        Apply them with ``trigram_mod[bigram_mod[doc]]`` for each ``doc`` in
        ``bigram_text``.
    """
    # Reuse the notebook's tokeniser so phrase detection sees the same
    # tokens as the plain topic model.
    bigram_text = create_docs(df, column)

    # A higher threshold yields fewer phrases.
    bigram = Phrases(bigram_text, min_count=2, threshold=10)
    # The trigram model is trained on the bigram-merged documents, so a
    # detected bigram can be joined with a third token.
    trigram = Phrases(bigram[bigram_text], threshold=100)

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    return bigram_mod, trigram_mod, bigram_text

In [17]:
# Apply the bigram model and then the trigram model to every tokenised document.
# NOTE(review): in the visible notebook `trigram_mod`, `bigram_mod` and
# `bigram_text` are only ever assigned inside `bi_trigramm`, never at module
# level, so on a fresh kernel this cell presumably fails with NameError.
# TODO confirm where these names are supposed to come from.
trigramme = [trigram_mod[bigram_mod[doc]] for doc in bigram_text]

#### Topic Modelling aller Daten mit raw text <a name="paragraph1"></a>

In [26]:
# Fit LDA on the raw `text` column of the full dataset, using build_model's
# default hyperparameters.
model, dictionary, corpus = build_model(df, 'text')

In [27]:
# Report the vocabulary size (after frequency filtering), the number of
# bag-of-words documents, and the fitted topics.
print('Unique tokens: %d' % len(dictionary))
print('Documents: %d' % len(corpus))
pprint(model.print_topics())

Unique tokens: 4817
Documents: 1038
[(0,
  '0.007*"wanna" + 0.006*"ooh" + 0.005*"forever" + 0.004*"somebody" + '
  '0.004*"sing" + 0.004*"da" + 0.004*"we\'ll" + 0.004*"wait" + 0.004*"lonely" '
  '+ 0.004*"waiting"'),
 (1,
  '0.075*"la" + 0.016*"ooh" + 0.011*"dance" + 0.010*"lord" + 0.008*"hot" + '
  '0.007*"\'bout" + 0.005*"-" + 0.005*"gimme" + 0.005*"rock" + 0.005*"sweet"'),
 (2,
  '0.015*"wanna" + 0.015*"na" + 0.005*"die" + 0.005*"ooh" + 0.004*"[verse" + '
  '0.004*"hell" + 0.004*"christmas" + 0.004*"hate" + 0.004*"kiss" + '
  '0.004*"wish"'),
 (3,
  '0.009*"ya" + 0.009*"rock" + 0.005*"roll" + 0.005*"everybody" + 0.004*"eat" '
  '+ 0.004*"dance" + 0.004*"drink" + 0.004*"fun" + 0.004*"tha" + 0.003*"ha"'),
 (4,
  '0.003*"road" + 0.003*"die" + 0.003*"rain" + 0.003*"went" + 0.003*"lord" + '
  '0.003*"wind" + 0.003*"saw" + 0.003*"dead" + 0.002*"blue" + 0.002*"water"'),
 (5,
  '0.014*"nigga" + 0.013*"shit" + 0.012*"fuck" + 0.012*"ya" + 0.009*"bitch" + '
  '0.009*"niggas" + 0.008*"money" + 

In [31]:
# Interactive pyLDAvis view of the raw-text model, built with the shared helper.
# Argument order matters: model, corpus, dictionary.
vis = visualise_LDA(model, corpus, dictionary)
vis

In [125]:
# Dictionary and bag-of-words corpus for the trigram-merged documents.
# build_dictionary applies the same frequency filter (no_below=20,
# no_above=0.5) that this cell previously spelled out by hand.
# Note: this rebinds `dictionary` and `corpus` from the raw-text model.
dictionary, corpus = build_dictionary(trigramme)

In [126]:
# Training parameters
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # passed through to LdaModel unchanged
random_state = 42  # fixed seed so the fitted topics are reproducible on re-run

# Dictionary.id2token is filled lazily; indexing the dictionary once
# populates it before it is handed to the model.
temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [127]:
# Each entry of `top_topics` is a pair: (list of (weight, word), coherence).
top_topics = model.top_topics(corpus)

# Average topic coherence is the mean of the per-topic coherence scores.
# Dividing by len(top_topics) keeps this cell independent of a separately
# maintained `num_topics` variable.
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)


Average topic coherence: -0.8081.
[([(0.007443727, 'god'),
   (0.0060704616, 'alright'),
   (0.005858217, 'hate'),
   (0.005264296, 'truth'),
   (0.0050690686, 'everybody'),
   (0.0044107535, 'trying'),
   (0.00439047, 'fight'),
   (0.0042379373, 'hope'),
   (0.0042061238, 'help'),
   (0.0042047002, 'today'),
   (0.0041669663, 'die'),
   (0.004121371, 'running'),
   (0.0040058745, 'wanna'),
   (0.0038664376, 'waiting'),
   (0.0037697405, 'lie'),
   (0.0037447806, 'getting'),
   (0.0037367048, 'yourself'),
   (0.0036204453, "what's"),
   (0.0034642632, 'because'),
   (0.0032913785, 'sing')],
  -0.6240113907125365),
 ([(0.0044751475, 'dance'),
   (0.004287026, 'dead'),
   (0.0034127722, 'drink'),
   (0.0030171676, 'friends'),
   (0.0029116343, 'went'),
   (0.0028661827, 'blue'),
   (0.0027222398, "let's"),
   (0.0025972847, 'living'),
   (0.002551919, 'beat'),
   (0.0024601712, 'may'),
   (0.0024471215, 'pretty'),
   (0.002439964, 'song'),
   (0.002432621, 'god'),
   (0.0024221805, 'make

In [128]:
# Show the topics of the model trained in the preceding training cell.
pprint(model.print_topics())

[(0,
  '0.015*"nigga" + 0.015*"ya" + 0.015*"shit" + 0.014*"fuck" + 0.010*"bitch" + '
  '0.010*"niggas" + 0.009*"money" + 0.006*"yo" + 0.006*"\'em" + 0.006*"ass"'),
 (1,
  '0.006*"die" + 0.005*"blood" + 0.004*"dreams" + 0.004*"dark" + 0.004*"bring" '
  '+ 0.004*"rain" + 0.003*"wind" + 0.003*"waiting" + 0.003*"death" + '
  '0.003*"open"'),
 (2,
  '0.005*"money" + 0.005*"town" + 0.004*"woman" + 0.004*"rock" + 0.004*"song" '
  '+ 0.004*"says" + 0.003*"went" + 0.003*"road" + 0.003*"roll" + '
  '0.003*"sweet"'),
 (3,
  '0.018*"wanna" + 0.006*"we\'ll" + 0.005*"let\'s" + 0.005*"kiss" + '
  '0.004*"alright" + 0.004*"you." + 0.004*"because" + 0.004*"save" + '
  '0.004*"nobody" + 0.004*"wish"'),
 (4,
  '0.029*"wanna" + 0.012*"dance" + 0.010*"let\'s" + 0.009*"ooh" + 0.008*"ya" + '
  '0.008*"body" + 0.007*"ready" + 0.006*"rock" + 0.006*"girls" + '
  '0.006*"party"'),
 (5,
  '0.023*"lord" + 0.014*"-" + 0.010*"god" + 0.007*"sing" + 0.007*"jesus" + '
  '0.007*"heaven" + 0.005*"mama" + 0.005*"born" + 0

In [129]:
# Interactive pyLDAvis view of the current model, built with the shared helper
# (argument order: model, corpus, dictionary).
vis = visualise_LDA(model, corpus, dictionary)
vis