In [4]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [6]:
import nltk

# German stop-word list, used further down to filter tokens.
# NLTK raises LookupError when the "stopwords" corpus is not installed yet,
# so fetch it once on demand instead of failing on a fresh environment.
try:
    stopwords = nltk.corpus.stopwords.words('german')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('german')

In [7]:
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
# Folders of the three XML song corpora, relative to the working directory.
frg_path = "Songs/BRD-Charts/1_xml/"
gdr_path = "Songs/DDR/1_xml/"
ndw_path = "Songs/NDW/1_xml/"

# os.listdir returns entries in arbitrary order. Sorting the listing keeps the
# row order of the DataFrame built from these files - and therefore the seeded
# .sample(random_state=42) drawn from it later - identical on every machine.
xml_frg = [frg_path + f for f in sorted(os.listdir(frg_path)) if f.endswith('.xml')]
xml_gdr = [gdr_path + f for f in sorted(os.listdir(gdr_path)) if f.endswith('.xml')]
xml_ndw = [ndw_path + f for f in sorted(os.listdir(ndw_path)) if f.endswith('.xml')]

# One list of file paths per corpus, in the order FRG, GDR, NDW.
all_xml = [xml_frg, xml_gdr, xml_ndw]

In [8]:
# Parse every song file into one flat record (category, year, title, author,
# lyrics) and collect the records for a single DataFrame construction.
data = []
for category in all_xml:
    for xml_file in category:
        with open(xml_file, 'r', encoding='utf-8') as file:
            xml_data = file.read()
        soup = BeautifulSoup(xml_data, 'xml')

        # The category label is the second path component, cut at the first
        # hyphen (e.g. "BRD-Charts" -> "BRD").
        corpus_dir = xml_file.split("/")[1]
        cat = corpus_dir.split("-")[0]

        year = int(soup.find("date").text)
        title = soup.find("title").text
        author = soup.find("author").text

        # Lyrics are the <l> elements inside the <div1 type="song"> element,
        # joined with single spaces.
        song_div = soup.find("div1", attrs={"type":"song"})
        text = " ".join(line.text for line in song_div.find_all("l"))

        record = {'Category': cat, 'Year': year, 'Title': title, 'Author': author, 'Text': text}
        data.append(record)

df = pd.DataFrame(data)
        

In [9]:
# All three sub-corpora are restricted to the same period (bounds inclusive).
in_period = df['Year'].between(1970, 1990)

ndw_filter = df["Category"].eq("NDW") & in_period
brd_filter = df["Category"].eq("BRD") & in_period
ddr_filter = df["Category"].eq("DDR") & in_period

# Draw a fixed-size, seeded sample of NDW songs and append it to the BRD songs.
ndw_sub = df.loc[ndw_filter].sample(n=202, random_state=42)
brd_sub = pd.concat([df.loc[brd_filter], ndw_sub], ignore_index=True)

# Final frame: DDR songs first, then the combined BRD + NDW songs.
df = pd.concat([df.loc[ddr_filter], brd_sub], ignore_index=True)

# Relabel the NDW songs as BRD, because they belong to the BRD.
df['Category'] = df['Category'].replace('NDW', 'BRD')
# According to the original author, the remaining DataFrame holds 1000 songs
# from the GDR and 1000 songs from the FRG, all from 1970 to 1990
# (these counts are not checked here).

In [13]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with its default arguments (``deacc``
    is not enabled here). One token list is yielded per input item.
    """
    yield from (gensim.utils.simple_preprocess(str(item)) for item in sentences)

# Materialise the generator: one token list per row of df["Text"].
lyrics = df["Text"]
data_words = list(sent_to_words(lyrics))

# Show the first tokenized document as a sanity check.
print(data_words[:1])

[['kennst', 'du', 'das', 'land', 'mit', 'seinen', 'alten', 'eichen', 'das', 'land', 'von', 'einstein', 'von', 'karl', 'marx', 'und', 'bach', 'wo', 'jede', 'antwort', 'endet', 'mit', 'dem', 'fragezeichen', 'wo', 'ich', 'ein', 'zimmer', 'habe', 'unterm', 'dach', 'wo', 'sich', 'so', 'viele', 'wegen', 'früher', 'oft', 'noch', 'schämen', 'wo', 'mancher', 'vater', 'eine', 'frage', 'nicht', 'versteht', 'wo', 'ihre', 'kinder', 'ihnen', 'das', 'nicht', 'übelnehmen', 'weil', 'seine', 'antwort', 'im', 'geschichtsbuch', 'steht', 'hier', 'schaff', 'ich', 'selber', 'was', 'ich', 'einmal', 'werde', 'hier', 'geb', 'ich', 'meinem', 'leben', 'einen', 'sinn', 'hier', 'hab', 'ich', 'meinen', 'teil', 'von', 'unsrer', 'erde', 'der', 'kann', 'so', 'werden', 'wie', 'ich', 'selber', 'bin', 'das', 'ist', 'das', 'land', 'mit', 'seinen', 'seen', 'und', 'wäldern', 'das', 'kleine', 'land', 'das', 'man', 'an', 'einem', 'tag', 'durchfährt', 'wo', 'man', 'was', 'wird', 'auch', 'ohne', 'seine', 'eltern', 'doch', 'auch'

In [19]:
# Train the phrase detectors; a higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, threshold=50, min_count=5)

# The second detector is trained on the bigram-transformed corpus.
bigram_corpus = bigram[data_words]
trigram = gensim.models.Phrases(bigram_corpus, threshold=50)

# Phraser wrappers: the faster way to turn a token list into its
# bigram/trigram form.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Bigram example: the second document after phrase merging.
print(bigram_mod[data_words[1]])

['abends', 'wartest', 'du', 'ganz', 'zufällig', 'am', 'bus', 'du', 'weißt', 'genau', 'zu', 'welcher', 'zeit', 'ich', 'kommen', 'muss', 'du', 'legst', 'dein', 'lächeln', 'auf', 'und', 'tust', 'immer', 'vertraut', 'dass', 'jeder', 'denken', 'muss', 'ich', 'wäre', 'deine', 'braut', 'deine', 'braut', 'abends', 'kommst', 'du', 'an', 'und', 'lädst', 'dich', 'selber', 'ein', 'und', 'immer', 'mit', 'der', 'flasche', 'wein', 'sprichst', 'große', 'dinge', 'und', 'machst', 'es', 'dir', 'bequem', 'und', 'kannst', 'dann', 'gar', 'nicht', 'lachen', 'wenn', 'ich', 'dann', 'sage', 'du', 'sollst', 'geh', 'du', 'sollst', 'geh', 'hohoho', 'du', 'kommst', 'nicht', 'in', 'mein', 'bett', 'hohoho', 'machst', 'du', 'auch', 'höflich', 'und', 'nett', 'hohoho', 'alles', 'routine', 'ich', 'hab', 'dich', 'erkannt', 'du', 'kommst', 'nicht', 'in', 'mein', 'bett', 'vor', 'deinen', 'freunden', 'drehst', 'du', 'und', 'spielst', 'den', 'king', 'als', 'wär', 'ich', 'nur', 'ein', 'kleiner', 'fisch', 'der', 'an', 'dir', 'h

In [27]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return the documents in ``texts`` with stop words removed.

    Each document is converted with ``str`` and re-tokenized by
    ``simple_preprocess``; tokens found in the module-level ``stopwords``
    collection are dropped.

    Args:
        texts: iterable of documents (anything ``str`` can convert).

    Returns:
        A list with one token list per input document.
    """
    # Build the lookup set once per call: testing membership against the
    # stop-word list directly would be a linear scan for every single token.
    stop_set = set(stopwords)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to each document in ``texts``.

    Returns a list holding one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document in ``texts``.

    Returns a list holding one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged



def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model=None):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    The tokens of each document are joined with single spaces and run through
    the pipeline; every resulting token whose ``pos_`` is contained in
    ``allowed_postags`` is kept as its lower-cased ``lemma_``.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: container of ``pos_`` values to keep. The default is
            a tuple so that no mutable object is shared between calls.
        nlp_model: callable that turns a string into an iterable of tokens
            exposing ``lemma_`` and ``pos_``. When ``None``, the module-level
            ``nlp`` object is looked up at call time, as before.

    Returns:
        A list with one list of lower-cased lemmas per input document.

    See https://spacy.io/api/annotation for the annotation scheme.
    """
    # Resolve the pipeline once per call; the global is only touched when the
    # caller did not inject a pipeline explicitly.
    pipeline = nlp if nlp_model is None else nlp_model
    texts_out = []
    for sent in texts:
        doc = pipeline(" ".join(sent))
        texts_out.append([token.lemma_.lower() for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [47]:
# Load the German spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load('de_core_news_md', disable=['parser', 'ner'])

# Strip stop words, then merge detected bigrams into single tokens.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize and keep nouns only.
noun_only = ['NOUN']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=noun_only)

# Show the first lemmatized document.
print(data_lemmatized[:1])

[['land', 'einstein', 'antwort', 'fragezeichen', 'dach', 'vater', 'frage', 'kind', 'antwort', 'geschichtsbuch', 'schaff', 'geb', 'sinn', 'teil', 'erde', 'seen', 'land', 'tag', 'eltern', 'beziehung', 'klassenzimmer', 'terpentin', 'lehrer', 'mütze', 'schaff', 'geb', 'sinn', 'teil', 'erde', 'land', 'fabrik', 'wohnungsämtern', 'hoffnung', 'parteitag', 'leute', 'werkzeug', 'schaff', 'geb', 'sinn', 'teil', 'erde', 'land', 'problem', 'züge', 'fenster', 'kind', 'mutter', 'trümmerhaufe', 'land', 'mutter']]


In [61]:
# Map every lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)
# Alias for the lemmatized documents, used to build the corpus below.
texts = data_lemmatized

# Vocabulary size before and after pruning. Tokens that occur in fewer than
# 5 documents are dropped; the other filter_extremes parameters keep their
# defaults.
size_label = 'Total Vocabulary Size:'
print(size_label, len(id2word))
id2word.filter_extremes(no_below=5)
print(size_label, len(id2word))

# Bag-of-words representation: (token id, count) pairs for each document.
corpus = list(map(id2word.doc2bow, texts))

# Show the first document's bag of words.
print(corpus[:1])

Total Vocabulary Size: 7084
Total Vocabulary Size: 906
[[(0, 2), (1, 1), (2, 1), (3, 3), (4, 1), (5, 1), (6, 1), (7, 3), (8, 1), (9, 2), (10, 5), (11, 1), (12, 1), (13, 2), (14, 1), (15, 3), (16, 3), (17, 1), (18, 3), (19, 1), (20, 1)]]


In [62]:
# Hyper-parameters of the LDA topic model, gathered in one place.
lda_params = dict(
    num_topics=15,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=1000,
    passes=100,
    alpha='auto',
    per_word_topics=True,
)

# Train the model on the bag-of-words corpus with its id -> token mapping.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_params)

In [63]:
# Pretty-print the topic summaries returned by the trained model.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

# Apply the model to the corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.145*"lied" + 0.072*"komm" + 0.036*"rock_roll" + 0.030*"frieden" + '
  '0.030*"band" + 0.027*"erde" + 0.019*"ende" + 0.017*"schritt" + '
  '0.017*"mensch" + 0.016*"krieg"'),
 (1,
  '0.147*"auge" + 0.073*"brauch" + 0.071*"tür" + 0.038*"papa" + '
  '0.030*"tanz_tanz" + 0.025*"glas" + 0.025*"gefühl" + 0.024*"haar" + '
  '0.024*"stern" + 0.022*"stück"'),
 (2,
  '0.097*"land" + 0.093*"wind" + 0.067*"stein" + 0.047*"stadt" + 0.045*"sand" '
  '+ 0.043*"name" + 0.043*"tod" + 0.037*"bild" + 0.028*"mensch" + '
  '0.019*"erinnerung"'),
 (3,
  '0.119*"kind" + 0.113*"tag" + 0.092*"haus" + 0.035*"welt" + 0.022*"wasser" + '
  '0.017*"blick" + 0.013*"bescheid" + 0.013*"berg" + 0.012*"auge" + '
  '0.012*"schiff"'),
 (4,
  '0.146*"mädchen" + 0.089*"mensch" + 0.064*"wort" + 0.026*"mama" + '
  '0.017*"gefühl" + 0.016*"versteh" + 0.016*"sitz" + 0.016*"stunde" + '
  '0.016*"tag" + 0.014*"bier"'),
 (5,
  '0.078*"lust" + 0.066*"fühl" + 0.061*"seele" + 0.049*"schnee" + 0.037*"fall" '
  '+ 0.036*"wärme