In [4]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Logging configuration (optional, intended for gensim's log output).
# The root logger is set to ERROR level, so only records of level ERROR or
# higher are emitted, formatted as "<time> : <level> : <message>".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Suppress DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [6]:
import nltk
# German stopword list taken from NLTK's stopwords corpus; remove_stopwords()
# further down filters tokens against this module-level name.
# NOTE(review): this presumably requires the corpus to be installed locally
# (e.g. via nltk.download('stopwords')) — confirm on a fresh environment.
stopwords = nltk.corpus.stopwords.words('german')

In [7]:
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
# Directories holding the XML song files, one per sub-corpus.
# The second "/"-separated path component ("BRD-Charts", "DDR", "NDW") is parsed
# by the loading cell to derive each song's category, so the layout must stay
# "/"-separated.
frg_path = "Songs/BRD-Charts/1_xml/"
gdr_path = "Songs/DDR/1_xml/"
ndw_path = "Songs/NDW/1_xml/"

# os.listdir() returns directory entries in arbitrary order. The file order
# determines the row order of the DataFrame built from these lists, and a
# seeded .sample(random_state=42) is drawn from that DataFrame later on; the
# seed only yields the same selection if the input order is stable. Sorting the
# listing makes the whole pipeline reproducible across machines and runs.
xml_frg = [frg_path + f for f in sorted(os.listdir(frg_path)) if f.endswith('.xml')]
xml_gdr = [gdr_path + f for f in sorted(os.listdir(gdr_path)) if f.endswith('.xml')]
xml_ndw = [ndw_path + f for f in sorted(os.listdir(ndw_path)) if f.endswith('.xml')]

# One list of file paths per sub-corpus, in the order FRG, GDR, NDW.
all_xml = [xml_frg, xml_gdr, xml_ndw]

In [8]:
# Parse every XML song file into one record (category, year, title, author,
# lyrics) and collect the records in a DataFrame.
data = []
for category in all_xml:
    for xml_file in category:
        with open(xml_file, 'r', encoding='utf-8') as file:
            xml_data = file.read()
        soup = BeautifulSoup(xml_data, 'xml')

        # Category is the part of the second path component before the first
        # "-", e.g. "Songs/BRD-Charts/1_xml/x.xml" -> "BRD".
        corpus_dir = xml_file.split("/")[1]
        cat = corpus_dir.split("-")[0]

        # Metadata comes from the first <date>, <title> and <author> elements.
        year = int(soup.find("date").text)
        title = soup.find("title").text
        author = soup.find("author").text

        # Lyrics: all <l> elements inside <div1 type="song">, joined by spaces.
        song_div = soup.find("div1", attrs={"type":"song"})
        text = " ".join(line.text for line in song_div.find_all("l"))

        data.append(dict(Category=cat, Year=year, Title=title, Author=author, Text=text))

df = pd.DataFrame(data)
        

In [9]:
# Only songs dated 1970 to 1990 (both bounds inclusive) are kept.
in_period = df['Year'].between(1970, 1990)

# One boolean mask per category, each restricted to that period.
ndw_filter = df["Category"].eq("NDW") & in_period
brd_filter = df["Category"].eq("BRD") & in_period
ddr_filter = df["Category"].eq("DDR") & in_period

# Draw a seeded sample of 202 NDW songs and append it to the BRD songs.
ndw_sub = df[ndw_filter].sample(n=202, random_state=42)
brd_sub = pd.concat([df[brd_filter], ndw_sub], ignore_index=True)

# Final frame: DDR songs first, followed by the combined BRD + NDW songs.
# NOTE: `df` is overwritten here, so re-running this cell without re-running the
# loading cell operates on the already-filtered frame.
df = pd.concat([df[ddr_filter], brd_sub], ignore_index=True)

# The NDW category is renamed to BRD because those songs belong to the BRD.
df['Category'] = df['Category'].replace('NDW', 'BRD')
# The remaining DataFrame now consists of 1000 songs from the GDR (1970-1990)
# and 1000 songs from the FRG (1970-1990).

In [13]:
def sent_to_words(sentences):
    """Lazily tokenize each element of ``sentences``.

    Every element is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``; one token list is yielded per element.
    """
    for raw_sentence in sentences:
        # `deacc` is not passed, so accented characters such as German umlauts
        # are kept as-is (see e.g. 'früher' in the printed sample).
        tokens = gensim.utils.simple_preprocess(str(raw_sentence))
        yield tokens

# Tokenize the lyrics of every song; one token list per row of df.
data_words = list(sent_to_words(df["Text"]))

# Show the tokens of the first song as a sanity check.
print(data_words[:1])

[['kennst', 'du', 'das', 'land', 'mit', 'seinen', 'alten', 'eichen', 'das', 'land', 'von', 'einstein', 'von', 'karl', 'marx', 'und', 'bach', 'wo', 'jede', 'antwort', 'endet', 'mit', 'dem', 'fragezeichen', 'wo', 'ich', 'ein', 'zimmer', 'habe', 'unterm', 'dach', 'wo', 'sich', 'so', 'viele', 'wegen', 'früher', 'oft', 'noch', 'schämen', 'wo', 'mancher', 'vater', 'eine', 'frage', 'nicht', 'versteht', 'wo', 'ihre', 'kinder', 'ihnen', 'das', 'nicht', 'übelnehmen', 'weil', 'seine', 'antwort', 'im', 'geschichtsbuch', 'steht', 'hier', 'schaff', 'ich', 'selber', 'was', 'ich', 'einmal', 'werde', 'hier', 'geb', 'ich', 'meinem', 'leben', 'einen', 'sinn', 'hier', 'hab', 'ich', 'meinen', 'teil', 'von', 'unsrer', 'erde', 'der', 'kann', 'so', 'werden', 'wie', 'ich', 'selber', 'bin', 'das', 'ist', 'das', 'land', 'mit', 'seinen', 'seen', 'und', 'wäldern', 'das', 'kleine', 'land', 'das', 'man', 'an', 'einem', 'tag', 'durchfährt', 'wo', 'man', 'was', 'wird', 'auch', 'ohne', 'seine', 'eltern', 'doch', 'auch'

In [19]:
# Build the bigram and trigram models.
# The trigram model is trained on the bigram-transformed token lists, so it can
# join an already detected bigram with a neighbouring token.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50) # higher threshold -> fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=50)  

# Wrap the trained models in Phraser objects, which are used below (and in
# make_bigrams / make_trigrams) to transform token lists.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Bigram example: apply the bigram model to the second document (index 1).
print(bigram_mod[data_words[1]])

['abends', 'wartest', 'du', 'ganz', 'zufällig', 'am', 'bus', 'du', 'weißt', 'genau', 'zu', 'welcher', 'zeit', 'ich', 'kommen', 'muss', 'du', 'legst', 'dein', 'lächeln', 'auf', 'und', 'tust', 'immer', 'vertraut', 'dass', 'jeder', 'denken', 'muss', 'ich', 'wäre', 'deine', 'braut', 'deine', 'braut', 'abends', 'kommst', 'du', 'an', 'und', 'lädst', 'dich', 'selber', 'ein', 'und', 'immer', 'mit', 'der', 'flasche', 'wein', 'sprichst', 'große', 'dinge', 'und', 'machst', 'es', 'dir', 'bequem', 'und', 'kannst', 'dann', 'gar', 'nicht', 'lachen', 'wenn', 'ich', 'dann', 'sage', 'du', 'sollst', 'geh', 'du', 'sollst', 'geh', 'hohoho', 'du', 'kommst', 'nicht', 'in', 'mein', 'bett', 'hohoho', 'machst', 'du', 'auch', 'höflich', 'und', 'nett', 'hohoho', 'alles', 'routine', 'ich', 'hab', 'dich', 'erkannt', 'du', 'kommst', 'nicht', 'in', 'mein', 'bett', 'vor', 'deinen', 'freunden', 'drehst', 'du', 'und', 'spielst', 'den', 'king', 'als', 'wär', 'ich', 'nur', 'ein', 'kleiner', 'fisch', 'der', 'an', 'dir', 'h

In [27]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize every document in ``texts`` and drop stopwords.

    Each document is converted with ``str()`` and run through
    ``simple_preprocess`` before filtering, so both raw strings and lists of
    tokens are accepted as documents.

    Args:
        texts: Iterable of documents.

    Returns:
        A list with one token list per document, containing only tokens that
        are not in the module-level ``stopwords`` collection.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the stopword list for every single token.
    stopword_set = set(stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs



def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (a sequence of tokens) is joined with single spaces and
    processed by the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called. For every resulting token whose coarse
    part-of-speech tag (``token.pos_``) is in ``allowed_postags``, the
    lower-cased lemma is kept.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of POS tags to keep. The default is an
            immutable tuple so it cannot be modified between calls.

    Returns:
        A list with one list of lower-cased lemmas per input document, in the
        same order as ``texts``.
    """
    # nlp.pipe() processes the texts in batches and yields the Doc objects in
    # input order, which avoids the per-call overhead of nlp(text) in a loop.
    joined_texts = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_.lower() for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_texts)
    ]

In [28]:
# Remove Stop Words (German NLTK list) from the tokenized songs
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams: apply the trained bigram model to the stopword-free tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the German spaCy pipeline with the parser and NER components disabled.
# lemmatization() reads this module-level `nlp`, so it must be defined before
# the call below.
nlp = spacy.load('de_core_news_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first song as a sanity check
print(data_lemmatized[:1])

[['kennst', 'alt', 'eich', 'land', 'einstein', 'antwort', 'enden', 'fragezeichen', 'zimmer', 'dach', 'früh', 'oft', 'schämen', 'vater', 'frage', 'verstehen', 'kind', 'übelnehmen', 'antwort', 'geschichtsbuch', 'stehen', 'schaff', 'selber', 'geb', 'leben', 'sinn', 'teil', 'unsr', 'erde', 'selber', 'seen', 'klein', 'land', 'tag', 'durchfähren', 'eltern', 'beziehung', 'manchmal', 'wert', 'stehen', 'schule', 'klassenzimmer', 'riechen', 'heut', 'immer', 'terpentin', 'heut', 'schimm', 'lehrer', 'mütze', 'ziehn', 'schaff', 'selber', 'geb', 'leben', 'sinn', 'teil', 'unsr', 'erde', 'selber', 'land', 'fabrik', 'gehören', 'schon', 'aufstehen', 'tisch', 'hören', 'trotzdem', 'gehen', 'wohnungsämtern', 'hoffnung', 'verlieren', 'parteitag', 'darüber', 'sorgen', 'machen', 'leute', 'selber', 'reparieren', 'werkzeug', 'wissen', 'macht', 'schaff', 'selber', 'geb', 'leben', 'sinn', 'teil', 'unsr', 'erde', 'selber', 'land', 'problem', 'züge', 'stoppen', 'fenster', 'klirren', 'reden', 'spät', 'kind', 'spiele

In [36]:
# Create Dictionary: assigns an integer id to every distinct lemma
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus: the lemmatized documents are the input texts
texts = data_lemmatized

# Term Document Frequency: each document becomes a list of (token_id, count) pairs.
# These ids are tied to the current state of id2word; the corpus must be
# rebuilt if the dictionary is pruned afterwards.
corpus = [id2word.doc2bow(text) for text in texts]

# View the bag-of-words representation of the first document
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 3), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 5), (32, 3), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 3), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 7), (58, 3), (59, 1), (60, 1), (61, 1), (62, 3), (63, 1), (64, 1), (65, 3), (66, 1), (67, 1), (68, 1), (69, 1), (70, 3), (71, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1)]]


In [37]:
# Vocabulary size before pruning.
print('Total Vocabulary Size:', len(id2word))

# Prune the dictionary in place: drop tokens that appear in fewer than 4
# documents or in more than 95% of all documents.
id2word.filter_extremes(no_below=4, no_above=0.95)

# Vocabulary size after pruning.
print('Total Vocabulary Size:', len(id2word))

# filter_extremes() closes the gaps left by removed tokens, so the same word can
# have a different id afterwards. The bag-of-words corpus built before pruning
# would therefore point at wrong (or missing) dictionary entries; rebuild it
# from the same texts so corpus and id2word agree again.
corpus = [id2word.doc2bow(text) for text in texts]


Total Vocabulary Size: 13788
Total Vocabulary Size: 2745
