# Datenjournalismus in Python - 
# Eine praktische Einführung in die Programmierung


### Natalie Widmann




Wintersemester 2022 / 2023


Universität Leipzig





# Projektpräsentation - 26. Januar 2023

- insgesamt 11 Teams mit 8 unterschiedlichen Projekten
- pro Team 5 bis max. 7 Minuten
- Projektpräsentation gibt 10 von 40 Punkten
- Inhalte
    - Projektbeschreibung und aktueller Stand
    - Was ist die wichtigste Zeile oder der wichtigste Absatz im Code und warum?
    - Was war die größte Herausforderung? Wie habt ihr diese gelöst?
- Slides bis zum 25.1. um 12 Uhr an natalie_widmann@posteo.net



# Vorlesung 11 - DataFrames & Textanalyse

 ### Inhalte
 
 - Daten zusammenfügen
 - Quiz
 - Textanalyse
 - Kursevaluation



# Teil 1: Daten aneinanderfügen


![DataFrames mit concat aneinanderfügen](../imgs/concat.png)

In [None]:
import pandas as pd

# Three demo frames with the identical columns A-D. Every cell holds the column
# letter followed by a running row number (0-3, 4-7 and 8-11), so it is easy to
# see where a value ends up after the frames have been combined.
df1 = pd.DataFrame({col: [f"{col}{row}" for row in range(0, 4)] for col in "ABCD"})

df2 = pd.DataFrame({col: [f"{col}{row}" for row in range(4, 8)] for col in "ABCD"})

df3 = pd.DataFrame({col: [f"{col}{row}" for row in range(8, 12)] for col in "ABCD"})


In [None]:
df2

In [None]:
result = pd.concat([df1, df2, df3])

In [None]:
result

In [None]:
# Renumber the rows 0..n-1. drop=True discards the repeated 0-3 labels inherited
# from the three source frames instead of storing them in an extra "index"
# column. Re-assigning (rather than inplace=True) keeps the cell safe to run
# more than once; the in-place variant adds another column on every run and
# eventually fails because the generated column name already exists.
result = result.reset_index(drop=True)

In [None]:
result

### Als Spalten aneinanderfügen

In [None]:
result = pd.concat([df1, df2, df3], axis=1)

In [None]:
result

### DataFrames mit unterschiedlichen Spalten zusammenfügen

In [None]:
# Demo frame with the columns B, D and F; every cell is the column letter
# followed by one of the row numbers 2, 3, 6, 7.
df4 = pd.DataFrame({col: [f"{col}{row}" for row in (2, 3, 6, 7)] for col in "BDF"})


In [None]:
df1

In [None]:
df4

In [None]:
# Stack df4 below df1 (axis=0). With the default join='outer' the result keeps
# the union of all columns; cells a frame has no column for become NaN.
result = pd.concat([df1, df4], axis=0)
result

In [None]:
# join='inner' keeps only the columns that exist in every frame being stacked.
result = pd.concat([df1, df4], axis=0, join='inner')
result

## Datensätze zusammensetzen

In [None]:
import pandas as pd

# Key figures for three cities; position i in every list belongs to city i.
# 'GemeindeID' is stored as strings, not numbers.
d1 = {'Stadt': ['Dresden', 'Leipzig', 'Chemnitz'],
      'Einwohner': [550000, 600000, 240000],
      'Bürgermeister': ['Dirk Hilbert', 'Burkhard Jung', 'Sven Schultze'],
      'GemeindeID': ['14612000', '14713000', '14511000']
     }
df1 = pd.DataFrame(d1)

# Second data set with a partly different set of cities: only 'Chemnitz' occurs
# in both frames, and 'Stadt' is the only column name they share.
df2 = pd.DataFrame({'Stadt': ['Zwickau', 'Chemnitz', 'Görlitz'],
                    'Bevölkerungsdichte': [844, 1100, 822]
                   })

In [None]:
df1

In [None]:
# concat only stacks the rows: nothing is matched on 'Stadt', so a city that is
# present in both frames shows up twice and the missing columns are NaN.
df = pd.concat([df1, df2])
df

### Pandas `merge` 

In [None]:
# merge without arguments performs an inner join on the column names both frames
# have in common, so only rows whose key occurs in df1 AND df2 are kept.
df = df1.merge(df2)
df

In [None]:
# how='outer' keeps the rows of both frames; values a frame cannot supply are NaN.
df = df1.merge(df2, how='outer')
df

In [None]:
# how='left' keeps every row of df1 and adds the df2 values only where the key matches.
df = df1.merge(df2, how='left')
df

### DataFrame auf speziellen Spalten zusammenführen

In [None]:
import pandas as pd

# City data for the "merge on specific columns" examples. Unlike a frame with
# only 'Stadt' in common, df2 here also has a 'Bürgermeister' column, so the two
# frames share two column names and the join key has to be chosen deliberately.
d1 = {'Stadt': ['Dresden', 'Leipzig', 'Chemnitz'],
      # Leipzig corrected from 60000 (a zero was missing) to match the value
      # used for the same city earlier in the notebook.
      'Einwohner': [550000, 600000, 240000],
      'Bürgermeister': ['Dirk Hilbert', 'Burkhard Jung', 'Sven Schultze'],
      'GemeindeID': ['14612000', '14713000', '14511000']
     }
df1 = pd.DataFrame(d1)

# NOTE(review): 'Dirk Hilbert' is listed for Görlitz as well as for Dresden -
# presumably on purpose, so that a merge on 'Bürgermeister' produces a match
# across two different cities; confirm.
df2 = pd.DataFrame({'Stadt': ['Zwickau', 'Chemnitz', 'Görlitz'],
                    'Bevölkerungsdichte': [844, 1100, 822],
                    'Bürgermeister': ['Constance Arndt', 'Sven Schultze', 'Dirk Hilbert'],
                   })


In [None]:
df2

In [None]:
# Inner join on the mayor's name instead of the city. 'Stadt' exists in both
# frames but is not the key, so pandas keeps both versions apart with the
# suffixes _x (from df1) and _y (from df2).
df = df1.merge(df2, on='Bürgermeister')
df

In [None]:
# Outer join on 'Stadt'. The non-key column 'Bürgermeister' exists in both
# frames and therefore appears twice, as Bürgermeister_x and Bürgermeister_y.
df = df1.merge(df2, on='Stadt', how='outer')
df

In [None]:
# Outer join on city AND mayor: two rows are combined only if both values agree.
# Both shared columns are keys now, so no suffixed duplicate columns are created.
df = df1.merge(df2, on=['Stadt', 'Bürgermeister'], how='outer')
df

# Quiz Time

https://ahaslides.com/8QTDO

![QR Quiz](../imgs/qr_quiz.png)

# Teil 2: Textanalyse

## Wahlprogramme zur Bundestagswahl 2021 

In [None]:
path = '../data/wahlprogramme/fdp.txt'
# Name the encoding explicitly. Without it open() uses the platform's default
# encoding, which can garble umlauts and typographic quotes on some systems.
# NOTE(review): assumes the programme files are stored as UTF-8 - confirm.
with open(path, encoding='utf-8') as f:
    fdp_text = f.read()

In [None]:
fdp_text

### Alle Wahlprogramme einlesen

In [None]:
# Print the name of every entry in the programme folder
import os

path = '../data/wahlprogramme/'
for entry in os.listdir(path):
    print(entry)


In [None]:
# Read every election programme in the folder into a list of strings
path = '../data/wahlprogramme/'
programs = []
for file_name in os.listdir(path):
    # Only the .txt files are programmes; hidden files or sub-folders that may
    # sit in the same directory would otherwise make open()/read() fail.
    if not file_name.endswith('.txt'):
        continue
    # NOTE(review): assumes the files are stored as UTF-8 - confirm.
    with open(os.path.join(path, file_name), encoding='utf-8') as f:
        programs.append(f.read())

In [None]:
programs

In [None]:
# Read every programme and store it together with its party name.
# os.listdir() returns the entries in arbitrary order, so the position of a
# party in `programs` is not stable across machines.
path = '../data/wahlprogramme/'
programs = []
for file_name in os.listdir(path):
    # Only the .txt files are programmes; skip hidden files and sub-folders.
    if not file_name.endswith('.txt'):
        continue
    # The party is the lower-cased file name without its '.txt' extension
    party = file_name.replace('.txt', '').lower()
    # NOTE(review): assumes the files are stored as UTF-8 - confirm.
    with open(os.path.join(path, file_name), encoding='utf-8') as f:
        program = f.read()
    programs.append({'party': party, 'text': program})

In [None]:
programs

In [None]:
import pandas as pd
df = pd.DataFrame(programs)

In [None]:
df

## Text säubern

In [None]:
fdp_text

In [None]:
fdp_text.replace('\n', '')

In [None]:
def clean_text(text):
    """Return *text* without surrounding whitespace and with line breaks as spaces.

    Leading and trailing whitespace is stripped first; every remaining newline
    character is then replaced by a single space.
    """
    return text.strip().replace('\n', ' ')

In [None]:
clean_text(fdp_text)

### Auf alle Wahlprogramme im Dataframe anwenden

In [None]:
df['clean_text'] = df['text'].apply(clean_text)

In [None]:
df

## Textanalyse

Für die Textanalyse verwenden wir die Bibliothek `nltk` (Natural Language Toolkit).

In [None]:
!pip install nltk

import nltk
nltk.download('punkt')

### Anzahl der Sätze und Anzahl der Wörter

In [None]:
from nltk import sent_tokenize

# The programmes are German, so select the German Punkt model explicitly;
# sent_tokenize defaults to language='english'.
sentences = sent_tokenize(fdp_text, language='german')

In [None]:
sentences

In [None]:
len(sentences)

In [None]:
def count_sentences(text, language='german'):
    """Return the number of sentences in *text*.

    Args:
        text: The text to split into sentences.
        language: Name of the Punkt sentence model passed to ``sent_tokenize``.
            Defaults to ``'german'`` because the election programmes analysed
            here are German; NLTK itself would default to English.

    Returns:
        The number of sentences found by the tokenizer.
    """
    return len(sent_tokenize(text, language=language))

In [None]:
df['sentences_count'] = df['clean_text'].apply(count_sentences)

In [None]:
df

In [None]:
from nltk import word_tokenize

# word_tokenize first splits the text into sentences; use the German model for
# that step instead of the English default.
words = word_tokenize(fdp_text, language='german')

In [None]:
words

In [None]:
len(words)

#### Satzzeichen entfernen

In [None]:
satzzeichen = ['.', ',', '?', ':', ';', '!', '-']

# Keep every token that is not one of the punctuation marks listed above
clean_words = [token for token in words if token not in satzzeichen]

In [None]:
clean_words

In [None]:
len(clean_words)

In [None]:
# Funktion clean_words

def clean_words(text, exclude, language='german'):
    """Split *text* into tokens and drop every token contained in *exclude*.

    Args:
        text: The text to tokenize.
        exclude: Collection of tokens (e.g. punctuation marks) to leave out.
            The comparison is case-sensitive.
        language: Punkt model that ``word_tokenize`` uses for its internal
            sentence split. Defaults to ``'german'`` because the programmes
            are German; NLTK itself would default to English.

    Returns:
        The remaining tokens as a list, in their original order and casing.
    """
    return [
        token
        for token in word_tokenize(text, language=language)
        if token not in exclude
    ]

In [None]:
satzzeichen = ['.', ',', '?', ':', ';', '!', '-']
df['words'] = df['clean_text'].apply(clean_words, exclude=satzzeichen)
df

In [None]:
df['word_count'] = df['words'].apply(len)
df

### Häufigste Wörter

In [None]:
# Select the FDP row by party name. A positional label such as 4 depends on the
# order in which os.listdir() returned the files, and that order is arbitrary.
fdp_words = df.loc[df['party'] == 'fdp', 'words'].iloc[0]

In [None]:
from nltk.probability import FreqDist

# Select the FDP row by party name. A positional label such as 4 depends on the
# order in which os.listdir() returned the files, and that order is arbitrary.
fdp_words = df.loc[df['party'] == 'fdp', 'words'].iloc[0]
# Count how often every token occurs and list all of them, most frequent first
fdist = FreqDist(fdp_words)
fdist.most_common()

### Anpassungen der clean_words Funktion

- Weitere Satzzeichen wie `(`, `)` und `"` entfernen
- alle Wörter kleinschreiben
- Füllwörter entfernen

In [None]:
# Funktion clean_words
def clean_words(text, exclude=None, language='german'):
    """Split *text* into lower-cased tokens, dropping those in *exclude*.

    Args:
        text: The text to tokenize.
        exclude: Collection of tokens to leave out, compared against the token
            before it is lower-cased. ``None`` (the default) means "use the
            module-level ``satzzeichen`` list as it is when the function is
            called"; a default of ``exclude=satzzeichen`` would freeze the list
            that existed when the function was defined.
        language: Punkt model that ``word_tokenize`` uses for its internal
            sentence split. Defaults to ``'german'`` because the programmes
            are German; NLTK itself would default to English.

    Returns:
        The remaining tokens, lower-cased, in their original order.
    """
    if exclude is None:
        exclude = satzzeichen
    return [
        token.lower()
        for token in word_tokenize(text, language=language)
        if token not in exclude
    ]

In [None]:
satzzeichen = ['.', ',', '?', ':', ';', '!', '-', '(', ')', '"', "“", '„', '–']
df['words'] = df['clean_text'].apply(clean_words, exclude=satzzeichen)
df

In [None]:
from nltk.probability import FreqDist

# Select the FDP row by party name. A positional label such as 4 depends on the
# order in which os.listdir() returned the files, and that order is arbitrary.
fdp_words = df.loc[df['party'] == 'fdp', 'words'].iloc[0]
# Show the ten most frequent tokens
fdist = FreqDist(fdp_words)
fdist.most_common(10)

### Füllwörter entfernen

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load NLTK's German stop-word list. Note that this assignment rebinds the name
# `stopwords` from the imported corpus reader to the list of words it returned.
stopwords = stopwords.words('german')
stopwords

In [None]:
# Funktion clean_words
def clean_words(text, exclude, language='german'):
    """Split *text* into lower-cased tokens, dropping those in *exclude*.

    Args:
        text: The text to tokenize.
        exclude: Iterable of lower-case tokens (punctuation, stop words, ...)
            to leave out. Each token is lower-cased before the comparison, so
            capitalised stop words are removed as well.
        language: Punkt model that ``word_tokenize`` uses for its internal
            sentence split. Defaults to ``'german'`` because the programmes
            are German; NLTK itself would default to English.

    Returns:
        The remaining tokens, lower-cased, in their original order.
    """
    # A set makes each membership test O(1); the exclude list holds a few
    # hundred entries and is checked once per token.
    excluded = set(exclude)
    lowered_tokens = map(str.lower, word_tokenize(text, language=language))
    return [token for token in lowered_tokens if token not in excluded]

In [None]:
# Punctuation marks and symbols to drop.
# NOTE(review): the '' entry looks like a character that got lost in a
# conversion step; an empty string presumably never equals a token - confirm.
satzzeichen = ['.', ',', '?', ':', ';', '!', '-', '(', ')', '"', "“", '„', '–', '•', '', '*']
# Everything to remove: the symbols above plus the German stop words
exclude = satzzeichen + stopwords

# Re-tokenize every cleaned programme without the excluded tokens
df['words'] = df['clean_text'].apply(clean_words, exclude=exclude)
df

In [None]:
def most_common_words(words, n=20):
    """Return the *n* most frequent items of *words* as ``(item, count)`` pairs.

    The pairs are ordered from most to least frequent.
    """
    return FreqDist(words).most_common(n)

# Select the FDP row by party name. A positional label such as 4 depends on the
# order in which os.listdir() returned the files, and that order is arbitrary.
fdp_words = df.loc[df['party'] == 'fdp', 'words'].iloc[0]
most_common_words(fdp_words)

## Wortwolken

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def generate_word_clouds(freq_dict):
    """Draw a word cloud for *freq_dict* and display it.

    Args:
        freq_dict: Mapping of word -> frequency; passed unchanged to
            ``WordCloud.generate_from_frequencies``.
    """
    cloud_settings = WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    cloud_image = cloud_settings.generate_from_frequencies(freq_dict)

    # New figure per cloud, no axes around the image
    plt.figure()
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
fdp_top_words = most_common_words(fdp_words)
generate_word_clouds(dict(fdp_top_words))


In [None]:
# One word cloud per party, built from its 40 most frequent words
for party_name, party_words in zip(df['party'], df['words']):
    print(party_name)
    top_words = most_common_words(party_words, n=40)
    generate_word_clouds(dict(top_words))

## Mehrere Wörter - Bigrams und Trigrams

In [None]:
# Import only the class that is used; `from nltk.collocations import *` would
# copy every public name of that module into the notebook's namespace.
from nltk.collocations import BigramAssocMeasures

# Association measures for scoring bigram collocations. Stored under its own
# name so it is not confused with the list of bigrams built in the next cell.
# NOTE(review): nothing in the following cells reads this object - confirm it
# is still needed.
bigram_measures = BigramAssocMeasures()

In [None]:
# Select the FDP row by party name. The positional label 1 depends on the
# arbitrary order of os.listdir() and differs from the label 4 used for the
# same variable in the single-word analysis.
fdp_words = df.loc[df['party'] == 'fdp', 'words'].iloc[0]
# All pairs of directly neighbouring words
bigrams = list(nltk.bigrams(fdp_words))

fdist = FreqDist(bigrams)
fdist.most_common(20)

In [None]:
# Select the FDP row by party name. The positional label 1 depends on the
# arbitrary order of os.listdir() and differs from the label 4 used for the
# same variable in the single-word analysis.
fdp_words = df.loc[df['party'] == 'fdp', 'words'].iloc[0]
# All triples of directly neighbouring words (named for what it holds)
trigrams = list(nltk.trigrams(fdp_words))

fdist = FreqDist(trigrams)
fdist.most_common(20)

In [None]:
def most_common_words(words, n=20, ngrams=''):
    """Return the *n* most frequent words or n-grams as ``(item, count)`` pairs.

    Args:
        words: Sequence of tokens.
        n: Number of entries to return.
        ngrams: ``'bigrams'`` counts pairs of neighbouring words, ``'trigrams'``
            counts triples; any other value counts the single words.

    Returns:
        List of ``(item, count)`` pairs, most frequent first. For bigrams and
        trigrams each item is a tuple of words.
    """
    if ngrams == 'bigrams':
        items = list(nltk.bigrams(words))
    elif ngrams == 'trigrams':
        items = list(nltk.trigrams(words))
    else:
        items = words
    return FreqDist(items).most_common(n)

In [None]:
# One word cloud per party, built from its 20 most frequent bigrams
for _, row in df.iterrows():
    print(row['party'])
    top_words = most_common_words(row['words'], n=20, ngrams='bigrams')
    # n-grams come back as tuples of words; join each tuple with spaces so that
    # every entry is a single string label. Building a new dict avoids
    # overwriting list entries while looping over them and reusing the outer
    # loop variable.
    frequencies = {
        (item if isinstance(item, str) else ' '.join(item)): count
        for item, count in top_words
    }
    generate_word_clouds(frequencies)

# Kursevaluation

https://ahaslides.com/C7S01

![Kurs Evaluation](../imgs/qr_evaluation.png)