# Content Analysis

- What are the parliamentary questions about?
- What are the most common recurring topics?
- Which documents talk about a specific topic?

# Reading in data

In [None]:
import pandas as pd

In [None]:
# Location of the sample CSV, relative to the notebook's working directory
path = './data/parliamentary-questions_2023_sample.csv'
# index_col=1 makes the second CSV column (0-based position 1) the row index
data = pd.read_csv(path, index_col=1)

In [None]:
data

# Preparation

In [None]:
# Join answers and questions texts together to get all context
# Missing questions/answers are replaced by '' first; otherwise the
# concatenation below would yield NaN for any row where one text is absent
data[['question_text', 'answer_text']] = data[['question_text', 'answer_text']].fillna(value='')
# Question first, then the answer, separated by ' -- '
data['text'] = data['question_text'].str.cat(data['answer_text'], sep=' -- ')

In [None]:
data.text

# Most frequent Words

We are using the Natural Language Toolkit, also called `nltk`.

More information: https://www.nltk.org/


In [None]:
!pip install nltk

In [None]:
from nltk.probability import FreqDist

FreqDist(['A', 'B', 'A', 'C', 'C', 'C', 'D'])

In [None]:
# Get a sample text
sample = data.text.values[10]
sample

In [None]:
# Get the words of a sample text
sample.split()

In [None]:
# Get the most common words in the sample text
fdist = FreqDist(sample.split())
fdist.most_common()

# Cleaning the text

- write all words in lowercase
- remove punctuation . , ( )
- remove possessive `’s`
- remove stopwords (and, this, to, in, the)

In [None]:
import re

# Characters that are deleted from the text entirely
punctuation = ['.', ',', '?', ':', ';', '!', '-', '(', ')', '"', "“", '„', '–']

# Possessive written with a plain ASCII apostrophe. It must directly follow a
# word character and end the word, so an opening single quote in front of a
# word starting with "s" (e.g. 'special') is left untouched.
_ASCII_POSSESSIVE = re.compile(r"(?<=\w)'s\b")


def clean_text(text):
    """Normalise *text* for word counting.

    Steps, in order:
      1. lowercase everything,
      2. drop the possessive suffix (typographic ``’s`` and ASCII ``'s``),
      3. turn newlines into spaces,
      4. delete every character listed in ``punctuation``.

    Args:
        text: The raw text.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed characters
        may leave consecutive spaces behind.
    """
    cleaned = text.lower()
    cleaned = cleaned.replace('’s', '')
    cleaned = _ASCII_POSSESSIVE.sub('', cleaned)
    cleaned = cleaned.replace('\n', ' ')
    for punct_char in punctuation:
        cleaned = cleaned.replace(punct_char, '')
    return cleaned

In [None]:
clean_sample = clean_text(sample)
clean_sample.split()

## Remove stopwords

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

In [None]:
stopwords_en = stopwords.words('english')
stopwords_en

In [None]:
# Stopwords in other languages
print(stopwords.fileids())

In [None]:
# Remove English stopwords from a text
def remove_stopwords(text):
    """Return *text* without English stopwords.

    The text is split on whitespace. A word is dropped when its lowercase form
    appears in the module-level ``stopwords_en`` list; all other words keep
    their original casing and order.

    Args:
        text: The text to filter.

    Returns:
        The remaining words joined by single spaces.
    """
    # Build the set once per call: testing membership against the list would
    # scan the whole list again for every single word.
    stopword_set = set(stopwords_en)
    return ' '.join(
        [word for word in text.split() if word.lower() not in stopword_set]
    )

In [None]:
# Clean the sample first (lowercase, punctuation removed) and only then drop
# the stopwords, mirroring the two steps applied to the whole dataset.
# Passing the raw `sample` would discard the cleaning and leave tokens such as
# capitalised or punctuated words in the frequency count.
clean_sample = remove_stopwords(clean_text(sample))
clean_sample

In [None]:
# get most common words in the sample text
fdist = FreqDist(clean_sample.split())
fdist.most_common()

## Applied to all data

In [None]:
data['clean_text'] = data['text'].apply(clean_text)
data['clean_text'] = data['clean_text'].apply(remove_stopwords)

In [None]:
def get_most_frequent_words(text, n=50):
    """Return the *n* most frequent whitespace-separated words of *text*.

    Args:
        text: The text to analyse; it is split on whitespace as is.
        n: Maximum number of words to return. Defaults to 50.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    return FreqDist(text.split()).most_common(n)

In [None]:
get_most_frequent_words(data.clean_text.values[10])

In [None]:
# Get the most important keywords for all texts
data['keywords'] = data['clean_text'].apply(get_most_frequent_words)

In [None]:
data.keywords

## Word Clouds

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def generate_word_clouds(freq_dict):
    """Draw a word cloud for a word -> frequency mapping and show it.

    Args:
        freq_dict: Mapping passed to ``WordCloud.generate_from_frequencies``.
    """
    cloud_builder = WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    rendered_cloud = cloud_builder.generate_from_frequencies(freq_dict)

    # Show the cloud as an image in a fresh figure, without axis ticks
    plt.figure()
    plt.imshow(rendered_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
generate_word_clouds(dict(data.keywords.values[20]))


## Multiple words - bigrams and trigrams

In [None]:
from nltk.collocations import *
# Association measures for scoring bigram collocations. Kept under its own
# name so it is not overwritten by the `bigrams` list built in the next cell.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
bigrams = list(nltk.bigrams(data.clean_text.values[10].split()))

fdist = FreqDist(bigrams)
fdist.most_common(10)

In [None]:
data.keywords

# Search for documents by keyword

In [None]:
def get_related_documents(data, keyword, cols):
    """Return the documents whose keywords contain *keyword*, best match first.

    Args:
        data: DataFrame with a ``keywords`` column holding, per row, an
            iterable of ``(word, frequency)`` pairs.
        keyword: The word to look for; it is lowercased before matching.
        cols: Columns of *data* to include in the result.

    Returns:
        The matching rows of ``data[cols]``, ordered by the keyword's
        frequency in descending order. Rows with equal frequency keep their
        original relative order. The result is empty if nothing matches.
    """
    keyword = keyword.lower()

    # Collect (frequency, row position) for every document that has the keyword.
    # Row positions are used instead of index labels so that repeated index
    # labels cannot pull in extra copies of rows when selecting below.
    relevancy_scores = []
    for position, document_keywords in enumerate(data['keywords']):
        frequency = dict(document_keywords).get(keyword)
        if frequency is not None:
            relevancy_scores.append((frequency, position))

    # Sort on the frequency only; the sort is stable, so ties never require
    # comparing anything but the frequencies.
    relevancy_scores.sort(key=lambda score: score[0], reverse=True)

    document_positions = [position for _, position in relevancy_scores]
    return data[cols].iloc[document_positions]

In [None]:
cols = ['document_title','keywords', 'text']
get_related_documents(data, 'education', cols)

# Further Resources

## TF-IDF

TF-IDF (Term Frequency-Inverse Document Frequency) is a measure for identifying keywords that are specific to a document in the context of the entire document collection.

Tutorial by Melanie Walsh: https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html


## Topic Modeling 

Topic modeling is a statistical approach to grouping documents based on their content.

Tutorial by Shashank Kapadia: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0