# Word and Sentence Count

- How long are texts?
- How complex?
- Who has the biggest share in a conversation?

# Reading in data

In [None]:
import pandas as pd

In [None]:
# Load the sample CSV; index_col=0 uses the file's first column as the row index
path = './data/parliamentary-questions_2023_sample.csv'
data = pd.read_csv(path, index_col=0)

In [None]:
data

# Word Count

In [None]:
# Pick the first entry of the 'answer_text' column as a worked example
sample_answer = data['answer_text'].values[0]
sample_answer

In [None]:
print(sample_answer)

In [None]:
# Counting words: split() without arguments splits on any run of whitespace
# (spaces, tabs, newlines); punctuation stays attached to the neighbouring word
words = sample_answer.split()
words

In [None]:
len(words)

In [None]:
# Defining a function to count words
def count_words(text):
    """Return the number of whitespace-separated words in *text*.

    Values that are not strings (for example missing entries) cannot be
    split, so ``None`` is returned for them instead of a count.
    """
    # Guard clause: anything that is not a string has no word count
    if not isinstance(text, str):
        return None
    tokens = text.split()
    return len(tokens)

In [None]:
# Count the number of words for all answers in the dataset.
# Entries that are not strings get None from count_words instead of a count.
data['word_count'] = data['answer_text'].apply(count_words)

In [None]:
data['word_count'].describe()

In [None]:
# Summary statistics of the word count for each value of 'document_type'
# NOTE(review): the variable name says "institution" but the grouping key is
# 'document_type' — confirm which one is intended
word_count_by_institution = data.groupby('document_type')['word_count'].describe()
print(word_count_by_institution)

# Sentence Count

We are using the Natural Language Toolkit, [nltk](https://www.nltk.org/).


In [None]:
import nltk

# The sentence tokenizer needs the Punkt models. Older NLTK releases load them
# from the 'punkt' resource, while recent releases (3.9+) look for 'punkt_tab'
# and raise a LookupError if only 'punkt' is present. Download both so the
# notebook runs regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Split the sample text into sentences
from nltk import sent_tokenize

# Replace line breaks with spaces before tokenizing
# (presumably so hard line wraps do not influence the sentence split — TODO confirm)
sample_answer = sample_answer.replace('\n', ' ')
sentences = sent_tokenize(sample_answer)
sentences

In [None]:
# Function to count the number of sentences of a text
def count_sentences(text):
    """Return how many sentences ``sent_tokenize`` finds in *text*.

    Values that are not strings (for example missing entries) cannot be
    tokenized, so ``None`` is returned for them instead of a count.
    """
    if isinstance(text, str):
        return len(sent_tokenize(text))
    return None

In [None]:
# Replace line breaks with spaces before tokenizing, the same preprocessing
# that was applied to the single sample answer. Note that this overwrites the
# 'answer_text' column in place.
# regex=False makes the literal (non-regex) match explicit, so the result does
# not depend on the default of the installed pandas version.
data['answer_text'] = data['answer_text'].str.replace('\n', ' ', regex=False)
# Entries that are not strings get None from count_sentences instead of a count.
data['sentence_count'] = data['answer_text'].apply(count_sentences)

In [None]:
data.sentence_count

In [None]:
data.sentence_count.describe()

In [None]:
# Summary statistics of the sentence count for each value of 'document_type'
# NOTE(review): the variable name says "institution" but the grouping key is
# 'document_type' — confirm which one is intended
sentences_count_by_institution = data.groupby('document_type')['sentence_count'].describe()
print(sentences_count_by_institution)

# Further Resources

Texts vary a lot. While NLTK's `sent_tokenize` covers many use cases for English texts, depending on your needs a custom sentence tokenizer might be a more accurate and better option.

Customizable Tokenizer: [PunktSentenceTokenizer](https://www.nltk.org/api/nltk.tokenize.PunktSentenceTokenizer.html)


