Imports

In [1]:
from qud_utils import createTreeFromFile, checkBigQuestion, checkLeaves
from tree_comparison import compareTwoTrees
from questions import getQuestions, getQuestionType, getQUDStack, createQuestionDF
from q_givenness import getUnknownNouns, getUnknownVerbs
from maximized_q_anaphoricity import getGivenNouns, getGivenVerbs

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

# Functions for analysing qud Trees

In [2]:
def getTextnameFromQudName(qudFilename):
    return "_".join(qudFilename.split(".")[0].split("_")[:2]) + ".txt"

In [3]:
def maxDepth(qudTree):
    return max([len(path) for path in qudTree.paths_to_leaves()])

def minDepth(qudTree):
    return min([len(path) for path in qudTree.paths_to_leaves()])

def avgDepth(qudTree):
    pathList = [len(path) for path in qudTree.paths_to_leaves()]
    return sum(pathList)/len(pathList)
    
def lenQUD(qudTree):
    return len(qudTree.leaves())

In [4]:
df = pd.DataFrame()

qudDirectory = "microText_QUD/"
microtextDirectory = "microText/"
filenames = os.listdir(qudDirectory)

df["filename"] = filenames
df["textname"] = [getTextnameFromQudName(qudFilename) for qudFilename in filenames]
df["qudTree"] = [createTreeFromFile(qudDirectory + filename) for filename in filenames]

for tree in df["qudTree"]:
    assert checkBigQuestion(tree), "The big question is not \' What is the way things are\'"
    
for tree, microTextFilename in zip(df["qudTree"], df["textname"]):
    assert checkLeaves(microtextDirectory + microTextFilename, tree), "The leaves do not represent the text segments"

df["maxDepth"] = [maxDepth(qudTree) for qudTree in df["qudTree"]]
df["minDepth"] = [minDepth(qudTree) for qudTree in df["qudTree"]]
df["avgDepth"] = [avgDepth(qudTree) for qudTree in df["qudTree"]]
df["lenQUD"] = [lenQUD(qudTree) for qudTree in df["qudTree"]]
display(df.head())

Unnamed: 0,filename,textname,qudTree,maxDepth,minDepth,avgDepth,lenQUD
0,micro_b021_andrew.txt,micro_b021.txt,"[Node(tag=What is the way things are?, identif...",6,3,4.5,4
1,micro_b053_tatjana.txt,micro_b053.txt,"[Node(tag=What is the way things are?, identif...",5,3,4.0,6
2,micro_b010_johann.txt,micro_b010.txt,"[Node(tag=What is the way things are?, identif...",5,3,4.0,5
3,micro_b061_luise.txt,micro_b061.txt,"[Node(tag=What is the way things are?, identif...",7,4,5.5,4
4,micro_b006_johann.txt,micro_b006.txt,"[Node(tag=What is the way things are?, identif...",5,3,4.0,5


In [None]:
df["lenQUD"].mean()

6.157894736842105

# Analyse Questions

In [None]:
QuestionDF = pd.DataFrame()

for tree in df["qudTree"]:
    newQuestionDF = createQuestionDF(tree)
    QuestionDF = pd.concat([QuestionDF,newQuestionDF], ignore_index=True)

Should there be tuition fees for studying in Germany?
Should state health insurance cover complementary medicine?
Should alternative treatments be subsidized?
Should penalties for dog dirt be higher?
Are higher penalties pointless in all cases?
Does Germany have the death penalty?
Does anything speak against the reason that a murderer has already decided on the life or death of another person?
Should intelligence services be regulated more tightly by parliament?
Are the disclosures of Edward Snowden relevant to Germany?
Should everyone contribute to the funding of public broadcasters?
Is it fair of landlords to raise the rent when a new tenant moves in?
Could an increase in the rent be justified?
Should the morning-after pill be sold in pharmacies?
Are pharmacists qualified enough to sell the morning-after pill?
Is the quality of the public channels good enough to be paid for?
Has the medicine been effective?
Will it bring damage to the pharma industry?
Will it bring damage to the conv

In [None]:
display(QuestionDF.head(1000))

## Question Types

In [None]:
QuestionDF["type"].value_counts()

In [None]:
QuestionDF["type"].count()

## Q-Givenness

In [None]:
nlp = spacy.load('en')

In [None]:
QuestionDF["unknownNouns"] = [getUnknownNouns(question, text, underneathText, nlp) for question, text, underneathText in zip(QuestionDF["question"], QuestionDF["priviousTexts"], QuestionDF["underneathTexts"])]

In [None]:
QuestionDF["unknownVerbs"] = [getUnknownVerbs(question, text, underneathText, nlp) for question, text, underneathText in zip(QuestionDF["question"], QuestionDF["priviousTexts"], QuestionDF["underneathTexts"])]

In [None]:
QuestionDF["unknownMaterial"] = [unknownVerbs + unknownNouns for unknownVerbs, unknownNouns in zip(QuestionDF["unknownVerbs"].tolist(), QuestionDF["unknownNouns"].tolist())]

In [None]:
QuestionDF["unknownNouns"].value_counts()

In [None]:
QuestionDF["unknownVerbs"].value_counts()

In [None]:
QuestionDF["unknownMaterial"].value_counts()

## Maximized Q-Anaphoricity

In [None]:
QuestionDF["givenNounsCount"] = [getGivenNouns(question, text, underneathText, nlp)[0] for question, text, underneathText in zip(QuestionDF["question"], QuestionDF["priviousTexts"], QuestionDF["underneathTexts"])]
QuestionDF["givenNouns"] = [getGivenNouns(question, text, underneathText, nlp)[1] for question, text, underneathText in zip(QuestionDF["question"], QuestionDF["priviousTexts"], QuestionDF["underneathTexts"])]
    
QuestionDF["givenVerbsCount"] = [getGivenVerbs(question, text, underneathText, nlp)[0] for question, text, underneathText in zip(QuestionDF["question"], QuestionDF["priviousTexts"], QuestionDF["underneathTexts"])]
QuestionDF["givenVerbs"] = [getGivenVerbs(question, text, underneathText, nlp)[1] for question, text, underneathText in zip(QuestionDF["question"], QuestionDF["priviousTexts"], QuestionDF["underneathTexts"])]

In [None]:
grouped = QuestionDF[QuestionDF["question"] != "What is the way things are?"].groupby(['underneathTexts', 'priviousTexts'])
grouped.head()

In [None]:
# print all questions that have a partner that covers the same span
columns = ["question1", "question2", "priviousTexts", "underneathTexts", "type1", "type2", "givenNounsCount1", "givenNounsCounts2", "givenNouns1","givenNouns2", "givenVerbsCount", "givenVerbs"]
ComparisonDF = pd.DataFrame(columns=columns)

for name, group in grouped:
    if (len(group)==2):
        print("========")
        print(name)
        display(group.head())

In [None]:
ComparisonDF.head()

In [None]:
for n in range(1,3):
    