# Generation of Survey Questions

In [7]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import json
from typing import List

In [8]:
# Constants
# Exclusive upper bound on paragraph length (in characters); clean_data drops
# rows whose 'from.text' or 'to.text' is this long or longer.
max_len = 670
# JSON input read by load_raw_data.
raw_data_file = "raw_data.json"
# Markdown dump written by run_gen_md.
md_file = "raw_data.md"
# Survey question file written by run_gen_questions.
clean_data_questions_file = "data_clean_questions.txt"

In [9]:
# Function to load data
def load_raw_data(in_file: str) -> pd.DataFrame:
    """Load the raw change records from a JSON file into a flat DataFrame.

    The parsed JSON is flattened with ``pd.json_normalize``, so nested keys
    become dotted column names (e.g. ``from.text``).

    Args:
        in_file: Path of the JSON file to read.

    Returns:
        The flattened DataFrame, with the ``from.text`` and ``to.text``
        columns cast to ``str``.

    Raises:
        KeyError: If the data has no ``from.text`` or ``to.text`` column.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would garble or reject non-ASCII text on some systems.
    with open(in_file, encoding="utf-8") as f:
        data = json.load(f)

    df = pd.json_normalize(data)
    # Cast both text columns to str so string operations behave uniformly.
    for column in ('from.text', 'to.text'):
        df[column] = df[column].astype('str')
    return df

## Generation of Markdown file with questions

In [10]:
# Function to render data as a Markdown file
def generate_markdown(df: pd.DataFrame, fname: str):
    """Dump every change in ``df`` to a Markdown file, one section per row.

    Each section shows the repository name, the record id, the two
    readability deltas and a two-column table with the commit ids, the author
    e-mails and the text before/after the change.

    Args:
        df: Frame with the columns ``documentRepoName``, ``_id``,
            ``freDelta``, ``fkglDelta`` and, for both ``from`` and ``to``,
            ``<side>.commitId``, ``<side>.commitAuthorEmail`` and
            ``<side>.text``.
        fname: Path of the Markdown file to create (overwritten if present).
    """
    def format_delta(d) -> str:
        # Positive deltas get an explicit "+" sign; zero and negative values
        # are rendered as they are.
        return f"+{d}" if d > 0 else f"{d}"

    # The file is only written, so plain 'w' is enough. The encoding is
    # explicit because the output contains non-ASCII characters (e.g. the
    # dash in the grade-level label) and must not depend on the locale.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write("# Dump of changes\n\n")
        for _, row in df.iterrows():
            f.writelines([
                f"## Change in {row['documentRepoName']}\n",
                # Two trailing spaces force a Markdown line break.
                f"id: {row['_id']}  \n",
                f"Flesch reading ease: {format_delta(row['freDelta'])}  \n",
                f"Flesch—Kincaid grade level: {format_delta(row['fkglDelta'])}  \n",
                "\n",
                f"| **Version {row['from.commitId']}** | **Version {row['to.commitId']}** |\n",
                "| :------ | :------ |\n",
                f"| *{row['from.commitAuthorEmail']}* | *{row['to.commitAuthorEmail']}* |\n",
                "| ------------------ | ------------------ |\n",
                f"| {row['from.text']} | {row['to.text']} |\n",
                "\n",
                "\n",
            ])

In [11]:
# Run generate md
def run_gen_md():
    """Load the raw data file and render it as the Markdown dump."""
    generate_markdown(load_raw_data(raw_data_file), md_file)

run_gen_md()

## Generation of survey questions (Qualtrics TXT format)

In [12]:
# Function to clean data
def clean_data(df: pd.DataFrame, max_len: int, dirty_text: List[str]) -> pd.DataFrame:
    """Drop rows whose paragraphs are too long or contain unwanted markup.

    Args:
        df: Frame with string columns ``from.text`` and ``to.text``.
        max_len: Exclusive upper bound on the length of either text.
        dirty_text: Regular-expression fragments; a row is dropped when any of
            them matches ``from.text`` or ``to.text``. An empty list filters
            nothing.

    Returns:
        The surviving rows with a fresh 0..n-1 index; the previous index is
        kept in an ``index`` column.
    """
    from_text, to_text = df['from.text'], df['to.text']

    if dirty_text:
        # The fragments are joined into one alternation and used as a regex.
        pattern = '|'.join(dirty_text)
        is_dirty = from_text.str.contains(pattern) | to_text.str.contains(pattern)
    else:
        # An empty pattern would match every string and drop the whole frame,
        # so "no dirty markers" must be handled explicitly.
        is_dirty = pd.Series(False, index=df.index)

    is_text_ok_length = (from_text.map(len) < max_len) & (to_text.map(len) < max_len)
    return df[~is_dirty & is_text_ok_length].reset_index()

In [13]:
# Function to generate a single question
def gen_question(_id: str, text_a: str, text_b: str, swap: bool) -> str:
    """Build one multiple-choice question comparing two paragraphs.

    With ``swap`` set, the paragraphs trade places and ``-rev`` is appended
    to the question id. The returned snippet starts with a blank line and
    ends with one; the list of choices is not included.
    """
    if swap:
        question_id = f"{_id}-rev"
        first, second = text_b, text_a
    else:
        question_id = _id
        first, second = text_a, text_b

    lines = [
        "",
        "[[Question:MC]]",
        f"[[ID:{question_id}]]",
        "",
        '<div><strong><span style="font-size:16px;">Paragraph A</span></strong></div>',
        f"<div>{first}</div>",
        "<br/>",
        '<div><strong><span style="font-size:16px;">Paragraph B</span></strong></div>',
        f"<div>{second}</div>",
        "<br/><hr/><br/>",
        "<div>Paragraph <strong>A</strong> is more readable than paragraph <strong>B</strong>.</div>",
        "<br/>",
        "",
        "",
    ]
    return "\n".join(lines)
    

In [14]:
# Function to write the survey questions file (Qualtrics TXT format)
def generate_questions(df: pd.DataFrame, fname: str, q_per_block: int):
    """Write the survey (preamble plus question blocks) to a TXT file.

    Every row of ``df`` yields two questions: the pair as-is and the same
    pair with the paragraphs swapped. Both are written back to back, so they
    always land in the same block, which is why ``q_per_block`` must be even.

    Args:
        df: Frame with the columns ``_id``, ``from.text`` and ``to.text``.
        fname: Path of the file to create (overwritten if present).
        q_per_block: Number of questions per block; a new ``[[Block]]``
            starts every ``q_per_block / 2`` rows.

    Raises:
        ValueError: If ``q_per_block`` is not even, or is not positive while
            ``df`` has rows to write.
    """
    if q_per_block % 2 != 0:
        raise ValueError('q_per_block must be even')
    rows_per_block = int(q_per_block // 2)
    # An empty frame writes no blocks, so a non-positive size is harmless then.
    if rows_per_block < 1 and len(df) > 0:
        raise ValueError('q_per_block must be positive')

    # NOTE(review): the block name "Premble" below looks like a typo for
    # "Preamble"; left untouched because it is part of the generated survey.
    survey_preamble = """[[AdvancedFormat]]

[[Block:Premble]]

[[Question:Text]]
<div>Hi There. Thank you for taking the time to take part in this study.</div>

[[Question:MC]]
[[ID:Qlevel]]
First, a question about you. What is your education level?

[[Choices]]
Bachelor's degree
Master's degree
PhD
Professor

[[Block:Information]]

[[Question:Text]]
<div>This study is about text readability in the context of software engineering papers.</div>
<div>You will be presented with pairs of paragraphs, and for each pair you will have to state how much you agree with a statement regarding their readability.</div>
<br/>
<div>The <strong>readability</strong> of a text indicates how easy it is to read.</div>
<div>It is the <strong>ease of reading</strong> created by the choice of content, style, design, and organization that fit the prior knowledge, reading skill, interest, and motivation of the audience.</div>
<br/>
<div>Readability is <strong>not</strong> to be confused with legibility, which is more concerned with the visual perception and the layout of the text.</div>
"""
    survey_block_header = "[[Block]]\n"
    survey_question_choices = """[[Choices]]
Strongly agree
Somewhat agree
Neither agree nor disagree
Somewhat disagree
Strongly disagree
"""
    # Explicit UTF-8 so paragraphs with non-ASCII characters are written the
    # same way regardless of the platform's locale.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(survey_preamble)
        f.write("\n")
        # Count rows by position rather than by index label, so block
        # boundaries stay correct even if df does not carry a 0..n-1 index.
        for position, (_, row) in enumerate(df.iterrows()):
            if position % rows_per_block == 0:
                f.write("\n")
                f.write(survey_block_header)
            # Write question
            f.write(gen_question(row['_id'], row['from.text'], row['to.text'], False))
            f.write(survey_question_choices)
            # Write reversed question (guaranteed to be in the same block)
            f.write(gen_question(row['_id'], row['from.text'], row['to.text'], True))
            f.write(survey_question_choices)

        f.write("\n")

In [15]:
# Run generate clean data questions
def run_gen_questions():
    """Load the raw data, filter it and write the survey questions file."""
    # Regex fragments marking paragraphs that must be excluded. The last one
    # is a raw string so the backslash reaches the regex engine as an escaped
    # "$" instead of being an invalid string escape sequence.
    dirty_text = ['reference-type="ref"', 'style="color', '#tab:', 'smallcaps', r'\$']
    df = load_raw_data(raw_data_file)
    clean_df = clean_data(df, max_len, dirty_text)
    num_blocks = 5
    # generate_questions needs an even number of questions per block, so take
    # the ceiling of the division and round up to the next even integer. A
    # plain "/" gives a float that fails the evenness check unless the row
    # count happens to be a multiple of 2 * num_blocks.
    # NOTE(review): every row yields two questions (original and reversed),
    # so this sizing produces roughly 2 * num_blocks blocks in the file;
    # confirm that is the intended meaning of num_blocks.
    questions_per_block = -(-len(clean_df) // num_blocks)
    questions_per_block += questions_per_block % 2
    generate_questions(clean_df, clean_data_questions_file, questions_per_block)

run_gen_questions()