# Processing of survey answers

In [1]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import json
from typing import List

In [2]:
# Function to load data
def load_raw_data(raw_data_file: str = "raw_data.json") -> pd.DataFrame:
    """Load the raw paragraph-pair records into a flat DataFrame.

    The JSON document is flattened with ``pd.json_normalize``, so nested
    fields become dot-separated columns (for example ``from.text``).

    Args:
        raw_data_file: Path of the JSON file to read. Defaults to
            ``"raw_data.json"``, resolved against the working directory.

    Returns:
        The flattened records, with the ``from.text`` and ``to.text``
        columns cast to ``str``.

    Raises:
        KeyError: If the flattened data has no ``from.text`` or ``to.text``
            column.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(raw_data_file, encoding="utf-8") as f:
        data = json.load(f)

    df = pd.json_normalize(data)
    for text_col in ('from.text', 'to.text'):
        df[text_col] = df[text_col].astype('str')
    return df

In [3]:
# Convert the survey responses df into a df of [questionId, response, academicLevel] rows
def to_qid_res_level(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the wide survey export into one row per answered question.

    Args:
        df: Survey export with one row per respondent. It must contain the
            columns ``ResponseId``, ``RecordedDate`` and ``Qlevel``; every
            other column is treated as a question.

    Returns:
        A frame with the columns ``ResponseId``, ``qid``, ``res`` and
        ``Qlevel``. Respondents without a ``Qlevel`` value are absent because
        the final merge is an inner join.
    """
    # Rows labelled 0 and 1 carry no answers
    # (presumably extra header rows of the export -- confirm against the CSV).
    answers = df.drop(index=[0, 1])

    # ResponseId -> academic level, only for respondents who stated one
    levels = answers[['ResponseId', 'Qlevel']].dropna()

    # Keep ResponseId plus the question columns, skipping fully empty rows
    wide = answers.drop(columns=['RecordedDate', 'Qlevel']).dropna(how='all')

    # One row per (respondent, question); unanswered questions are removed
    long = wide.melt(id_vars=['ResponseId'], var_name="qid", value_name="res")
    long = long.dropna()

    # Attach the academic level to every answer
    return long.merge(levels, on='ResponseId')

In [4]:
# Convert questions where paragraphs were reversed (their qid has "-rev" appended)
# This is done by removing the "-rev" suffix from the id and inverting the likert response
def to_unswapped(df: pd.DataFrame) -> pd.DataFrame:
    """Undo the paragraph reversal of questions whose qid ends in "-rev".

    For every reversed question the "-rev" suffix is stripped from ``qid``
    and the Likert answer in ``res`` is inverted. A boolean ``was_rev``
    column records which rows were reversed in the survey.

    The work is done column-wise on a copy: the caller's frame is left
    untouched, and no row object handed out by ``DataFrame.apply`` is
    mutated (pandas does not support that).

    Args:
        df: Answers with at least the columns ``qid`` and ``res``.

    Returns:
        A new frame with the same rows and index as ``df``, plus ``was_rev``.

    Raises:
        KeyError: If a reversed question holds an answer that is not one of
            the five known Likert labels.
    """
    # Map to swap a Likert answer
    swap_likert = {
        "Strongly agree": "Strongly disagree",
        "Somewhat agree": "Somewhat disagree",
        "Neither agree nor disagree": "Neither agree nor disagree",
        "Somewhat disagree": "Somewhat agree",
        "Strongly disagree": "Strongly agree"
    }
    result = df.copy()
    is_reversed = result['qid'].str.endswith('-rev', na=False)

    # Fail loudly instead of letting an unmapped answer turn into NaN
    unknown = set(result.loc[is_reversed, 'res']) - set(swap_likert)
    if unknown:
        raise KeyError(
            f"unexpected Likert response(s): {sorted(map(str, unknown))}")

    # Only reversed rows take the stripped id and the inverted answer
    result['qid'] = result['qid'].where(~is_reversed, result['qid'].str[:-4])
    result['res'] = result['res'].where(
        ~is_reversed, result['res'].map(swap_likert))
    result['was_rev'] = is_reversed
    return result

In [5]:
# Remove duplicated responses
# In an earlier version of the survey, some users could see both the normal and reversed
# version of a question. We remove both to avoid problems.
def remove_duplicated_answers(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every answer whose (ResponseId, qid) pair occurs more than once.

    All occurrences of a repeated pair are removed, not just the extras.

    Args:
        df: Answers with at least the columns ``ResponseId`` and ``qid``.

    Returns:
        The rows of ``df`` whose (ResponseId, qid) pair is unique, with their
        original index labels.
    """
    # keep=False flags the first occurrence as well as the later ones
    is_repeated = df.duplicated(subset=['ResponseId', 'qid'], keep=False)
    return df.loc[~is_repeated]

In [6]:
# Add columns from original data
def add_measured_deltas_and_texts(df: pd.DataFrame, raw_data: pd.DataFrame) -> pd.DataFrame:
    """Join each answer with the texts and readability figures of its question.

    Args:
        df: Answers with a ``qid`` column.
        raw_data: Flattened raw records; ``_id`` holds the question id.

    Returns:
        ``df`` inner-joined on ``qid`` with the columns ``freDelta``,
        ``fkglDelta``, ``from.text``, ``to.text``, ``from.FRE``,
        ``from.FKG``, ``to.FRE`` and ``to.FKG``. Answers whose ``qid`` has no
        match in ``raw_data`` are dropped by the inner join.
    """
    # Raw column name -> shorter name used in the processed data
    short_names = {
        '_id': 'qid',
        'from.readability.fleschReadingEase': 'from.FRE',
        'from.readability.fleschKincaidGradeLevel': 'from.FKG',
        'to.readability.fleschReadingEase': 'to.FRE',
        'to.readability.fleschKincaidGradeLevel': 'to.FKG',
    }
    # Columns taken from the raw data, in the order they appear in the output
    wanted = [
        '_id',
        'freDelta',
        'fkglDelta',
        'from.text',
        'to.text',
        'from.readability.fleschReadingEase',
        'from.readability.fleschKincaidGradeLevel',
        'to.readability.fleschReadingEase',
        'to.readability.fleschKincaidGradeLevel',
    ]
    measured = raw_data[wanted]
    measured = measured.rename(columns=short_names)
    return pd.merge(df, measured, on='qid')

For all pairs of paragraphs we have the "real" readability delta stored, but for some pairs it increases and for some others it decreases.  
This makes the data confusing, since it is impossible to know what the "correct" answer is for each pair of paragraphs.  

In all cases the statement the survey asked to evaluate was *The first paragraph is more readable than the second*, which means that we implied a decrease in readability.  
Therefore, we map all pairs of paragraphs (and the corresponding survey responses) so that the "real" readability delta, measured by the Flesch–Kincaid grade level, is always a decrease (i.e. the grade-level delta is never negative). As a result, "Strongly agree" and "Somewhat agree" are always the expected survey responses.

In [7]:
# Convert all rows to be decreases in readability, so that the expected `res` is always
# "Strongly agree" (or "Somewhat agree")
# A row counts as a decrease in readability when its Flesch–Kincaid grade delta is not negative
def to_all_decreases(df: pd.DataFrame) -> pd.DataFrame:
    """Orient every row so that its ``fkglDelta`` is not negative.

    Rows with ``fkglDelta < 0`` are flipped: the ``from.*`` and ``to.*``
    columns trade places, both deltas change sign, the Likert answer is
    inverted and ``was_rev`` is toggled. All other rows are returned as they
    are.

    The flip is done column-wise on a copy: the caller's frame is left
    untouched, and no row object handed out by ``DataFrame.apply`` is
    mutated (pandas does not support that).

    Args:
        df: Answers with the columns ``res``, ``was_rev``, ``freDelta``,
            ``fkglDelta``, ``from.text``, ``to.text``, ``from.FRE``,
            ``to.FRE``, ``from.FKG`` and ``to.FKG``.

    Returns:
        A new frame with the same rows, index and column order as ``df``.

    Raises:
        KeyError: If a row that has to be flipped holds an answer that is not
            one of the five known Likert labels.
    """
    # Map to swap a Likert answer
    swap_likert = {
        "Strongly agree": "Strongly disagree",
        "Somewhat agree": "Somewhat disagree",
        "Neither agree nor disagree": "Neither agree nor disagree",
        "Somewhat disagree": "Somewhat agree",
        "Strongly disagree": "Strongly agree"
    }
    result = df.copy()
    # A NaN delta compares as False, so such rows are left as they are
    needs_flip = df['fkglDelta'] < 0

    # Fail loudly instead of letting an unmapped answer turn into NaN
    unknown = set(df.loc[needs_flip, 'res']) - set(swap_likert)
    if unknown:
        raise KeyError(
            f"unexpected Likert response(s): {sorted(map(str, unknown))}")

    # Exchange the two sides of the pair; always read from the untouched `df`
    swapped_pairs = (
        ('from.text', 'to.text'),
        ('from.FRE', 'to.FRE'),
        ('from.FKG', 'to.FKG'),
    )
    for first, second in swapped_pairs:
        result[first] = df[second].where(needs_flip, df[first])
        result[second] = df[first].where(needs_flip, df[second])

    # Flipping the pair changes the sign of both measured deltas
    for delta in ('freDelta', 'fkglDelta'):
        result[delta] = df[delta].where(~needs_flip, -df[delta])

    result['res'] = df['res'].where(~needs_flip, df['res'].map(swap_likert))
    # XOR toggles the flag on exactly the flipped rows
    result['was_rev'] = df['was_rev'].astype(bool) ^ needs_flip
    return result

In [8]:
# Process survey answers
def process_survey_answers(survey_fname):
    """Run the whole processing pipeline on one survey export.

    The CSV at ``survey_fname`` is reshaped to one row per answer, reversed
    questions are normalised, duplicated answers are removed, the measured
    readability data is joined in, and finally every row is oriented as a
    readability decrease.

    Args:
        survey_fname: Path of the survey CSV file.

    Returns:
        The fully processed answers as a DataFrame.
    """
    with open(survey_fname) as survey_file:
        answers = pd.read_csv(survey_file)

    # Reshape -> undo reversed questions -> drop duplicated answers
    deduplicated = remove_duplicated_answers(
        to_unswapped(to_qid_res_level(answers)))

    # Join the measured data, then orient every pair as a decrease
    enriched = add_measured_deltas_and_texts(deduplicated, load_raw_data())
    return to_all_decreases(enriched)

In [9]:
# Read survey data and write processed data to file
def survey_processing() -> pd.DataFrame:
    """Process both survey exports and stack the results.

    Returns:
        The processed answers of ``survey_results/survey0.csv`` followed by
        those of ``survey_results/survey1.csv``. Each part keeps its own
        index labels, so labels may repeat in the result.
    """
    processed_parts = []
    for csv_path in ('survey_results/survey0.csv', 'survey_results/survey1.csv'):
        processed_parts.append(process_survey_answers(csv_path))
    return pd.concat(processed_parts)

In [10]:
# Process all survey data
# Destination of the processed answers; the frame's index is not written out
outfname = 'survey_results_processed.csv'
survey_processing().to_csv(path_or_buf=outfname, index=False)