# Processing of survey answers

In [3]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import json
from typing import List

In [4]:
# Function to load data
def load_raw_data(raw_data_file: str = "raw_data.json") -> pd.DataFrame:
    """Load the raw JSON data and flatten it into a DataFrame.

    The parsed JSON is passed through ``pd.json_normalize``, so nested keys
    become dotted column names (e.g. ``from.text``).

    Args:
        raw_data_file: Path of the JSON file to read. Defaults to
            ``"raw_data.json"``, resolved against the current working
            directory.

    Returns:
        The flattened data, with the ``from.text`` and ``to.text`` columns
        cast to ``str``.

    Raises:
        KeyError: If the normalized data has no ``from.text`` or ``to.text``
            column.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when decoding the file.
    with open(raw_data_file, encoding="utf-8") as f:
        data = json.load(f)

    # The file is no longer needed once parsed, so build the frame outside
    # of the ``with`` block.
    df = pd.json_normalize(data)
    for text_col in ('from.text', 'to.text'):
        df[text_col] = df[text_col].astype('str')
    return df

In [3]:
# Convert the survey responses df into rows of [questionId, response, academicLevel]
def to_qid_res_level(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the wide survey table into one row per answered question.

    Args:
        df: Survey table with at least the columns ``ResponseId``,
            ``RecordedDate`` and ``Qlevel``; every remaining column is
            treated as a question whose name becomes the ``qid``.

    Returns:
        A frame with the columns ``ResponseId``, ``qid``, ``res`` and
        ``Qlevel``. Unanswered questions and respondents without a
        ``Qlevel`` are not included.
    """
    # Rows labelled 0 and 1 are discarded before anything else
    # (presumably extra header rows of the export -- TODO confirm).
    responses = df.drop(index=[0, 1])

    # Lookup table ResponseId -> Qlevel, restricted to complete pairs.
    levels = responses[['ResponseId', 'Qlevel']].dropna()

    # Keep only the question columns (plus the respondent id) and discard
    # rows that are entirely empty.
    answers_wide = responses.drop(columns=['RecordedDate', 'Qlevel'])
    answers_wide = answers_wide.dropna(how='all')

    # Wide -> long: one (ResponseId, qid, res) row per question, then drop
    # the questions the respondent did not answer.
    answers_long = pd.melt(
        answers_wide,
        id_vars=['ResponseId'],
        var_name="qid",
        value_name="res",
    )
    answered = answers_long.dropna()

    # Attach each respondent's academic level to their answers.
    return pd.merge(answered, levels, on='ResponseId')

In [4]:
# Convert likert to numeric
def to_numeric_likert(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the Likert labels in ``res`` with integer scores 1..5.

    ``"Strongly agree"`` becomes 1 and ``"Strongly disagree"`` becomes 5.
    The ``res`` column of ``df`` is overwritten in place and the same frame
    is returned.

    Args:
        df: Frame with a ``res`` column holding Likert labels.

    Returns:
        ``df`` itself, with ``res`` converted to integers.
    """
    # Labels listed from score 1 to score 5.
    labels = (
        "Strongly agree",
        "Somewhat agree",
        "Neither agree nor disagree",
        "Somewhat disagree",
        "Strongly disagree",
    )
    scores = {label: score for score, label in enumerate(labels, start=1)}

    numeric = df['res'].map(scores)
    # Labels outside the scale map to NaN, which the integer cast rejects.
    df['res'] = numeric.astype('int')
    return df

In [5]:
# Convert questions where paragraphs were reversed (their qid has "-rev" appended)
# This is done by removing the "-rev" suffix from the id and inverting the likert response
def to_unswapped(df: pd.DataFrame) -> pd.DataFrame:
    """Undo the reversal of questions whose ``qid`` ends in ``-rev``.

    For every reversed question the ``-rev`` suffix is stripped from ``qid``
    and the Likert response is inverted (``res`` becomes ``6 - res``). A
    ``was_rev`` column records whether the row was reversed in the survey.

    The work is done with vectorised column operations on a copy, so the
    caller's frame is left untouched and the column dtypes are preserved
    (a row-wise ``apply`` with ``result_type='broadcast'`` turns every
    column into ``object``).

    Args:
        df: Frame with a string ``qid`` column and a numeric ``res`` column.

    Returns:
        A new frame with the same rows and columns plus a boolean
        ``was_rev`` column appended.
    """
    suffix = '-rev'
    out = df.copy()

    # na=False: a missing qid is simply treated as "not reversed".
    is_rev = out['qid'].str.endswith(suffix, na=False)

    # Strip the suffix and invert the response only where it applies.
    out['qid'] = out['qid'].mask(is_rev, out['qid'].str[:-len(suffix)])
    out['res'] = out['res'].where(~is_rev, 6 - out['res'])
    out['was_rev'] = is_rev
    return out

In [6]:
# Remove duplicated responses
# In an earlier version of the survey, some users could see both the normal and reversed
# version of a question. We remove both to avoid problems.
def remove_duplicated_answers(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every answer whose (``ResponseId``, ``qid``) pair occurs twice.

    All occurrences of a repeated pair are removed, not just the later
    ones, so a respondent who answered the same question more than once
    contributes no answer for that question.

    Args:
        df: Frame with ``ResponseId`` and ``qid`` columns.

    Returns:
        The rows of ``df`` whose (``ResponseId``, ``qid``) pair is unique.
    """
    # keep=False flags *all* members of a duplicate group.
    is_repeated = df.duplicated(subset=['ResponseId', 'qid'], keep=False)
    return df.loc[~is_repeated]

In [7]:
# Add ground truth columns
def add_ground_truth_deltas_and_texts(df: pd.DataFrame, raw_data: pd.DataFrame) -> pd.DataFrame:
    """Join the readability ground truth and texts onto the answers.

    A fixed set of columns is taken from ``raw_data``, ``_id`` is renamed
    to ``qid`` and the four readability columns are shortened to
    ``from.FRE``, ``from.FKG``, ``to.FRE`` and ``to.FKG``. The result is
    merged onto ``df`` by ``qid``.

    Args:
        df: Answers, one row per response, with a ``qid`` column.
        raw_data: Flattened raw data containing the selected columns.

    Returns:
        ``df`` merged with the selected ``raw_data`` columns. The merge uses
        pandas' default join, so answers without a matching ``qid`` in
        ``raw_data`` are not part of the result.
    """
    # Long raw-data column names -> the short names used downstream.
    short_names = {
        '_id': 'qid',
        'from.readability.fleschReadingEase': 'from.FRE',
        'from.readability.fleschKincaidGradeLevel': 'from.FKG',
        'to.readability.fleschReadingEase': 'to.FRE',
        'to.readability.fleschKincaidGradeLevel': 'to.FKG',
    }
    wanted = [
        '_id',
        'freDelta',
        'fkglDelta',
        'from.text',
        'to.text',
        'from.readability.fleschReadingEase',
        'from.readability.fleschKincaidGradeLevel',
        'to.readability.fleschReadingEase',
        'to.readability.fleschKincaidGradeLevel',
    ]

    ground_truth = raw_data.loc[:, wanted].rename(columns=short_names)
    return pd.merge(df, ground_truth, on='qid')

In [8]:
# Map likert [1, 5] -> [-2, 2]
def likert_to_delta(df: pd.DataFrame) -> pd.DataFrame:
    """Shift the Likert scores in ``res`` from [1, 5] to [-2, 2].

    The ``res`` column of ``df`` is overwritten in place and the same frame
    is returned.

    Args:
        df: Frame with a numeric ``res`` column.

    Returns:
        ``df`` itself, with 3 subtracted from every ``res`` value.
    """
    midpoint = 3

    def centre(score):
        # Move the neutral answer (3) to zero.
        return score - midpoint

    df['res'] = df['res'].map(centre)
    return df

In [9]:
# Process survey answers
def process_survey_answers(survey_fname) -> pd.DataFrame:
    """Run the full processing pipeline on one survey CSV file.

    The steps are, in order: reshape to one row per answer, convert the
    Likert labels to numbers, undo reversed questions, drop duplicated
    answers, attach the ground-truth columns from the raw data, and finally
    shift the responses to the [-2, 2] range.

    Args:
        survey_fname: Path of the survey CSV file (anything accepted by
            ``pd.read_csv``).

    Returns:
        The processed answers for this survey file.
    """
    # Let pandas open the file: read_csv decodes as UTF-8 by default, whereas
    # a bare open() would use the platform's locale encoding.
    df = pd.read_csv(survey_fname)

    df_qid_res_level = to_qid_res_level(df)
    df_numeric_likert = to_numeric_likert(df_qid_res_level)
    df_unswapped = to_unswapped(df_numeric_likert)
    df_no_dups = remove_duplicated_answers(df_unswapped)

    raw_data_df = load_raw_data()
    df_ground_truth_and_texts = add_ground_truth_deltas_and_texts(df_no_dups, raw_data_df)
    return likert_to_delta(df_ground_truth_and_texts)

In [10]:
# Read survey data and write processed data to file
def survey_processing() -> pd.DataFrame:
    """Process every survey result file and stack the results.

    Each file is run through ``process_survey_answers`` and the resulting
    frames are concatenated in the order the files are listed.

    Returns:
        All processed answers in a single frame.
    """
    survey_datasets = ['survey_results/survey0.csv', 'survey_results/survey1.csv']

    processed = []
    for raw_answers in survey_datasets:
        processed.append(process_survey_answers(raw_answers))

    all_data: pd.DataFrame = pd.concat(processed)
    return all_data

In [1]:
# Process all survey data
# Destination of the processed answers; the DataFrame index is not written.
outfname = 'survey_results_processed.csv'
survey_processing().to_csv(path_or_buf=outfname, index=False)

NameError: name 'survey_processing' is not defined