# Preprocessing 01
---
## Read the data exported from SoSci Survey and save it in a more convenient format

In [None]:
import re
import pandas as pd

# Turn off pandas' chained-assignment check for the whole notebook
# (None disables both the warning and the error mode).
pd.options.mode.chained_assignment = None

### Read in the Data

In [None]:
# Load the survey export and discard the row with index label 0.
# NOTE(review): presumably row 0 holds the variable descriptions rather than
# a participant's answers -- confirm against the export file.
df_raw = pd.read_excel("./data/data_hir.xlsx").drop(index=0)

### Drop 'useless' rows and columns

In [None]:
# Bookkeeping columns of the survey export that are not needed afterwards.
_meta_columns = [
    'SERIAL', 'REF', 'QUESTNNR', 'MODE', 'MAILSENT', 'TIME_SUM', 'Q_VIEWER',
    'LASTPAGE', 'MAXPAGE', 'MISSING', 'MISSREL', 'TIME_RSI', 'DEG_TIME', 'FINISHED',
]
df_raw = df_raw.drop(columns=_meta_columns)

# remove time columns (TIME003, TIME023 and the odd numbers above 023 are not in this list)
_time_columns = [
    'TIME001', 'TIME002', 'TIME004', 'TIME005', 'TIME006', 'TIME007', 'TIME008',
    'TIME009', 'TIME010', 'TIME011', 'TIME012', 'TIME013', 'TIME014', 'TIME015',
    'TIME016', 'TIME017', 'TIME018', 'TIME019', 'TIME020', 'TIME021', 'TIME022',
    'TIME024', 'TIME026', 'TIME028', 'TIME030', 'TIME032', 'TIME034',
]
df_raw = df_raw.drop(columns=_time_columns)

In [None]:
# Write the reduced table to CSV; the row index is not written.
df_raw.to_csv(path_or_buf="./data/preprocessed_total_data.csv", index=False)

In [None]:
# Show df_raw as the output of this notebook cell.
df_raw

### Transform the DataFrame to hold the algorithm name, the correctness, the response time, the click data, and the time data

In [None]:
# Column-name prefixes of the survey export, one entry per programming style.
# Both lists share the same order; later cells map index 0..3 to
# iterative, recursive, higher-order and list-comprehension.
config_prefix = ["IT", "RE", "HO", "LC"]  # answer columns
config_prefix_variable = ["IV", "RV", "HV", "LV"]  # click / time columns

# Column holding the participant (case) identifier.
config_id_variable = "CASE"

# Algorithm names; position i belongs to answer number 15 + i and to
# click/time number 2 + i in the name grids below.
config_algo_names = [
    "apply", "condition", "find", "is_prime", "max", "node",
    "prime_factors", "quad_mul", "students", "computer", "store", "LinkedList",
]

# config_answer_variables[style][algo] -> the pair of answer columns,
# e.g. ["IT15_01", "IT15_01a"].
config_answer_variables = [
    [[f"{prefix}{number:02d}_01", f"{prefix}{number:02d}_01a"] for number in range(15, 27)]
    for prefix in config_prefix
]

# config_click_variables[style][algo] -> click-data column, e.g. "IV02_01".
config_click_variables = [
    [f"{prefix}{number:02d}_01" for number in range(2, 14)]
    for prefix in config_prefix_variable
]

# config_time_variables[style][algo] -> time-data column, e.g. "IV02_02".
config_time_variables = [
    [f"{prefix}{number:02d}_02" for number in range(2, 14)]
    for prefix in config_prefix_variable
]


# Regular expressions describing the accepted answer for each algorithm.
# Entry i is matched against the answer given for config_algo_names[i]; an
# optional pair of surrounding double quotes is tolerated everywhere.
# Raw strings are used so that "\[" and "\s" reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
config_answer_patterns = [
    r'^"?\[2,\s?6,\s?12\]"?$',  # apply
    r'^"?42"?$',  # condition
    r'^"?\[?2\]?"?$',  # find
    r'^"?(True|true)"?$',  # is_prime
    r'^"?5"?$',  # max
    r'^"?36"?$',  # node
    r'^"?\[2,\s?3\]"?$',  # prime_factors
    r'^"?36"?$',  # quad_mul
    r'^"?(\["Jasmin",\s?"Florian"\]|\[Jasmin,\s?Florian\])"?$',  # students
    r'^"?\[1,\s?5\]"?$',  # computer
    r'^"?\[0,\s?3\]"?$',  # store
    r'^"?16"?$',  # LinkedList
]

In [None]:
def _task_frame(algo_idx, detail_idx):
    """Return the df_raw slice for one (algorithm, programming style) task.

    The slice holds the id column, the two answer columns, the click column
    and the time column of that task. Rows in which both answer columns are
    missing are removed.
    """
    answer_columns = config_answer_variables[detail_idx][algo_idx]
    wanted = [
        config_id_variable,
        *answer_columns,
        config_click_variables[detail_idx][algo_idx],
        config_time_variables[detail_idx][algo_idx],
    ]
    frame = pd.DataFrame(df_raw, columns=wanted)
    return frame.dropna(subset=answer_columns, how="all")


# load data: df_matrix[algo_idx][detail_idx] is the frame of one task
df_matrix = [
    [_task_frame(algo_idx, detail_idx) for detail_idx in range(len(config_prefix_variable))]
    for algo_idx in range(len(config_algo_names))
]

In [None]:
cols = ["id", "algo_name", "answer", "correctness", "click_data", "time_data"]

# One row accumulator per programming style, in the order of config_prefix
# (IT, RE, HO, LC). Rows are collected in plain lists and turned into
# DataFrames once at the end: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
rows_per_style = [[] for _ in config_prefix]

for algo_idx, df_row in enumerate(df_matrix):
    algo_name = config_algo_names[algo_idx]
    # Pattern describing the accepted answer for this algorithm.
    regex = re.compile(config_answer_patterns[algo_idx])

    for detail_idx, df in enumerate(df_row):
        answer_columns = config_answer_variables[detail_idx][algo_idx]
        click_column = config_click_variables[detail_idx][algo_idx]
        time_column = config_time_variables[detail_idx][algo_idx]

        for _, row in df.iterrows():
            # Only the first of the two answer columns is graded; a missing
            # value becomes the string "nan" and therefore counts as wrong.
            answer = str(row[answer_columns[0]])
            correctness = regex.match(answer) is not None

            rows_per_style[detail_idx].append(
                [
                    row[config_id_variable],
                    algo_name,
                    answer,
                    correctness,
                    row[click_column],
                    row[time_column],
                ]
            )

# The resulting frames carry a fresh 0..n-1 index (the append-based version
# left every row with index label 0).
df_IT, df_RE, df_HO, df_LC = (pd.DataFrame(rows, columns=cols) for rows in rows_per_style)

In [None]:
# The four per-style frames gathered in one list, in the order
# iterative, recursive, higher-order, list-comprehension.
df_array_independent = [
    df_IT,
    df_RE,
    df_HO,
    df_LC,
]

In [None]:
# Column layout of the final experiment table.
cols = [
    "ID",
    "ProgrammingStyle",
    "Algorithm",
    "ResponseTime",
    "Correctness",
    "ClickData",
    "TimeData",
]


def fill(list_data, dataframe, programming_style):
    """Append one output record per row of ``dataframe`` to ``list_data``.

    Args:
        list_data: List that is extended in place. Each appended record is
            ``[id, programming_style, algorithm, response_time, correctness,
            click_data, time_data]``.
        dataframe: Frame with the columns ``id``, ``algo_name``,
            ``correctness``, ``click_data`` and ``time_data``.
        programming_style: Label stored in every appended record.

    The response time is the last token of ``time_data`` (split on single
    spaces) that parses as an integer, or 0 when no token does.
    """
    for _, row in dataframe.iterrows():
        time_data = row["time_data"]

        response_time = 0
        for token in str(time_data).split(" "):
            try:
                response_time = int(token)
            except ValueError:
                # Non-numeric token (e.g. "nan" for a missing value): keep
                # the last integer seen so far.
                continue

        list_data.append(
            [
                row["id"],
                programming_style,
                row["algo_name"],
                response_time,
                row["correctness"],
                row["click_data"],
                time_data,
            ]
        )


# Flatten the four per-style frames into one list of records, tagging every
# record with the name of its programming style.
data = []
for style_frame, style_label in (
    (df_IT, "iterative"),
    (df_RE, "recursive"),
    (df_HO, "higher-order"),
    (df_LC, "list-comprehension"),
):
    fill(data, style_frame, style_label)

df = pd.DataFrame(data, columns=cols)

# The algorithm called 'condition' so far is stored as 'condition_sum'.
is_condition = df["Algorithm"] == 'condition'
df.loc[is_condition, 'Algorithm'] = 'condition_sum'

df.to_csv("./data/preprocessed_experiment_data.csv", index=False)

In [None]:
# Show df as the output of this notebook cell.
df