# RQ1 - Influence of the programming style on the performance of the participants

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from fitter import Fitter, get_common_distributions, get_distributions

In [None]:
# Load the preprocessed experiment data and keep only the rows whose
# "Outlier" flag is False; the bare `df` renders the result in the notebook.
df = (
    pd.read_csv('./data/preprocessed_experiment_data.csv')
    .loc[lambda frame: frame["Outlier"] == False]
)
df

## Descriptive Statistics

### Correctness Ratio

In [None]:
# Build one table with the number of correct and total answers per programming
# style and algorithm, plus a "total" row per style, and derive the ratio.
#
# Rows are collected in plain lists and turned into frames in one go:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending to an empty frame left the count columns with object dtype.
def _count_correct_answers(data, style, label):
    """Count correct and total answers of one programming style per algorithm.

    Args:
        data: Frame with "Algorithm", "ProgrammingStyle" and "Correctness" columns.
        style: Value of "ProgrammingStyle" that selects the rows to count.
        label: Name written to the "ProgrammingStyle" column of the result.

    Returns:
        A frame with one row per algorithm (in order of first appearance in
        ``data``, including algorithms that have no rows for this style),
        followed by a "total" row holding the sums over all algorithms.
    """
    style_rows = data[data["ProgrammingStyle"] == style]

    rows = []
    for algorithm in data["Algorithm"].unique():
        algorithm_rows = style_rows[style_rows["Algorithm"] == algorithm]
        rows.append({
            "ProgrammingStyle": label,
            "Algorithm": algorithm,
            # explicit comparison so that only real True values are counted
            "NumberOfCorrect": int((algorithm_rows["Correctness"] == True).sum()),
            "NumberOfTotal": len(algorithm_rows),
        })

    rows.append({
        "ProgrammingStyle": label,
        "Algorithm": "total",
        "NumberOfCorrect": sum(row["NumberOfCorrect"] for row in rows),
        "NumberOfTotal": sum(row["NumberOfTotal"] for row in rows),
    })

    return pd.DataFrame(rows, columns=["ProgrammingStyle", "Algorithm", "NumberOfCorrect", "NumberOfTotal"])


# The data uses hyphenated style names; the plots use the labels with spaces.
df_iterative = _count_correct_answers(df, "iterative", "iterative")
df_recursive = _count_correct_answers(df, "recursive", "recursive")
df_higher_order = _count_correct_answers(df, "higher-order", "higher order")
df_list_comprehension = _count_correct_answers(df, "list-comprehension", "list comprehension")

# merge dataframes to one (each style keeps its own 0..n index, as before)
df_correctness_ratio = pd.concat([df_iterative, df_recursive, df_higher_order, df_list_comprehension])
# integer columns divide element-wise; a style without answers yields NaN
df_correctness_ratio["Ratio"] = df_correctness_ratio["NumberOfCorrect"] / df_correctness_ratio["NumberOfTotal"]

In [None]:
# Grouped bar chart of the correctness ratio per algorithm (including the
# "total" rows), with one bar per programming style.
ax = sns.catplot(
    x="Algorithm",
    y="Ratio",
    hue="ProgrammingStyle",
    # make sure the ratio is numeric, as the violin/box plot cells do
    data=df_correctness_ratio.astype({"Ratio": float}),
    kind="bar",
    height=5,
    aspect=5,
)
ax.tight_layout()
# "Ratio" is NumberOfCorrect / NumberOfTotal, i.e. a fraction in [0, 1] and
# not a percentage, so the axis label must not claim "%".
ax.set(ylabel='Correctness Ratio');

In [None]:
# Violin plot of the per-algorithm correctness ratios for each programming
# style; the aggregated "total" rows are left out.
# The frame is built in one chain ending in astype(), which returns a new
# object, so no value is assigned into a filtered slice (the original
# assignment triggered pandas' SettingWithCopyWarning).
df_tmp = (
    df_correctness_ratio[["ProgrammingStyle", "Algorithm", "Ratio"]]
    .loc[lambda frame: frame["Algorithm"] != "total"]
    .astype({"Ratio": float})
)
ax = sns.violinplot(x="ProgrammingStyle", y="Ratio", data=df_tmp)
ax.set(ylim=(0.0, 1.0));

In [None]:
# Box plot of the per-algorithm correctness ratios for each programming
# style; the aggregated "total" rows are left out.
# The frame is built in one chain ending in astype(), which returns a new
# object, so no value is assigned into a filtered slice (the original
# assignment triggered pandas' SettingWithCopyWarning).
df_tmp = (
    df_correctness_ratio[["ProgrammingStyle", "Algorithm", "Ratio"]]
    .loc[lambda frame: frame["Algorithm"] != "total"]
    .astype({"Ratio": float})
)
ax = sns.boxplot(x="ProgrammingStyle", y="Ratio", data=df_tmp)
ax.set(ylim=(0.0, 1.0));

### Response Time Data

In [None]:
# Response times per algorithm and programming style.
# An explicit copy is taken so that the column assignment below modifies an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning).
df_response_time = df[["Algorithm", "ProgrammingStyle", "ResponseTime"]].copy()
# Scale by 1/1000 with a vectorised division; the plots label the result as
# seconds, so the raw values are presumably milliseconds (TODO confirm).
df_response_time["ResponseTime"] = df_response_time["ResponseTime"].astype(float) / 1000.0

# Descriptive statistics per (style, algorithm) and per style overall.
display(df_response_time.groupby(["ProgrammingStyle", "Algorithm"]).describe())
display(df_response_time.groupby("ProgrammingStyle").describe())

In [None]:
# Shared y-axis limits (lower, upper) applied to the response time plots below.
ylim = (0.0, 150.0)

In [None]:
# Bar chart of the response time per algorithm and programming style.
# Every row is duplicated with Algorithm set to "total" so that the chart also
# shows one group aggregated over all algorithms.
df_tmp_2 = df_response_time.assign(Algorithm="total")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# original row indices are kept, exactly as append did.
df_tmp = pd.concat([df_response_time, df_tmp_2])
ax = sns.catplot(x="Algorithm", y="ResponseTime", hue="ProgrammingStyle", data=df_tmp, kind="bar", height=5, aspect=5)
ax.tight_layout()
ax.set(ylim=ylim)
ax.set(ylabel='Response Time in Seconds');

In [None]:
# Violin plots of the response time per algorithm, split by programming style.
# Reuses `df_tmp` from the bar chart cell, i.e. including the "total" group.
ax = sns.catplot(
    data=df_tmp,
    x="Algorithm",
    y="ResponseTime",
    hue="ProgrammingStyle",
    kind="violin",
    width=1,
    height=5,
    aspect=5,
)
ax.tight_layout()
ax.set(ylabel='Response Time in Seconds', ylim=ylim);

In [None]:
# Violin plot of the response time distribution for each programming style
# (all algorithms pooled).
ax = sns.violinplot(
    data=df_response_time,
    x="ProgrammingStyle",
    y="ResponseTime",
)
ax.set(ylabel='Response Time in Seconds', ylim=ylim);

In [None]:
# Box plot of the response time for each programming style, drawn with the
# same y-axis limits as the other response time plots.
ax = sns.boxplot(
    data=df_response_time,
    x="ProgrammingStyle",
    y="ResponseTime",
)
ax.set_ylim(ylim);

## Inferential Statistics

### Correctness Ratio

In [None]:
# Apply a chi-square test to see whether the number of correct answers differs
# significantly between the programming styles.
df_totals = df_correctness_ratio[df_correctness_ratio["Algorithm"] == "total"]
display(df_totals)

# Cast explicitly: the count columns may have object dtype.
observed_correct = df_totals["NumberOfCorrect"].astype(float)
answers_per_style = df_totals["NumberOfTotal"].astype(float)

# Under the null hypothesis every style has the same correctness ratio, so the
# expected number of correct answers is proportional to the number of answers
# given in that style. Without f_exp, chisquare assumes equal expected counts,
# which is only valid when every style has the same total; for equal totals
# this yields exactly the same result as the plain call.
expected_correct = answers_per_style * observed_correct.sum() / answers_per_style.sum()
stats.chisquare(observed_correct, f_exp=expected_correct)

### Response Time Data

In [None]:
# Check whether the response times of each programming style are normally
# distributed by running stats.shapiro on every style's values.
for label, style in (
    ("Iterative:", "iterative"),
    ("Recursive:", "recursive"),
    ("Higher Order:", "higher-order"),
    ("List Comprehension:", "list-comprehension"),
):
    style_times = df_response_time.loc[df_response_time["ProgrammingStyle"] == style, "ResponseTime"]
    print(label, stats.shapiro(style_times))

#### Determine the distribution of the response times

In [None]:
def _fit_common_distributions(response_times):
    """Fit the common distributions to a series of response times.

    Builds a Fitter over the values of `response_times`, restricted to the
    distributions returned by get_common_distributions(), runs the fit and
    returns the fitted Fitter object.
    """
    fitter = Fitter(response_times.array, distributions=get_common_distributions())
    fitter.fit()
    return fitter


def _times_of_style(style):
    """Return the "ResponseTime" values recorded for one programming style."""
    return df_response_time[df_response_time["ProgrammingStyle"] == style]["ResponseTime"]


# One fit per programming style, followed by one over all response times.
fitter_iterative = _fit_common_distributions(_times_of_style("iterative"))
fitter_recursive = _fit_common_distributions(_times_of_style("recursive"))
fitter_higher_order = _fit_common_distributions(_times_of_style("higher-order"))
fitter_list_comprehension = _fit_common_distributions(_times_of_style("list-comprehension"))
fitter_total = _fit_common_distributions(df_response_time["ResponseTime"])

In [None]:
# Show the fit summary for the response times of the iterative style.
display(fitter_iterative.summary())

In [None]:
# Show the fit summary for the response times of the recursive style.
display(fitter_recursive.summary())

In [None]:
# Show the fit summary for the response times of the higher-order style.
display(fitter_higher_order.summary())

In [None]:
# Show the fit summary for the response times of the list-comprehension style.
display(fitter_list_comprehension.summary())

In [None]:
# Show the fit summary for the response times of all programming styles combined.
display(fitter_total.summary())

#### Compare values of response time distribution