## Combine the Survey datasets

There are two batches of ratings as the first was more of a test how MTurk works

The image order was scrambled for each image so the order has less of an effect on ratings, therefore it needs to be
unscrambled.

Furthermore unapproved ratings (like ones containing only default values) need tp be filtered out

In [1]:
import pandas as pd
import numpy as np
import survey_utils as utils
import random
import math

from pathlib import Path
from IPython.core.display import display, HTML
import scipy

print(scipy.__version__)

# Enables visualizations
visualization = True
relevant_columns = utils.get_relevant_columns_for_visualization(visualization)

# styles to evaluate further
styles = ['original', 'nicer', 'ssmtpiaa_sgd', 'ssmtpiaa_cma', 'expert']

# Load the datasets from both batches
survey1 = pd.read_csv(Path('sets/survey_results/processed/MTurk_Batch_1_approvals.csv'))
survey2 = pd.read_csv(Path('sets/survey_results/processed/MTurk_Batch_2_approvals.csv'))

# Combine the 2 suvey datasets and descramble ratings.
survey_result = utils.preprocess_data(survey1, survey2, visualization)

print('Done!')

1.7.0
Done!


## How often is a style the preffered style?

In [72]:
styles_to_evaluate = ['original', 'expert']

# initialize counter
best_styles = {}
examples = {}
for style in styles_to_evaluate:
    best_styles[style] = 0
    examples[style] = set()

for i in range(survey_result.shape[0]):
    best = 0
    # Get best rating for each image
    for style in styles_to_evaluate:
        best = max(best, survey_result.at[i, f'{style}_rating'])
    # Find out to which style it belongs
    for style in styles_to_evaluate:
         if survey_result.at[i, f'{style}_rating'] == best:
             best_styles[style] += 1
             examples[style].add(i)

best_count = pd.DataFrame(index=styles_to_evaluate)
for style in styles_to_evaluate:
    best_count.at[style, 'prefered_in'] = best_styles[style]
best_count.prefered_in = best_count.prefered_in.astype(int)
display(best_count)

Unnamed: 0,prefered_in
original,1312
expert,3372


## Significance test

In [73]:
base_style = 'original'
enhanced_style = 'expert'

enhanced_is_better = []
random_list = []

for i in range(survey_result.shape[0]):
    best = 0
    # Get best rating for each image
    for style in [base_style, enhanced_style]:
        best = max(best, survey_result.at[i, f'{style}_rating'])
    # Find out to which style it belongs
    for style in [base_style, enhanced_style]:
        if survey_result.at[i, f'{base_style}_rating'] == best:
            enhanced_is_better.append(0)
        else:
            enhanced_is_better.append(1)
pvalues = []
statistics = []

for i in range(50):
    result = stats.ttest_ind(enhanced_is_better, utils.get_random_array(len(enhanced_is_better)), alternative='greater')
    pvalues.append(result.pvalue)
    statistics.append(result.statistic)

print(f'statistic: {np.mean(statistics)}')
print(f'p-value: {np.mean(pvalues)}')

statistic: 30.33327781556948
p-value: 1.2653111283065755e-183


In [74]:
base_style = 'original'
enhanced_style = 'ssmtpiaa_sgd'

enhanced_is_better = []
random_list = []

for i in range(survey_result.shape[0]):
    best = 0
    # Get best rating for each image
    for style in [base_style, enhanced_style]:
        best = max(best, survey_result.at[i, f'{style}_rating'])
    # Find out to which style it belongs
    if survey_result.at[i, f'{enhanced_style}_rating'] == best:
        enhanced_is_better.append(1)

n = survey_result.shape[0]
k = len(enhanced_is_better)
p = 0.5

binom_test = stats.binomtest()

print(type(stats.binom_test(k, n, p, alternative='greater')))

AttributeError: module 'scipy.stats' has no attribute 'binomtest'

## Simple means over all ratings

In [None]:
for style in styles:
    print(f"Mean rating for {style}: {np.mean(survey_result[style]):.2f}")


## Combine individual ratings of an image

In [None]:
survey_grouped = utils.combine_image_ratings(survey_result, visualization, styles)

# select row to preview (between 0 and 499)
row = 20

if visualization:
    display(HTML(pd.DataFrame(survey_grouped.iloc[row][relevant_columns]).transpose().to_html(escape=False)))

## What are the mean variances of each style?

In [None]:
ratings = pd.DataFrame(index=styles)

for style in styles:
    ratings.at[style, 'rating'] = np.mean(survey_result[style])
    ratings.at[style, 'variance'] = np.var(survey_result[style])
    ratings.at[style, 'standard deviation'] = np.std(survey_result[style])

display(ratings)


## How often is a style the preffered style (via mean rating)?

In [None]:
styles_to_evaluate = ['nicer', 'ssmtpiaa_sgd']

# initialize counter
best_styles = {}
examples = {}
for style in styles_to_evaluate:
    best_styles[style] = 0
    examples[style] = set()

for i in range(survey_grouped.shape[0]):
    best = 0
    # Get best rating for each image
    for style in styles_to_evaluate:
        best = max(best, survey_grouped.at[i, f'{style}_rating'])
    # Find out to which style it belongs
    for style in styles_to_evaluate:
         if survey_grouped.at[i, f'{style}_rating'] == best:
             best_styles[style] += 1
             examples[style].add(i)
best_count = pd.DataFrame(index=styles_to_evaluate)
for style in styles_to_evaluate:
    best_count.at[style, 'best_style'] = best_styles[style]
best_count.best_style = best_count.best_style.astype(int)
display(best_count)

if visualization:
    for style in styles_to_evaluate:
        example = random.choice(list(examples[style]))
        diff_to_orig = survey_grouped.iloc[example][f'{style}_rating'] - survey_grouped.iloc[example]["original_rating"]
        display(f'Example where {style} was the preffered style with a : difference of {diff_to_orig:.2f} to the original rating')
        display(HTML(pd.DataFrame(survey_grouped.iloc[example][relevant_columns]).transpose().to_html(escape=False)))