In [1]:
# Libraries
%matplotlib widget
#%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.patheffects as mpe
from IPython.display import display
import ipywidgets as widgets
import pandas as pd
import numpy as np
import matplotlib
import sys
from scipy import stats
import math

In [2]:
# Full width cells
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Load data and clean it

## Read .csv & rename columns

In [3]:
# Location of the CSV file that holds all driving data
LocationAllData = "../data-cleanup/merged-cleaned-csv/driving_data_merged_1_to_28.csv"

# Original column name -> name used in the rest of the notebook, so that all
# columns share one naming style and a few get clearer names.
# NOTE(review): "break_position" presumably means "brake"; the name is kept
# unchanged because later cells may depend on it.
column_renames = {
    "Attempt nr": "attempt",
    "userID": "user_id",
    "evisID": "evis_id",
    "timeStamp": "timestamp",
    "currentStateOfCharge": "current_soc",
    "energyConsumed": "energy_consumed",
    "energyUsage": "power_usage",
    "distanceTraveled": "distance_traveled",
    "throttlePosition": "throttle_position",
    "breakPosition": "break_position",
    "steeringWheelRot": "wheel_rot",
    "yPosition": "road_height",
}

# Read the file and apply the renaming in one go
df = pd.read_csv(LocationAllData).rename(columns=column_renames)

## Assign *distance window*
Add a new column called "distance_window". It is used to group data points so that averages can be calculated over distance.

In [4]:
# Decide which distance interval to average over
distanceWindowWidth = 50

def assignDistanceWindow(distance):
    """Return the centre of the distance window that `distance` belongs to.

    Windows are `distanceWindowWidth` wide and centred on multiples of that
    width, so with a width of 50 the windows are centred on 0, 50, 100, ...

    A distance exactly half-way between two centres is always assigned to the
    upper window, i.e. every window covers [centre - width/2, centre + width/2).
    The built-in round() is deliberately not used: it rounds halves to the
    nearest even integer, which would put both 75 and 125 into window 100 while
    25 would go to window 0 and 175 to window 200.

    Parameters
    ----------
    distance : int or float
        Distance value to classify.

    Returns
    -------
    int or float
        `distanceWindowWidth` multiplied by the integer index of the window.
    """
    # divmod splits the distance into whole windows passed and the remainder
    # inside the current one; the remainder decides whether we are closer to
    # the next window centre.
    wholeWindows, remainder = divmod(distance, distanceWindowWidth)
    closerToNext = 2 * remainder >= distanceWindowWidth
    distanceWindowIndex = int(wholeWindows) + int(closerToNext)
    return distanceWindowWidth * distanceWindowIndex

# Add a new column that records which distance window each row belongs to
df["distance_window"] = df["distance_traveled"].map(assignDistanceWindow)

## Exclusion of participants

Here we exclude participants who did not carry out the task properly, e.g. by not using the dashboard at all or by misunderstanding the task.  
The participants I think should be excluded are listed below, each with the reason for exclusion:

* id:5 - Guess - Participant didn't use the Range Estimate number in the dashboard  
* id:10 - Diff - Participant didn't use bars at all, only focused on the speed and energy usage  
* id:16 - Diff - Didn't understand the blue line  
* id:25 - Guess - Seemed like the participant didn't understand that going at a lower speed was allowed  
* id:15 - Also present in `exclusion_list` in the code cell below, but no reason is documented here (TODO: add the reason)

In [5]:
# user_id values of the participants that are left out of the study
exclusion_list = [10, 15, 5, 25, 16]
# Flag every row that belongs to a participant in exclusion_list ...
is_excluded = df["user_id"].isin(exclusion_list)
# ... and keep only the remaining rows.
# NOTE: despite its name, df_excluded holds the rows that are left AFTER the exclusion.
df_excluded = df.loc[~is_excluded]

# Calculation of averages and STD

## Calculate mean over distance for all attempts separately
I.e., a new data frame is created in which each attempt performed by a single participant is averaged within each distance window.  
This results in fewer rows, but with a column of fixed distances.

`average_individual` contains each individual attempt over distance traveled averaged at each distance window

In [6]:
# Average every numeric column within each distance window, separately for each
# individual attempt (attempt number, EVIS and participant).
# numeric_only=True is passed explicitly: older pandas silently dropped
# non-numeric columns here, whereas pandas >= 2.0 raises a TypeError if any
# non-key column (e.g. a textual timestamp, if present) cannot be averaged.
average_individual = (
    df_excluded
    .groupby(["attempt", "evis_id", "user_id", "distance_window"])
    .mean(numeric_only=True)
    .reset_index()
)
#average_individual

## Calculate mean and STD of desired variables for the 4 different groups
These are the four defined groups
1. Diff + COPE1 - Attempt #1
2. Diff + COPE1 - Attempt #2
3. Guess-o-meter - Attempt #1
4. Guess-o-meter - Attempt #2

`average_group_based` contains, for each of the four groups and each distance window, the mean of the variables (and the standard deviation of speed, state of charge and energy consumed)


In [7]:
# Pool the per-attempt averages by attempt number, EVIS and distance window
group_keys = ["attempt", "evis_id", "distance_window"]
average_groups = average_individual.groupby(group_keys, as_index=False)

# Variables to summarise and the statistics computed for each of them
# (road_height: mean only; the others: mean and standard deviation)
summary_columns = ["speed", "current_soc", "road_height", "energy_consumed"]
aggregations = {
    "speed": ["mean", "std"],
    "current_soc": ["mean", "std"],
    "energy_consumed": ["mean", "std"],
    "road_height": "mean",
}
average_group_based = average_groups[summary_columns].agg(aggregations)

#average_group_based

# Number of successful attempts vs failed ones

## Create a new data frame with the number of fails and successes in each group
`each_user_final_data_success` & `each_user_final_data_fail` contain the last averaged data point for each individual attempt

In [9]:
# An attempt counts as a success when its last distance window equals this value.
# NOTE(review): 8000 is taken over unchanged from the original cell -- confirm
# that it is the final distance window of the route.
GOAL_DISTANCE_WINDOW = 8000

# Group based on attempt, evis and user id.
each_user = average_individual.groupby(["attempt", "evis_id", "user_id"])

# Last datapoint of each attempt. tail(1) already returns a single DataFrame,
# so no pd.concat is needed. This relies on the rows of average_individual
# being ordered by distance_window within each attempt.
each_user_final_data = each_user.tail(1)

# Split the final datapoints into attempts that reached the goal and those that did not
reached_goal = each_user_final_data["distance_window"] == GOAL_DISTANCE_WINDOW
each_user_final_data_success = each_user_final_data[reached_goal]
each_user_final_data_fail = each_user_final_data[~reached_goal]

# Count attempts per (attempt, evis_id) group
successes = (
    each_user_final_data_success
    .groupby(["attempt", "evis_id"])["user_id"]
    .count()
    .rename("successes")
)
fails = each_user_final_data_fail.groupby(["attempt", "evis_id"])[["user_id"]].count()

# Combine both counts on the union of their groups. Assigning the fails as a
# new column of the successes frame would silently drop every group that has
# fails but no successes. Missing combinations are real zeros, so the counts
# are kept as integers instead of turning into floats.
df_fail_success = (
    pd.concat([successes, fails["user_id"].rename("fails")], axis=1)
    .fillna(0)
    .astype(int)
    .sort_index()
)

df_fail_success

Unnamed: 0_level_0,Unnamed: 1_level_0,successes,fails
attempt,evis_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,DiffAndCOPE1,7,5.0
1,GuessOMeter,3,8.0
2,DiffAndCOPE1,12,0.0
2,GuessOMeter,9,2.0


## Test statistical significance
This is a binomial setup, since each attempt has only two possible outcomes: success or failure

# Graph the attempts over distance traveled

## speed over distance

In [None]:
## So

# Average distance traveled

# Statistical significance at different sections of the track