# Wrangling data for in-the-wild robot experiments

## Preamble

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import os

In [2]:
# Load the raw workbooks: one observation workbook (two sheets) and two survey workbooks.

# NOTE(review): os.listdir returns entries in arbitrary order, so the positional
# indexing below (file_names[0], [1], [2]) depends on whatever order this machine
# reports -- confirm which file lands at each index, or select the files by name.
file_names = os.listdir("raw")

# Both observation frames come from the same workbook, so each sheet is requested by name.
# NOTE(review): read_excel's default (sheet_name=0) reads only the first sheet; it
# does not join all sheets into one dataframe.
# convert_dtypes() is chained onto every frame to prevent integer columns from being stored as float.
observations_oc = pd.read_excel(os.path.join("raw", file_names[0]), sheet_name="OC1").convert_dtypes() 
observations_default = pd.read_excel(os.path.join("raw", file_names[0]), sheet_name="D1").convert_dtypes() 

# Survey responses: drop rows in which every cell is NA (these were caused by our
# pilot study), then renumber the remaining rows from 0.
default = pd.read_excel(os.path.join("raw", 
                                     file_names[1]), 
                        sheet_name=0).dropna(how="all").reset_index(drop=True).convert_dtypes() 
oc = pd.read_excel(os.path.join("raw", 
                                file_names[2]), 
                   sheet_name=0).dropna(how="all").reset_index(drop=True).convert_dtypes() 


## Cleaning Observation data

For the observation data, I pre-filled many rows with default entries before realizing that I could simply fill in NAs after data collection. As a result, the sheets contain rows for which no data was actually collected, so our data is going to look pretty inflated:

In [3]:
# Report the (rows, columns) of each observation frame before cleaning.
print(f"Default shape: {observations_default.shape}")
print(f"OC shape: {observations_oc.shape}")

Default shape: (199, 13)
OC shape: (425, 13)


One way to spot these placeholder rows is to look at their group numbers. The default group number is always 0, and a group number of 0 is only valid for the very first row of our data. Hence, we can remove all the other rows where **Group**==0:

In [4]:
def delete_group_0s(df):
    """Remove the placeholder rows whose ``Group`` is 0.

    A ``Group`` of 0 is only legitimate on the very first row of the frame;
    every other row with ``Group == 0`` is a pre-filled default entry and is
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Observation data with a ``Group`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without the placeholder rows. The input is not modified
        and the surviving rows keep their original index labels.
    """
    # Missing Group values compare as NA; treat them as "not a placeholder".
    # copy=True guarantees a writable array that is independent of df.
    is_placeholder = (df["Group"] == 0).fillna(False).to_numpy(dtype=bool, copy=True)
    if is_placeholder.size:
        # Exempt the first row by position. If it is not a Group-0 row, no
        # Group-0 row is valid and all of them are removed.
        is_placeholder[0] = False
    # Select by positional mask rather than by index label, so repeated index
    # labels cannot cause extra rows to be removed.
    return df[~is_placeholder]

# Strip the placeholder Group-0 rows from both observation frames.
observations_default, observations_oc = map(
    delete_group_0s, (observations_default, observations_oc)
)

Let's examine the shape of our final observation data frames:

In [5]:
# Report the (rows, columns) of each observation frame after dropping placeholder rows.
print(f"Default shape: {observations_default.shape}")
print(f"OC shape: {observations_oc.shape}")

Default shape: (138, 13)
OC shape: (425, 13)


During the OC-SORT sessions, I mixed up the **false_alarm** and **Survey** columns, recording each one's values in the other. So, let's swap them back:

In [6]:
# Swap the contents of the two columns in a single statement: the right-hand side
# is evaluated in full before either assignment happens, so no temporary is needed.
# NOTE(review): this cell is not idempotent -- running it a second time swaps the
# columns back. Re-run the notebook from the top if it must be executed again.
observations_oc["false_alarm"], observations_oc["Survey"] = observations_oc["Survey"], observations_oc["false_alarm"]
observations_oc.head()

Unnamed: 0,Time,Group,Participant,Success,Invested,Photo,Emotion,false_alarm,Notes,Survey,I1,I2,I3
0,2023-07-27 12:02:21.965,0,0,0,0,0,N,,,0,,,
1,2023-07-27 12:02:33.701,1,0,0,0,0,N,,,0,,,
2,2023-07-27 12:02:40.055,2,0,0,0,0,N,,,0,,,
3,2023-07-27 12:02:51.530,3,0,0,0,0,N,,,0,,,
4,2023-07-27 12:04:14.732,4,0,0,0,0,N,,,0,,,


Since **false_alarm** is the only boolean with NA, we'll fill it with 0 as the default:

In [7]:
# Assign the filled column back to the frame instead of calling
# fillna(..., inplace=True) on the attribute-accessed Series. That chained in-place
# form emits a FutureWarning in pandas 2.x and, with Copy-on-Write enabled
# (the default in pandas 3), leaves the frame unchanged.
observations_oc["false_alarm"] = observations_oc["false_alarm"].fillna(0)
observations_oc.head()

Unnamed: 0,Time,Group,Participant,Success,Invested,Photo,Emotion,false_alarm,Notes,Survey,I1,I2,I3
0,2023-07-27 12:02:21.965,0,0,0,0,0,N,0,,0,,,
1,2023-07-27 12:02:33.701,1,0,0,0,0,N,0,,0,,,
2,2023-07-27 12:02:40.055,2,0,0,0,0,N,0,,0,,,
3,2023-07-27 12:02:51.530,3,0,0,0,0,N,0,,0,,,
4,2023-07-27 12:04:14.732,4,0,0,0,0,N,0,,0,,,


## Cleaning survey data

There's a shape mismatch between **oc** and **default**:

In [8]:
# Compare the (rows, columns) of the two survey frames.
print(f"OC-SORT survey shape: {oc.shape}")
print(f"Default survey shape: {default.shape}")

OC-SORT survey shape: (8, 17)
Default survey shape: (9, 14)


This is because during the pilot study, I included interview questions in the survey responses for OC-SORT, which was a bad idea. Let's restrict **oc** to the same columns as **default**:

In [9]:
# Keep only the columns that also exist in `default`, in `default`'s column order.
oc = oc.loc[:, default.columns.tolist()]
print(f"New oc shape: {oc.shape}")

New oc shape: (8, 14)


## Checking data integrity

It's likely that I might've made some mistakes during data collection. Let's fix some of them!

### Is Group increasing?

**Group** was collected by incrementing the previous value by 1 for each new group/person that has walked in front of the robot. Since this was done by hand, it's likely that I might've made mistakes during this process.

In [10]:
def monotonic_increasing_by_one(df, cat):
    """Print whether column ``cat`` only ever steps by 0 or +1 from row to row.

    The column is scanned in row order. At the first step that is larger than
    +1, or that goes down, a message describing the offending position is
    printed and the scan stops. If no such step exists (including for an empty
    or single-row column), a success message is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to check.
    cat : str
        Name of the numeric column to check.

    Returns
    -------
    None
        The result is reported on standard output only.
    """
    nums = df[cat].tolist()
    # Pair every value with its successor; `i` is the position of the successor
    # in the list (not the DataFrame index label).
    for i, (prev, cur) in enumerate(zip(nums, nums[1:]), start=1):
        diff = cur - prev
        if diff > 1:
            print("This list is not monotonically increasing by 1 or 0. This occurred at index", i, "where the previous number", prev, "is", diff, "less than", cur)
            break
        if diff < 0:
            # A decrease also breaks the "increasing by 1 or 0" rule.
            print("This list is not monotonically increasing by 1 or 0. This occurred at index", i, "where the previous number", prev, "is", -diff, "greater than", cur)
            break
    else:
        # Reached only when the loop finished without hitting a break.
        print("This list is monotonically increasing by 1 or 0")

In [11]:
# Validate the Group column of each observation frame, announcing the frame first.
for _label, _frame in (("Default", observations_default), ("OC-SORT", observations_oc)):
    print(f"Checking for {_label}:")
    monotonic_increasing_by_one(_frame, "Group")

Checking for Default:
This list is monotonically increasing by 1 or 0
Checking for OC-SORT:
This list is monotonically increasing by 1 or 0


Good, now our **Group** attribute has been validated.

### Do passersby in the same group have the same time?

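I recorded this data so that people from the same group encounter Pepper at the same time, which means every row in a group should share one timestamp. Using this answer on [StackOverflow](https://stackoverflow.com/questions/54518504/check-if-group-contains-same-value-in-pandas), we can find the groups that violate this and overwrite their times with the first time recorded for the group: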

In [12]:
def same_group_diff_time(df):
    """Return the group numbers whose rows hold more than one distinct ``Time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Group`` and ``Time`` columns.

    Returns
    -------
    list
        ``Group`` values that have more than one distinct ``Time``; empty when
        every group is consistent.
    """
    # Number of distinct timestamps recorded for each group.
    distinct_times = df.groupby("Group")["Time"].nunique()
    inconsistent = distinct_times[distinct_times > 1]
    return inconsistent.index.tolist()

def fix_group_number_time(df):
    """Make every row of a group share that group's first recorded ``Time``.

    For each group reported by ``same_group_diff_time`` (groups whose rows have
    differing times), the ``Time`` of all its rows is overwritten with the
    ``Time`` of the group's first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Group`` and ``Time`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    for group in same_group_diff_time(df):
        in_group = df["Group"] == group
        # Select Time by column name rather than by position, so the fix does
        # not depend on Time being the first column of the frame.
        first_time = df.loc[in_group, "Time"].iloc[0]
        df.loc[in_group, "Time"] = first_time
    return df

In [13]:
# Harmonize the timestamps within each group for both observation frames.
observations_default, observations_oc = map(
    fix_group_number_time, (observations_default, observations_oc)
)

Running these new dataframes through *same_group_diff_time* should yield empty lists:

In [14]:
# Both lists should now be empty: no group may still hold more than one Time.
print(f"Any more mismatched time for Default? {same_group_diff_time(observations_default)}")
print(f"Any more mismatched time for OC-SORT? {same_group_diff_time(observations_oc)}")

Any more mismatched time for Default? []
Any more mismatched time for OC-SORT? []


And that is our time problem solved and validated.

In [15]:
observations_oc

Unnamed: 0,Time,Group,Participant,Success,Invested,Photo,Emotion,false_alarm,Notes,Survey,I1,I2,I3
0,2023-07-27 12:02:21.965,0,0,0,0,0,N,0,,0,,,
1,2023-07-27 12:02:33.701,1,0,0,0,0,N,0,,0,,,
2,2023-07-27 12:02:40.055,2,0,0,0,0,N,0,,0,,,
3,2023-07-27 12:02:51.530,3,0,0,0,0,N,0,,0,,,
4,2023-07-27 12:04:14.732,4,0,0,0,0,N,0,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,2023-07-31 13:56:31.603,382,0,0,0,0,N,0,,0,,,
421,2023-07-31 13:57:03.727,383,0,0,0,0,N,0,,0,,,
422,2023-07-31 13:57:45.812,384,0,0,0,0,N,0,,0,,,
423,2023-07-31 13:58:02.550,385,0,0,0,0,N,0,,0,,,


## Can non-participants be successful, be invested, or have filled out the survey?

The answer is no. So we will check:

In [16]:
def participant_check(df, name="Default"):
    """Return rows where a non-participant is marked successful, invested or surveyed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Participant``, ``Success``, ``Invested`` and ``Survey`` columns.
    name : str
        Label of the data set, used only in the warning message.

    Returns
    -------
    pandas.DataFrame
        The offending rows (empty when the data is consistent). A warning is
        printed when at least one such row exists.
    """
    # A row has an outcome flag set when the three flags do not sum to zero.
    has_outcome = (df["Success"] + df["Invested"] + df["Survey"]) != 0
    not_participant = df["Participant"] == 0
    violations = df[not_participant & has_outcome]
    if len(violations) > 0:
        print("You have invalid data for " + name)
    return violations

In [17]:
# Collect any non-participant rows that wrongly carry an outcome flag.
pcd = participant_check(observations_default, name="Default")
pco = participant_check(observations_oc, name="OC-SORT")

Good, no issues here.

## Is the sum of Survey actually equal to the number of rows from survey dataframes?

In [18]:
# The number of observation rows flagged Survey must equal the number of survey responses.
print(f"Survey consistent for Default? {observations_default['Survey'].sum() == len(default)}")
print(f"Survey consistent for OC-SORT? {observations_oc['Survey'].sum() == len(oc)}")

Survey consistent for Default? True
Survey consistent for OC-SORT? True


## Exporting

In [19]:
os.path.isdir("cleaned")

True

In [20]:
# Write every cleaned frame to <fn>/<name>.csv, creating the folder on first run.
fn = "cleaned"
if not os.path.isdir(fn):
    os.makedirs(fn)
    print("Folder named", fn, "created successfully!")

for _stem, _frame in (
    ("observations_default", observations_default),
    ("observations_oc", observations_oc),
    ("default", default),
    ("oc", oc),
):
    # index=False keeps the row index out of the exported file.
    _frame.to_csv(os.path.join(fn, _stem + ".csv"), index=False)