# Exploratory Analysis

In [1]:
import warnings

import numpy as np
import pandas as pd

# Display settings: never truncate cell text and always show every column.
pd.set_option("max_colwidth", None)
pd.set_option("display.max_columns", None)

# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas' chained-assignment /
# SettingWithCopy warnings, so assignment problems in later cells go unnoticed.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw results export; the "Case ID" column becomes the row index.
results = pd.read_csv(
    'data/1345_customer_results.csv',
    index_col="Case ID",
)

In [3]:
# Preview the first two rows to see which columns the export contains.
results.head(2)

Unnamed: 0_level_0,Origin,Origin Created At,Content ID,URL,Labeling State,Series,Series Index,Patch,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight,Internal Notes,Comments,Explanation
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5888087,https://centaur-customer-uploads.s3.us-east-1.amazonaws.com/mgh-eeg/210721/jpg/B14_time139137_medoid_vote2.jpg,Jul 26 2021 21:41:47 PM,3264386,https://go.centaurlabs.com/problem/5888087,Gold Standard,,,,2,'no','no',0.0,1.0,'no',2,1.54,'yes',0,0.0,,[],
5888088,https://centaur-customer-uploads.s3.us-east-1.amazonaws.com/mgh-eeg/210721/jpg/B10_time178433_medoid_vote0.jpg,Jul 26 2021 21:41:47 PM,3264387,https://go.centaurlabs.com/problem/5888088,Gold Standard,,,,3,'no','no',0.0,1.0,'no',3,2.34,'yes',0,0.0,,[],


---
### Cleaning the Data

- Throwing out rows with invalid vote numbers
- Creating an expert vote column
- Removing useless columns

In [4]:
# Filtering out the rows without a vote number
# A row qualifies when its Origin text contains "vote" followed by a digit 0-8
# (case-insensitive). na=False makes a missing Origin count as "no vote number"
# instead of producing NaN, which would make the mask unusable for boolean indexing.
vote_exists = results['Origin'].str.fullmatch(r'(.*)vote[0-8](.*)', case=False, na=False)

# How many rows carry a usable vote number (True) versus not (False).
vote_exists.value_counts()

True     27000
False     3293
Name: Origin, dtype: int64

In [5]:
# Dropping the rows without vote number
# .copy() makes this an independent frame, so the column added below is written
# to it directly rather than to a slice of `results`.
results_vote_exists = results[vote_exists].copy()

# Adding expert vote column
# The pattern mirrors the row filter (case-insensitive, digit limited to 0-8), so
# every kept row yields a digit and the integer cast cannot hit a missing value.
# expand=False returns a Series rather than a one-column DataFrame.
results_vote_exists['N Expert Affirmation'] = (
    results_vote_exists['Origin']
    .str.extract(r'(?i)vote([0-8])', expand=False)
    .astype(int)
)
results_vote_exists.head(2)

Unnamed: 0_level_0,Origin,Origin Created At,Content ID,URL,Labeling State,Series,Series Index,Patch,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Answer,First Choice Votes,First Choice Weight,Second Choice Answer,Second Choice Votes,Second Choice Weight,Internal Notes,Comments,Explanation,N Expert Affirmation
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
5888087,https://centaur-customer-uploads.s3.us-east-1.amazonaws.com/mgh-eeg/210721/jpg/B14_time139137_medoid_vote2.jpg,Jul 26 2021 21:41:47 PM,3264386,https://go.centaurlabs.com/problem/5888087,Gold Standard,,,,2,'no','no',0.0,1.0,'no',2,1.54,'yes',0,0.0,,[],,2
5888088,https://centaur-customer-uploads.s3.us-east-1.amazonaws.com/mgh-eeg/210721/jpg/B10_time178433_medoid_vote0.jpg,Jul 26 2021 21:41:47 PM,3264387,https://go.centaurlabs.com/problem/5888088,Gold Standard,,,,3,'no','no',0.0,1.0,'no',3,2.34,'yes',0,0.0,,[],,0


In [6]:
# Filtering out useless columns
# Keep only the columns used in the analysis. .copy() makes `reduced` an
# independent frame, because later cells add new columns to it.
reduced = results_vote_exists[
    [
        'Labeling State',
        'Qualified Reads',
        'Correct Label',
        'Majority Label',
        'Difficulty',
        'Agreement',
        'First Choice Weight',
        'Second Choice Weight',
        'N Expert Affirmation',
    ]
].copy()

In [7]:
# Preview the first two rows of the reduced frame.
reduced.head(2)

Unnamed: 0_level_0,Labeling State,Qualified Reads,Correct Label,Majority Label,Difficulty,Agreement,First Choice Weight,Second Choice Weight,N Expert Affirmation
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5888087,Gold Standard,2,'no','no',0.0,1.0,1.54,0.0,2
5888088,Gold Standard,3,'no','no',0.0,1.0,2.34,0.0,0


**Columns dropped:**

- First choice answer, votes
- Second choice answer, votes
- Qualitative columns besides Labeling State

**Explanation:** No useful information is lost by dropping these columns. The first and second choice answers can be recovered from the Majority Label, and the number of votes for each can be obtained by combining Qualified Reads with Agreement. The qualitative columns cannot be used in a meaningful way for analysis.



---
### Adding Potentially Useful Columns

- **Adding an expert agreement column**
    - Renaming crowd agreement column to avoid confusion
    
This should help with comparisons between the crowd's confidence and the experts' confidence.

- **Adding a consensus column for whether or not the crowd and experts agree**

This should make it easier to compare whether the crowd agrees with the expert consensus more than individual experts do (our objective).

In [8]:
# Adding expert agreement column
# The vote suffix parsed from Origin runs from 0 to 8.
# NOTE(review): presumably this means 8 expert raters per case — confirm.
N_EXPERTS = 8

# Renaming crowd agreement column to avoid confusion with the expert one.
# Doing it here keeps every later cell (including the preview) on the final column name.
reduced = reduced.rename(columns={'Agreement': 'Crowd Agreement'})

# Share of experts on the majority side: n/8 when more than half affirmed,
# 1 - n/8 when fewer than half did, and 0.5 for an even 4-4 split.
# Computed in one vectorised step, which keeps the column numeric and avoids
# chained-indexing assignment (df[col][mask] = ...), which pandas does not
# guarantee will modify the frame.
n_affirm = reduced['N Expert Affirmation']
reduced['Expert Agreement'] = np.maximum(n_affirm, N_EXPERTS - n_affirm) / N_EXPERTS

In [15]:
# Adding consensus column
# 'yes' when the crowd's majority label equals the correct label, otherwise 'no'.
# Rows with a missing Correct Label compare unequal and are therefore marked 'no'.
labels_match = reduced['Correct Label'] == reduced['Majority Label']

# Single vectorised assignment; chained-indexing assignment (df[col][mask] = ...)
# is not guaranteed to write back to the frame.
reduced['Consensus'] = np.where(labels_match, 'yes', 'no')

In [18]:
# Preview the first rows, including the newly added columns.
reduced.head()

Unnamed: 0_level_0,Labeling State,Qualified Reads,Correct Label,Majority Label,Difficulty,Crowd Agreement,First Choice Weight,Second Choice Weight,N Expert Affirmation,Expert Agreement,Consensus
Case ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5888087,Gold Standard,2,'no','no',0.0,1.0,1.54,0.0,2,0.75,yes
5888088,Gold Standard,3,'no','no',0.0,1.0,2.34,0.0,0,1.0,yes
5888089,Gold Standard,2,'no','no',0.0,1.0,1.7,0.0,0,1.0,yes
5888090,Gold Standard,1,'no','no',0.0,1.0,0.82,0.0,0,1.0,yes
5888091,In Progress,7,,'yes',,0.571,3.28,2.32,4,0.5,no


In [17]:
# Give the crowd's agreement column an explicit name so it cannot be confused
# with 'Expert Agreement'. rename() skips labels that are not present, so
# re-running this cell is harmless.
reduced = reduced.rename(
    columns={'Agreement': 'Crowd Agreement'},
)

---
### Brief Analysis

In [11]:
# Cases where the experts were split.
# NOTE(review): selected through Labeling State == 'In Progress', which appears to
# be a proxy for an expert tie — confirm it matches Expert Agreement == 0.5.
is_in_progress = reduced['Labeling State'] == 'In Progress'
expert_ties = reduced[is_in_progress]

# Cases where the crowd was split exactly in half.
is_crowd_split = reduced['Crowd Agreement'] == 0.5
crowd_ties = reduced[is_crowd_split]

# Average confidence of one group on the cases where the other group was split.
crowd_agreement = expert_ties['Crowd Agreement'].mean()
expert_agreement = crowd_ties['Expert Agreement'].mean()

print(f'{len(crowd_ties)} instances of split crowd opinions')
print(f'{len(expert_ties)} instances of split expert opinions')
print(f'{crowd_agreement} crowd agreement when experts were split')
print(f'{expert_agreement} expert agreement when crowd was split')

1508 instances of split crowd opinions
3000 instances of split expert opinions
0.8268833333333334 crowd agreement when experts were split
0.7514091511936339 expert agreement when crowd was split


- Crowd has split opinions less often
- The crowd is more confident when the experts are split than the experts are when the crowd is split

In [23]:
# Fraction of cases where the crowd's majority label matched the expert label
# (the mean of a boolean column is the share of True values).
crowd_matches = reduced['Consensus'] == 'yes'
crowd_agree = crowd_matches.mean()

# Average share of experts that sided with the expert majority.
expert_agree = reduced['Expert Agreement'].mean()

print(f'Crowd agrees with expert consensus {round(crowd_agree*100,1)}% of the time')
print(f'Experts agree with the expert consensus {round(expert_agree*100,1)}% of the time')

Crowd agrees with expert consensus 69.2% of the time
Experts agree with the expert consensus 77.8% of the time


This is not the result we were hoping for, but it could be skewed by instances where the experts or the crowd are undecided. Right now these instances are counted as disagreements between the crowd and the experts.