Author of code: William Godel 

Date: 07/02

Purpose: to calculate the heuristic performance of crowds on test data

## Data IN: 

test_data_large.csv

test_data_large_covid.csv

test_data_large_noncovid.csv

train_data_large.csv

val_data_large.csv

train_data_large_covid.csv

val_data_large_covid.csv

train_data_large_noncovid.csv

val_data_large_noncovid.csv


## Data OUT: 


crowd_size_binary_test.p

partisan_crowd_size_binary_test.p

unanimous_crowd_size_binary_test.p

unanimous_partisan_crowd_size_binary_test.p

bayes_crowds_types_test.p

Machine: My laptop or Imac

## This notebook evaluates various transparent heuristic models, applied to the TEST set

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter
import statsmodels.api as sm
from scipy import stats
laptop = True
import pickle

from functions import bayes_binary, bayes_probs,crowd_mode,eval_comp,quick_graph, full_evaluation, crowd_mode, see_data, array_return, count_ideo_func


from path import *

In [2]:
#train_data = pd.read_csv("train_data_large.csv")
#test_data = pd.read_csv("test_data_large.csv")

In [3]:

# Load the prepared test splits (full, COVID-only, non-COVID).
test_data = pd.read_csv(prepared_data + "test_data_large.csv")
test_data_covid = pd.read_csv(prepared_data + "test_data_large_covid.csv")
test_data_noncovid = pd.read_csv(prepared_data + "test_data_large_noncovid.csv")

# Stack the three test frames row-wise.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead. ignore_index=True gives the stacked frame a clean
# 0..n-1 index instead of repeating each source frame's row labels.
combined_data_fakesource_test = pd.concat(
    [test_data, test_data_covid, test_data_noncovid],
    ignore_index=True,
)

# Load the prepared train and validation splits (full, COVID-only, non-COVID).
train_data = pd.read_csv(prepared_data + "train_data_large.csv")
val_data = pd.read_csv(prepared_data + "val_data_large.csv")

train_data_covid = pd.read_csv(prepared_data + "train_data_large_covid.csv")
val_data_covid = pd.read_csv(prepared_data + "val_data_large_covid.csv")

train_data_noncovid = pd.read_csv(prepared_data + "train_data_large_noncovid.csv")
val_data_noncovid = pd.read_csv(prepared_data + "val_data_large_noncovid.csv")

# Stack all train and validation frames, in the same order as before, into one
# frame with a fresh 0..n-1 index (replaces the append chain + reset_index).
combined_data_fakesource_train = pd.concat(
    [
        train_data,
        val_data,
        train_data_covid,
        val_data_covid,
        train_data_noncovid,
        val_data_noncovid,
    ],
    ignore_index=True,
)




In [4]:
## Identifying fake source
# Rows are kept when article_num modulo 5 is 1, 2 or 3 (between() is inclusive
# on both ends). NOTE(review): presumably the article numbering places the
# questionable-source articles at these positions - confirm against the
# data-preparation step.

# Training side (train + validation frames combined).
mod_series = combined_data_fakesource_train["article_num"].mod(5)
train_data_fakesource = (
    combined_data_fakesource_train.loc[mod_series.between(1, 3)]
    .copy()
    .reset_index(drop=True)
)

# Test side: same rule, then renumber the surviving rows from zero.
mod_serie_test = combined_data_fakesource_test["article_num"].mod(5)
test_data_fakesource = (
    combined_data_fakesource_test.loc[mod_serie_test.between(1, 3)]
    .copy()
    .reset_index(drop=True)
)


# Crowd size simulation

## Only Questionable sources

# Categories

For false according to FC:

1. fcmodefalse = pure mode false or not
2. rmfalse = robust mode false or not
3. allfalse = all false or not
4. anyfalse = any false set or not
5. notrue = no true said against it or not

## and then reverse for true

Crowd Mode types:

1. origmode = pure crowd mode
2. nocnd = cnd discarded
3. modefm = all cnd counted as fm 

In [5]:
### Names of the rule combinations evaluated in the cells below

# Fact-checker labelling rules; results are produced in this order.
fc_types = [
    "fcmodefalse",
    "rmfalse",
    "allfalse",
    "anyfalse",
    "notrue",
    "alltrue",
    "fcmodetrue",
    "rmtrue",
    "anytrue",
    "nofalse",
]

# Crowd aggregation rules.
crowd_types = ["origmode", "nocnd", "modefm"]

# Lookup from a fact-checker rule name to an associated name.
# NOTE(review): the values look like column names in the prepared data - confirm.
# Only six of the rules in fc_types have an entry here.
types_to_cols = dict(
    allfalse="all_false_set",
    anyfalse="any_false_set",
    notrue="no_true",
    alltrue="all_true_set",
    anytrue="any_true_set",
    nofalse="no_false",
)


In [6]:
# Result container: one empty list per "<fc rule>_<crowd rule>" key. Each list
# receives one full_evaluation result per crowd size, in increasing size order.
crowd_size_binary = {
    fc_type_it + "_" + crowd_type_it: []
    for fc_type_it in fc_types
    for crowd_type_it in crowd_types
}

# range() excludes its upper bound, so crowd sizes 1 through 25 are evaluated.
min_crowd = 1
max_crowd = 26

data_df = test_data_fakesource

for crowd_size in range(min_crowd, max_crowd):

    # Same procedure for every fact-checker rule.
    for this_type in fc_types:

        # The crowd is the first `crowd_size` columns of the frame.
        # NOTE(review): assumes the leading columns hold the individual
        # responses - confirm against the prepared-data layout.
        data_subset = data_df.iloc[:, :crowd_size]

        # Evaluate this crowd under each crowd aggregation rule in turn.
        for crowd_rule in crowd_types:
            all_info = full_evaluation(data_subset, data_df, this_type, crowd_rule)
            crowd_size_binary[this_type + "_" + crowd_rule].append(all_info)

        

In [7]:

# Persist the crowd-size results. The context manager guarantees the file is
# flushed and closed even if pickling fails (the bare open() call leaked it).
with open(heuristic_data + 'crowd_size_binary_test.p', "wb") as out_file:
    pickle.dump(crowd_size_binary, out_file)

# Partisan crowds

### Crowds of 3,6, 9, 12, 15

for each row - find the partisan cols
then count answers along the axis


notes:

make a note for jonathan

follow up on misleading or false 

In [8]:
# First, identify rows that are eligible for partisan crowds (up to 15).

# Per-respondent column names for slots 0 through 29: ideology and response.
ideo_cols = ["ideology_resp_{}".format(slot) for slot in range(30)]
resp_cols = ["resp_cat_{}".format(slot) for slot in range(30)]

# Tally the ideology values in each row, then let count_ideo_func decide which
# rows to keep.
# NOTE(review): count_ideo_func is presumed to return a boolean per row
# (enough respondents of each ideology) - confirm in functions.py.
col_ideo_counts = test_data_fakesource[ideo_cols].apply(Counter, axis=1)
ideo_keeper_rows = col_ideo_counts.apply(count_ideo_func)

# Keep only the flagged rows and renumber them from zero.
test_data_fakesource_partisan = (
    test_data_fakesource[ideo_keeper_rows].copy().reset_index(drop=True)
)

In [9]:
# Result container: one empty list per "<fc rule>_<crowd rule>" key. Each list
# receives one full_evaluation result per crowd size, in increasing size order.
partisan_crowd_size_binary = {
    fc_type_it + "_" + crowd_type_it: []
    for fc_type_it in fc_types
    for crowd_type_it in crowd_types
}

# range() excludes its upper bound, so values 1 through 5 are used.
# NOTE(review): crowd_size is handed to array_return as `length`; given the
# section heading (crowds of 3, 6, 9, 12, 15) it presumably counts respondents
# per ideology group, not in total - confirm.
min_crowd = 1
max_crowd = 6

data_df = test_data_fakesource_partisan

for crowd_size in range(min_crowd, max_crowd):

    # Same procedure for every fact-checker rule.
    for this_type in fc_types:

        # Build, row by row, the selection of respondent slots to keep.
        # NOTE(review): this selection does not depend on this_type; it is
        # rebuilt for every rule on purpose here, because array_return may not
        # be deterministic - confirm before hoisting it out of this loop.
        mask1 = data_df[ideo_cols].apply(array_return, length=crowd_size, axis=1)
        new_df = pd.DataFrame.from_dict(dict(zip(mask1.index, mask1.values))).T

        # Hide every response that was not selected: masked_array hides entries
        # whose mask is True, hence the inversion of the keep-selection.
        response_df = data_df[resp_cols].to_numpy()
        masked_responses = np.ma.masked_array(data=response_df, mask=~new_df.to_numpy())

        # Tally the answers in each row.
        answers_counter = pd.DataFrame(masked_responses).apply(Counter, axis=1)
        data_subset = answers_counter

        # Evaluate the tallies under each crowd aggregation rule in turn.
        for crowd_rule in crowd_types:
            all_info = full_evaluation(
                data_subset, data_df, this_type, crowd_rule, do_count1=True
            )
            partisan_crowd_size_binary[this_type + "_" + crowd_rule].append(all_info)

        

In [10]:
# Persist the partisan-crowd results. The context manager guarantees the file
# is flushed and closed even if pickling fails (the bare open() call leaked it).
with open(heuristic_data + 'partisan_crowd_size_binary_test.p', "wb") as out_file:
    pickle.dump(partisan_crowd_size_binary, out_file)

# Unanimity Rule
# first - random crowds

In [11]:
# Result container: one empty list per "<fc rule>_unanimous" key. Each list
# receives one full_evaluation result per crowd size, in increasing size order.
unanimous_crowd_size_binary = {
    fc_type_it + "_unanimous": [] for fc_type_it in fc_types
}

# range() excludes its upper bound, so crowd sizes 2 through 9 are evaluated.
min_crowd = 2
max_crowd = 10

data_df = test_data_fakesource

for crowd_size in range(min_crowd, max_crowd):

    # Same procedure for every fact-checker rule.
    for this_type in fc_types:

        # The crowd is the first `crowd_size` columns of the frame.
        data_subset = data_df.iloc[:, :crowd_size]

        # "unan" is passed as the crowd rule for this section.
        all_info = full_evaluation(data_subset, data_df, this_type, "unan")
        unanimous_crowd_size_binary[this_type + "_unanimous"].append(all_info)

        

In [12]:
# Persist the unanimity-rule results. The context manager guarantees the file
# is flushed and closed even if pickling fails (the bare open() call leaked it).
with open(heuristic_data + 'unanimous_crowd_size_binary_test.p', "wb") as out_file:
    pickle.dump(unanimous_crowd_size_binary, out_file)

## Now - partisan crowds but unanimity rule

In [13]:
# Result container: one empty list per "<fc rule>_unanimous" key. Each list
# receives one full_evaluation result per crowd size, in increasing size order.
unanimous_partisan_crowd_size_binary = {
    fc_type_it + "_unanimous": [] for fc_type_it in fc_types
}

# range() excludes its upper bound, so values 1 through 3 are used.
min_crowd = 1
max_crowd = 4  # this is a multiple

# NOTE(review): the majority-rule partisan cell uses
# test_data_fakesource_partisan (rows filtered for partisan diversity), while
# this cell uses the unfiltered test_data_fakesource - confirm this is intended.
data_df = test_data_fakesource

for crowd_size in range(min_crowd, max_crowd):

    # Same procedure for every fact-checker rule.
    for this_type in fc_types:

        # Build, row by row, the selection of respondent slots to keep.
        mask1 = data_df[ideo_cols].apply(array_return, length=crowd_size, axis=1)
        new_df = pd.DataFrame.from_dict(dict(zip(mask1.index, mask1.values))).T

        # Hide every response that was not selected: masked_array hides entries
        # whose mask is True, hence the inversion of the keep-selection.
        response_df = data_df[resp_cols].to_numpy()
        masked_responses = np.ma.masked_array(data=response_df, mask=~new_df.to_numpy())

        # Tally the answers in each row.
        answers_counter = pd.DataFrame(masked_responses).apply(Counter, axis=1)
        data_subset = answers_counter

        # "unan" is passed as the crowd rule for this section.
        all_info = full_evaluation(
            data_subset, data_df, this_type, "unan", do_count1=True
        )
        unanimous_partisan_crowd_size_binary[this_type + "_unanimous"].append(all_info)

        

In [14]:
# Persist the partisan unanimity-rule results. The context manager guarantees
# the file is flushed and closed even if pickling fails (the bare open() call
# leaked it).
with open(heuristic_data + 'unanimous_partisan_crowd_size_binary_test.p', "wb") as out_file:
    pickle.dump(unanimous_partisan_crowd_size_binary, out_file)

# Bayes Rule

In [15]:
# Result container: one empty list per "<fc rule>_bayes" key. Each list
# receives one full_evaluation result per crowd size, in increasing size order.
bayes_crowds_types = {fc_type_it + "_bayes": [] for fc_type_it in fc_types}

# Unlike the earlier sections, both bounds are inclusive here: the loop below
# runs to max_crowd + 1, so crowd sizes 10 through 25 are evaluated.
min_crowd = 10
max_crowd = 25

# The training frame is passed as both leading arguments; the test frame is
# passed separately via test_data.
data_df = train_data_fakesource
test_df = test_data_fakesource

for crowd_size in range(min_crowd, max_crowd + 1):

    for this_type in fc_types:

        # crowd_types[2] is "modefm"; the fact-checker rule is also passed as
        # the `bayes` argument and the crowd size as `num_resp`.
        all_info = full_evaluation(
            data_df,
            data_df,
            this_type,
            crowd_types[2],
            test_data=test_df,
            bayes=this_type,
            num_resp=crowd_size,
        )
        bayes_crowds_types[this_type + "_bayes"].append(all_info)


In [16]:
# Persist the Bayes-rule results. The context manager guarantees the file is
# flushed and closed even if pickling fails (the bare open() call leaked it).
with open(heuristic_data + 'bayes_crowds_types_test.p', "wb") as out_file:
    pickle.dump(bayes_crowds_types, out_file)