Author of code: William Godel 

Date: 07/02

Purpose: convert the additional non-COVID survey data into simulated respondent crowds

## Data IN: 

non_Covid_fact_checkers_summary_clean.csv
non_Covid_fact_checkers_clean.csv
full_survey_noncovid_control_clean.csv

## Data OUT: 

### Data tables

train_data_large_noncovid.csv

val_data_large_noncovid.csv

test_data_large_noncovid.csv


### dictionaries

article_all_noncovid.p

article_dic_noncovid.p

response_dic_noncovid.p

### Sets 

test_set_noncovid.p

val_set_noncovid.p

train_set_noncovid.p


Machine: My laptop or Imac

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import operator
from tqdm import tqdm
from scipy import stats
import copy
from collections import Counter
from scipy.stats import multinomial
import random
import pickle

import statsmodels.api as sm

import seaborn as sns
sns.set(style="ticks", color_codes=True)

##adjust your master directory below
from path import *

In [2]:
# Load the fact-checker summary, the individual fact-checker answers and the
# control-group survey; each file uses its first CSV column as the index.
answers = pd.read_csv(source_data + "non_Covid_fact_checkers_summary_clean.csv", index_col = 0)
answers_individual = pd.read_csv(source_data + "non_Covid_fact_checkers_clean.csv", index_col = 0)
control_data = pd.read_csv(source_data + "full_survey_noncovid_control_clean.csv", index_col = 0, low_memory=False)

# Likert columns, one per article slot.
rel_likert = [
    'clean_True_Likert_Scale_1',
    'clean_True_Likert_Scale_2',
    'clean_True_Likert_Scale_3',
    'clean_True_Likert_Scale_4',
    'clean_True_Likert_Scale_5',
]

# Mean fact-checker likert columns, one per article slot.
fact_likert = [
    'Likert_1_mean',
    'Likert_2_mean',
    'Likert_3_mean',
    'Likert_4_mean',
    'Likert_5_mean',
]

# Keep only the first character of each likert answer, blank out the
# not-asked marker ('n') and store the result as a float.
for this_col in rel_likert:
    leading_char = control_data[this_col].str[0]
    control_data[this_col] = leading_char.where(leading_char != 'n').astype(float)

In [3]:
#Identifying if users were correct or not based on model fact checker
#delete if not used later


# Score every respondent answer against the fact-checker mode for the same
# article slot. The result is written to correct_1..correct_5:
#   1   -> the respondent's answer text contains the phrase mapped to the mode
#   0   -> it does not
#   NaN -> respondent was not asked, or the fact checkers had no mode

# Replace the index with 0..n-1 so the positional loop below can use .loc.
control_data.reset_index(inplace = True, drop = True)

# Respondent categorical evaluation, one column per article slot.
eval_cols = ['clean_Q13_Eval_1',
             'clean_Q13_Eval_2',
             'clean_Q13_Eval_3',
             'clean_Q13_Eval_4',
             'clean_Q13_Eval_5']


# Fact-checker modal evaluation, one column per article slot.
vote_list = ['Eval_1_mode',
             'Eval_2_mode',
             'Eval_3_mode',
             'Eval_4_mode',
             'Eval_5_mode']

# Output columns, one per article slot.
correct_cols = ['correct_1',
               'correct_2',
               'correct_3',
               'correct_4',
               'correct_5']

# Start every slot as missing; only scored answers are overwritten below.
control_data['correct_1'] = np.nan
control_data['correct_2'] = np.nan
control_data['correct_3'] = np.nan
control_data['correct_4'] = np.nan
control_data['correct_5'] = np.nan

# Fact-checker code -> phrase expected inside a matching respondent answer.
answer_dic = {'CND':'Determine', 'T':'factually accurate', 'FM':'Misleading'}

for user in range(control_data.shape[0]):
    
    for judge_num, judge_col in enumerate(eval_cols):
        
        if control_data.loc[user,judge_col] == 'not asked': #if the user didn't answer -> skip it
            continue
        elif control_data.loc[user,vote_list[judge_num]]  == 'No Mode!': #if fact checkers don't agree -> skip it
            continue
        else:
            
            fact_answer = control_data.loc[user,vote_list[judge_num]]
            # NOTE(review): raises KeyError for any mode other than CND/T/FM — confirm none occur.
            user_answer_iftrue = answer_dic[fact_answer]
            
            # Substring test against the respondent's full answer text.
            # NOTE(review): assumes the answer cell is a string (not NaN) — confirm.
            if user_answer_iftrue in control_data.loc[user,judge_col]:
                
                control_data.loc[user,correct_cols[judge_num]] = 1
            
            else:
                
                control_data.loc[user,correct_cols[judge_num]] = 0


In [4]:
# Political-knowledge score: how many of the four knowledge questions were
# answered with the expected response (an integer from 0 to 4).
pk_expected = {
    'Q_PK_1': "Democratic Party",
    'Q_PK_3': "Nancy Pelosi",
    'Q_PK_6': "Prime Minister of the United Kingdom",
    'Q_PK_7': "Michael Pompeo",
}

control_data['pol_knowledge'] = sum(
    control_data[question].isin([expected]) * 1
    for question, expected in pk_expected.items()
)

#control_data['tot_crt'] = control_data['Q_1st_CRT'].str.contains("2", regex = False, case = False)*1 + control_data['Q_2nd_CRT'].isin(["7"])*1 + control_data['Q_3rd_CRT'].str.contains("emily", regex = False, case = False)*1 + control_data['Q_4th_CRT'].str.contains("0|no dirt|zero|none|nothing", regex = True, case = False)*1


In [5]:
# Creating an article dictionary
# Every (day, article slot) pair gets a consecutive article number.

# First article number; numbering runs 301, 302, ... in day-then-slot order.
total_count = 301

#storing article level information by article number
article_dic = {}

#article number maps to 
#fact checker eval, the day, and the type (1,2,3,4,5)
#i.e. (mode evaluation, mean likert rounded to 2 decimals, day, slot 1-5)
        
# Per-day mean of the respondent likert columns.
# NOTE(review): control_agg / these_days are not used again in the cells shown here.
control_agg = control_data.groupby(['day'])[rel_likert].mean()

these_days = set(control_agg.index)

# The index of `answers` supplies the day stored in each entry.
for this_it in answers.index:
    
    for col_num, col_mode in enumerate(vote_list):
        
        fact_eval = answers.loc[this_it,col_mode]
        
        fact_veracity = np.round(answers.loc[this_it,fact_likert[col_num]], decimals = 2)
        
        # col_num is 0-based; the stored slot is 1-based.
        article_dic[total_count] = (fact_eval,fact_veracity,this_it,col_num + 1)
        
        total_count += 1

In [6]:
# a useful function to convert answers
def convert_answer(a_str):
    """Translate a free-text categorical evaluation into a one-letter code.

    The match is a case-insensitive substring test, checked in this order:

    * contains "are evaluating is factually accurate" -> "t"
    * contains "true, false, or misleading"           -> "c"
    * contains "misleading and/or false"              -> "f"

    Args:
        a_str: the answer text. Non-string values (for example the NaN that
            pandas uses for an empty cell) are accepted and treated as
            "no recognisable answer".

    Returns:
        "t", "c" or "f", or None when nothing matches.
    """
    # Empty cells arrive as float NaN, which has no .lower().
    if not isinstance(a_str, str):
        return None

    lowered = a_str.lower()

    if "are evaluating is factually accurate" in lowered:
        return "t"
    if "true, false, or misleading" in lowered:
        return "c"
    if "misleading and/or false" in lowered:
        return "f"

    return None

In [7]:
#1: ('CND', 2.17, 'nov 13', 1),

# Collect EVERY fact-checker response per article number. Each entry is a list:
# the categorical codes (padded with None to six slots) followed by the first
# character of each likert answer (also padded to six slots).
article_all = {}

for key in article_dic.keys():

    _, _, day, col = article_dic[key]

    # all individual fact-checker rows for this article's day
    sub_ind = answers_individual[answers_individual.day == day]

    cat_list = [convert_answer(x) for x in sub_ind[eval_cols[col - 1]]]
    ver_resp = list(sub_ind[rel_likert[col - 1]].str[0])

    # pad to six; a negative repeat count yields an empty list, so longer
    # lists are left untouched
    cat_list.extend([None] * (6 - len(cat_list)))
    ver_resp.extend([None] * (6 - len(ver_resp)))

    new_list = cat_list + ver_resp

    article_all[key] = new_list



In [8]:
#saving this
# Persist both article dictionaries. The context managers make sure each
# file is flushed and closed once it has been written.
with open(data_pickles + 'article_all_noncovid.p', "wb") as pickle_file:
    pickle.dump(article_all, pickle_file)

with open(data_pickles + 'article_dic_noncovid.p', "wb") as pickle_file:
    pickle.dump(article_dic, pickle_file)




In [9]:
### Creating a df article dictionary
### basically look up each article num and then pull all the responses

# Map each article number to the control-group rows that evaluated it.
# Each stored frame gains two columns:
#   eval_col    - the respondent's categorical answer as a one-letter code
#   eval_likert - the respondent's likert answer for that article slot
response_dic = {}

# The set of survey days does not change inside the loop, so compute it once.
survey_days = control_data.day.unique()

for key_it in article_dic.keys():

    this_day = article_dic[key_it][2]
    article_col = article_dic[key_it][3]

    # articles whose day has no survey rows get no entry at all
    if this_day not in survey_days:
        continue

    control_data_day = control_data[control_data.day == this_day].copy()

    control_data_day['eval_col'] = control_data_day[eval_cols[article_col - 1]]

    # drop respondents who were not shown this article; copy so the column
    # assignments below never write to a view of the frame above
    control_data_day = control_data_day[control_data_day['eval_col'] != 'not asked'].copy()

    #translate responses
    control_data_day['eval_col'] = control_data_day['eval_col'].apply(convert_answer)

    control_data_day['eval_likert'] = control_data_day[rel_likert[article_col - 1]]

    response_dic[key_it] = control_data_day


In [10]:
# Persist the per-article response frames; the file is closed on exit.
with open(data_pickles + "response_dic_noncovid.p", "wb") as pickle_file:
    pickle.dump(response_dic, pickle_file)

In [11]:
# Total number of respondent rows held across all per-article frames.
all_size = sum(frame.shape[0] for frame in response_dic.values())

all_size

2769

In [12]:
# This function draws a sample answer given a dataframe
#it returns a list that has

#for a total of num
#each categorical evaluation
#likert evaluation
#political score for that individual

def draw_sample(df, num = 1):
    """Draw `num` respondents (with replacement) and flatten their answers.

    `df` must provide the columns 'eval_col', 'eval_likert', 'Q_Ideology'
    and 'pol_knowledge'. The returned flat list has 5 * num entries, grouped
    by field in this order: categorical evaluations, likert evaluations,
    ideology scores (-3 .. 3), political-knowledge scores, CRT scores
    (always NaN here). `df` itself is not modified.
    """
    # ideology label -> signed score (liberal negative, conservative positive)
    ideology_scores = {
        'Extremely Liberal': -3,
        'Liberal': -2,
        'Slightly Liberal': -1,
        'Moderate: Middle of the road': 0,
        "Haven't thought much about it": 0,  # treated like moderate (for now)
        'Slightly Conservative': 1,
        'Conservative': 2,
        'Extremely Conservative': 3,
    }

    drawn = df.sample(n = num, replace=True).copy()

    # no CRT score is available, so the column is filled with NaN
    drawn['tot_crt'] = np.nan

    output = []
    output += list(drawn['eval_col'])
    output += list(drawn['eval_likert'])
    output += [ideology_scores[label] for label in drawn['Q_Ideology']]
    output += list(drawn['pol_knowledge'])
    output += list(drawn['tot_crt'])

    return output

# Generating train and test data

## First step is divide articles into Training and test

## false and true are determined by mode of fact checkers

In [13]:
def article_remover(set_input):
    """Return the set of articles whose number modulo 5 lies strictly between 0 and 4.

    For integer article numbers this keeps remainders 1, 2 and 3 and drops
    remainders 4 and 0.
    """
    return {article for article in set_input if 0 < article % 5 < 4}
    

In [14]:
# Bucket article numbers by the fact-checker mode stored in article_dic.
false_list = []
true_list = []
CND_list = []

for this_key in article_dic.keys():

    rating, _, _, _ = article_dic[this_key]

    if rating == 'FM':
        false_list.append(this_key)
    elif rating == 'CND':
        CND_list.append(this_key)
    elif rating == 'T':
        true_list.append(this_key)
    # Any other rating (e.g. 'No Mode!' or a missing value) is deliberately
    # left out of all three buckets rather than being counted as true.

# Apply the article filter; after this the *_list names hold sets.
false_list = article_remover(false_list)
true_list = article_remover(true_list)
CND_list = article_remover(CND_list)

false_set = set(false_list)
true_set = set(true_list)
cnd_set  = set(CND_list)
        
        

In [15]:
### NOTE - the correct_* columns must already exist on control_data
correct_cols = [
    'correct_1',
    'correct_2',
    'correct_3',
    'correct_4',
    'correct_5',
]

# One per-day Series per article slot: scored answers, correct answers, and
# the share correct (sum / count).
ratio_list = []
count_list = []
sum_list = []

by_day = control_data.groupby('day')

for x in correct_cols:

    count = by_day[x].count()    # non-missing (scored) answers
    sum_tot = by_day[x].sum()    # answers marked 1
    ratio = sum_tot/count

    count_list.append(count)
    sum_list.append(sum_tot)
    ratio_list.append(ratio)

In [16]:
# Creating an article dictionary for the percentage correct by article

vote_list = ['Eval_1_mode',
             'Eval_2_mode',
             'Eval_3_mode',
             'Eval_4_mode',
             'Eval_5_mode']

eval_cols = ['clean_Q13_Eval_1',
             'clean_Q13_Eval_2',
             'clean_Q13_Eval_3',
             'clean_Q13_Eval_4',
             'clean_Q13_Eval_5']

# Numbering must start at 301, the same first article number article_dic
# uses. diff_dic is looked up by those article numbers, so starting at 300
# made every lookup return the entry of the neighbouring article.
total_count = 301

#storing article level information by article number
diff_dic = {}

#article number maps to
#(share of respondents correct, the day, the type (1,2,3,4,5), mode column)

for this_it in answers.index:

    for col_num, col_mode in enumerate(vote_list):

        diff_dic[total_count] = (ratio_list[col_num][this_it], this_it, col_num+1,col_mode)

        total_count += 1
        
        #control_mode = control_data.groupby(['day'])[eval_cols].agg(lambda x: tuple(stats.mode(x)))
        

In [17]:
def article_sorter(a_set, the_dic):
    """Alternate articles between train and test after ordering by score.

    Articles are ordered by `the_dic[article][0]` (ascending, ties keep
    their iteration order). Positions 0, 2, 4, ... go to the train list and
    positions 1, 3, 5, ... go to the test list.

    Returns:
        (test_list, train_list)
    """
    ranked = sorted(a_set, key = lambda article: the_dic[article][0])

    train_list = ranked[0::2]
    test_list = ranked[1::2]

    assert len(set(test_list).intersection(set(train_list))) == 0

    return test_list, train_list

In [18]:
def article_sorter_adv(a_set,percent, the_dic):
    """Split articles into a small hold-out list and a larger remainder.

    Walking `a_set` in its own iteration order, every `step`-th article
    (positions 0, step, 2*step, ...) goes to the hold-out list and the rest
    go to the remainder. Items are then moved between the two lists until
    the hold-out list holds exactly `round(len(a_set) * percent)` articles.

    Args:
        a_set: sized iterable (set or list) of article identifiers.
        percent: fraction of the articles for the hold-out list, in [0, 1].
        the_dic: kept for interface compatibility only. The split has never
            depended on it: the original built a difficulty-sorted list from
            it but then iterated over `a_set` instead.

    Returns:
        (test_list, train_list): the hold-out list first, then the remainder.

    Raises:
        ValueError: if `percent` is outside [0, 1]; larger values used to
            fail with an IndexError from popping an empty list.
    """
    if not 0 <= percent <= 1:
        raise ValueError("percent must be between 0 and 1, got %r" % (percent,))

    articles = list(a_set)
    total = len(articles)

    #percent is for the number to go to the smaller of two sets
    num_small = round(total * percent)

    if num_small == 0:
        return [], articles

    # num_small <= total here, so step is always at least 1
    step = round(total / num_small)

    test_list = [obj for num, obj in enumerate(articles) if num % step == 0]
    train_list = [obj for num, obj in enumerate(articles) if num % step != 0]

    # rounding of step can over- or under-fill the hold-out list; rebalance
    while len(test_list) > num_small:
        train_list.append(test_list.pop())

    while len(train_list) > (total - num_small):
        test_list.append(train_list.pop())

    assert len(set(test_list).intersection(set(train_list))) == 0
    assert len(train_list) + len(test_list) == total

    return test_list, train_list

In [19]:
np.random.seed(42)

# Hold out 20% of the false and of the true articles for test ...
test_false, train_false = article_sorter_adv(false_set,.2,diff_dic)
test_true, train_true = article_sorter_adv(true_set,.2,diff_dic)

# ... then 20% of what remains for validation.
val_false, train_false = article_sorter_adv(train_false,.2,diff_dic)
val_true, train_true = article_sorter_adv(train_true,.2,diff_dic)

test_set = test_false + test_true
train_set = train_false + train_true
val_set = val_false + val_true

#residual articles: kept by article_remover but in neither the false nor the true set
all_articles_set = article_remover(set(article_all.keys())) - set(false_set) - set(true_set)

no_assign_test, no_assign_train = article_sorter_adv(all_articles_set,.2,diff_dic)
no_assign_val, no_assign_train = article_sorter_adv(no_assign_train,.2,diff_dic)

test_set += no_assign_test
train_set += no_assign_train
val_set += no_assign_val

In [20]:
# Persist the three article splits; each file is closed as soon as it is written.
with open(data_pickles + "test_set_noncovid.p", "wb") as pickle_file:
    pickle.dump(test_set, pickle_file)

with open(data_pickles + "val_set_noncovid.p", "wb") as pickle_file:
    pickle.dump(val_set, pickle_file)

with open(data_pickles + "train_set_noncovid.p", "wb") as pickle_file:
    pickle.dump(train_set, pickle_file)

In [21]:
# Sanity check: total number of articles across the test, train and validation splits.
len(test_set) + len(train_set) + len(val_set)

18

In [22]:
# Sanity check: number of articles in false_set.
len(false_set)

8

In [23]:
# Sanity check: number of false_set articles placed in the validation split.
len(val_false)


1

In [24]:
# Sanity check: number of true_set articles placed in the validation split.
len(val_true)

1

In [25]:
# Sanity check: number of false_set articles placed in the test split.
len(test_false)

2

In [26]:
# Sanity check: number of true_set articles placed in the test split.
len(test_true)

2

In [27]:
# Scratch arithmetic: adds six literal counts (the stray unary "+" before the first 2 is a no-op).
5 + 5+ 1+ 1+ + 2 +2

16

# Generating training and test set
## Here, generating column names for the fake data I am constructing

In [28]:
# Column names for the simulated data tables.

num_resp= 60   # simulated respondents per row

num_fc = 6     # fact-checker slots per row

types = ['resp_cat_', "resp_veracity_","ideology_resp_","pol_know_","crt_", "fc_cat_", "fc_veracity_"]

col_list_df = []

# respondent fields: one run of num_resp columns per prefix, in order
for prefix in types[:5]:
    col_list_df.extend(prefix + str(it) for it in range(num_resp))

# fact-checker fields: one run of num_fc columns per prefix
for prefix in types[5:]:
    col_list_df.extend(prefix + str(it) for it in range(num_fc))

col_list_df.append("article_num")

# outcome / label columns
col_list_sets = ["mode","in_robust_mode","no_false", \
           "all_true_set", "any_true_set","no_true","all_false_set","any_false_set"]

col_list_df.extend(col_list_sets)


# How to use training and test sets

1. All articles that are in one of the six sets (three for false and three for true) are cleanly split between training and test, with unassigned articles also randomly split.

2. For a specific balance in training/test, they can then be resampled at the appropriate weights given whatever criteria (mode, robust mode, etc) is being used, to create a more balanced training and test set.

# Constructing Training Set

In [29]:
## Next, generating training data: 500 simulated crowds per training article
## on average (the article for each row is picked at random).
np.random.seed(42)

num_responses = num_resp
num_training_obs = len(train_set)*500

all_sims = []

for this_iter in range(num_training_obs):

    the_article_num = np.random.choice(train_set, size = 1, replace = False)[0]

    #the responses
    this_df = response_dic[the_article_num]

    #fc response
    mode_est, _ , date, art_type = article_dic[the_article_num]

    #drawing a sample
    resp_list = draw_sample(this_df, num = num_responses)

    #getting fact checker information
    fc_list = article_all[the_article_num]

    # labels: the fact-checker mode, a None robust-mode flag and six NaN
    # placeholders for the true/false set indicators
    output_list = [mode_est, None] + [np.nan] * 6

    this_sim = list(resp_list) + list(fc_list) + [int(the_article_num)] + output_list

    all_sims.append(this_sim)

#training data
train_data = pd.DataFrame(all_sims, columns = col_list_df)


In [30]:
## Next, generating val data: 500 simulated crowds per validation article
## on average (the article for each row is picked at random).
np.random.seed(42)

num_responses = num_resp
num_training_obs = len(val_set)*500   # number of validation rows

all_sims = []

for this_iter in range(num_training_obs):

    the_article_num = np.random.choice(val_set, size = 1, replace = False)[0]

    #the responses
    this_df = response_dic[the_article_num]

    #fc response
    mode_est, _ , date, art_type = article_dic[the_article_num]

    #drawing a sample
    resp_list = draw_sample(this_df, num = num_responses)

    #getting fact checker information
    fc_list = article_all[the_article_num]

    # labels: the fact-checker mode, a None robust-mode flag and six NaN
    # placeholders for the true/false set indicators
    output_list = [mode_est, None] + [np.nan] * 6

    this_sim = list(resp_list) + list(fc_list) + [int(the_article_num)] + output_list

    all_sims.append(this_sim)

#validation data
val_data = pd.DataFrame(all_sims, columns = col_list_df)


In [31]:
## Next, generating test data: 500 simulated crowds per test article
## on average (the article for each row is picked at random).
np.random.seed(42)

num_responses = num_resp
num_test_obs = len(test_set)*500

all_sims = []

for this_iter in range(num_test_obs):

    the_article_num = np.random.choice(test_set, size = 1, replace = False)[0]

    #the responses
    this_df = response_dic[the_article_num]

    #fc response
    mode_est, _ , date, art_type = article_dic[the_article_num]

    #drawing a sample
    resp_list = draw_sample(this_df, num = num_responses)

    #getting fact checker information
    fc_list = article_all[the_article_num]

    # labels: the fact-checker mode, a None robust-mode flag and six NaN
    # placeholders for the true/false set indicators
    output_list = [mode_est, None] + [np.nan] * 6

    this_sim = list(resp_list) + list(fc_list) + [int(the_article_num)] + output_list

    all_sims.append(this_sim)

#test data
test_data = pd.DataFrame(all_sims, columns = col_list_df)


In [32]:
#saving data: one CSV per split, written without the row index
for file_name, frame in (
    ("train_data_large_noncovid.csv", train_data),
    ("val_data_large_noncovid.csv", val_data),
    ("test_data_large_noncovid.csv", test_data),
):
    frame.to_csv(prepared_data + file_name, index = False)

