This notebook analyzes different procedures for identifying fact-checker categorical responses.

It does so specifically for covid and non_covid articles.

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import operator
from tqdm import tqdm
from scipy import stats
import copy
from collections import Counter
from scipy.stats import multinomial
import random
import pickle
from matplotlib_venn import venn3,venn3_circles

import statsmodels.api as sm

import seaborn as sns
sns.set(style="ticks", color_codes=True)

##adjust your master directory below

laptop = True

#in order to extract info from the links
try:
    import tldextract
except ImportError:
    pass

if laptop:
    local_path = '/Users/williamgodel/Google Drive/Grad School/RA/Tucker/'
else:
    local_path = '/Users/wpg205/Google Drive/Grad School/RA/Tucker/'

master_directory = 'crowdsourced-fact-checking'

### Main home for crowd source ML project ###

this_project_home = local_path + master_directory + '/code/full_survey_code/crowd_source_code/'


#Actual survey orig
full_orig = '/data/full_survey_control_orig'

#actual survey clean
full_clean = '/data/full_survey_control_clean'

figures = '/figures'

pickles = local_path+ master_directory +  '/code/pickles/'

#Codebook and outside data
codebook_outside = '/data/codebooks_outside_data'

full_clean_path = local_path + master_directory + full_clean

full_clean_headline  = local_path + master_directory + '/data/full_survey_headline_clean'

full_clean_article_no_source  = local_path + master_directory + '/data/full_survey_standard_article_clean'

#path to use to store clean data round one 
full_data_path = local_path + master_directory + full_orig

#path to use to store clean data from round two
full_data_path_clean = local_path + master_directory + full_clean
 
#path to store figures in
figures_path = local_path + master_directory + figures

my_figs = figures_path + '/will_figs'

#path to store pickles in 
pickles_path = local_path + master_directory + pickles

#data folder
data_path = local_path + master_directory + "/data"

%config InlineBackend.figure_format = 'retina'

#Evidence Experiement clean
evidence_exp_clean = '/data/full_survey_Search_Evidence_Exp_clean'

#Location for experiment evidence clean
evidence_exp_data_path_clean = local_path + master_directory + evidence_exp_clean

#path to use to store clean data from round two
paid_clean = '/data/full_survey_pay_clean'

full_data_pay_clean = local_path + master_directory + paid_clean

os.chdir(full_data_path)

In /Users/williamgodel/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/williamgodel/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/williamgodel/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /Users/williamgodel/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/williamgodel/anaconda3

In [2]:
# Loads the two covid pickles from the `pickles` folder.
# Each file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that were produced by this project.
with open(pickles + 'article_all_covid.p', "rb") as pickle_file:
    article_all_covid = pickle.load(pickle_file)

with open(pickles + 'article_dic_covid.p', "rb") as pickle_file:
    article_dic_covid = pickle.load(pickle_file)


In [3]:
def _leave_one_out_label(raw_responses):
    """Return the leave-one-out label for one article's responses.

    Only the first six entries are considered, and they are cut back from the
    end until no ``None`` remains. The remaining answers are tallied:

    * a single distinct answer -> that answer;
    * top count minus one <= runner-up count -> ``"S"`` (taking one vote away
      from the leader would let the runner-up tie or overtake it);
    * otherwise -> the answer with the (necessarily unique) highest count.

    NOTE(review): if no answers remain after trimming, this raises IndexError.
    """
    usable = raw_responses[:6]
    while None in usable:
        usable = usable[:-1]

    tally = Counter(usable)
    if len(tally) == 1:
        return next(iter(tally))

    # most_common() lists (answer, count) pairs from highest to lowest count.
    ranked = tally.most_common()
    top_answer, top_votes = ranked[0]
    runner_up_votes = ranked[1][1]

    if top_votes - 1 <= runner_up_votes:
        return "S"
    return top_answer


out_list = []   # one label per article, in iteration order
leave_dic = {}  # label -> list of article keys that received that label

for this_key in article_all_covid.keys():
    label = _leave_one_out_label(article_all_covid[this_key])
    out_list.append(label)
    leave_dic.setdefault(label, []).append(this_key)
        
    
        
    
    
    

In [5]:
#  article_dic
# 1: ('CND', 2.17, 'nov 13', 1)

# Row format built below (matches col_list):
# date, article_type, article_number, mode, robust_mode (True, False or None),
# followed by every entry of article_all_covid[key] (FC1..FC6, FC1_likert..FC6_likert)

# Articles whose leave-one-out label is 'f'. `.get` avoids a KeyError when no
# article received that label.
false_leave_one_out = set(leave_dic.get('f', []))

# Articles whose leave-one-out label is 't'.
true_leave_one_out = set(leave_dic.get('t', []))


all_list = []

for this_key in article_all_covid.keys():

    # The second tuple entry is not used here. It is bound to a named
    # throwaway rather than `_`, which IPython uses for the last cell output.
    mode_est, _unused, date, art_type = article_dic_covid[this_key]

    if mode_est in ("CND", "No Mode!"):
        # No usable mode, so robustness is undefined.
        robust_mode = None
    else:
        # NOTE(review): this only checks that the article has a leave-one-out
        # label of 'f' or 't'; it does not check that the label agrees with
        # mode_est - confirm that is intended.
        robust_mode = (this_key in false_leave_one_out or this_key in true_leave_one_out)

    output_list = [date, art_type, this_key, mode_est, robust_mode]
    # Assumes each article carries exactly 12 entries so the row matches the
    # 17 names in col_list - TODO confirm.
    output_list.extend(article_all_covid[this_key])

    all_list.append(output_list)


col_list = ['date', 'article_type', "article_number", "mode", "robust_mode",
            "FC1", "FC2", "FC3", "FC4", "FC5", "FC6",
            "FC1_likert", "FC2_likert", "FC3_likert", "FC4_likert", "FC5_likert", "FC6_likert"]
all_data = pd.DataFrame(all_list, columns = col_list)

In [7]:
# Write the per-article table into the clean survey folder. The index is
# written as well (to_csv default).
covid_csv_path = full_clean_path + "/fact_checkers_byarticle_all_sets_covid.csv"
all_data.to_csv(covid_csv_path)

In [6]:
# Preview the first five rows of the per-article table.
all_data.head(n=5)

Unnamed: 0,date,article_type,article_number,mode,robust_mode,FC1,FC2,FC3,FC4,FC5,FC6,FC1_likert,FC2_likert,FC3_likert,FC4_likert,FC5_likert,FC6_likert
0,May 28,1,200,FM,True,f,f,f,c,f,c,2,2,3,3,2,2
1,May 28,2,201,FM,True,c,t,f,f,f,f,3,5,3,2,1,2
2,May 28,3,202,FM,True,f,t,f,t,f,f,4,5,5,6,4,3
3,May 28,4,203,T,True,t,t,t,t,t,t,7,6,7,6,6,7
4,May 28,5,204,T,True,t,t,t,t,t,t,7,6,7,7,7,7
