Author of code: William Godel 

Date: 07/02

Purpose: to generate alternative data distributions

### Data IN: 

test_set.p

test_set_noncovid.p

test_set_covid.p

article_dic.p

article_dic_covid.p

article_dic_noncovid.p

article_all.p

article_all_covid.p

article_all_noncovid.p

response_dic.p

response_dic_covid.p

response_dic_noncovid.p

### Data OUT:

test_data_98.csv

test_data_90.csv

test_data_75.csv

test_data_10.csv

test_data_98_highpol.csv

test_data_90_highpol.csv

test_data_75_highpol.csv

test_data_10_highpol.csv



Machine: my laptop or iMac

## Alternative Simulations for the test set

## This notebook creates alternative test sets for testing in different environments

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import operator
from tqdm import tqdm
from scipy import stats
import copy
from collections import Counter
from scipy.stats import multinomial
import random
import pickle

import statsmodels.api as sm

import seaborn as sns
sns.set(style="ticks", color_codes=True)

## Adjust your master directory below.
# NOTE(review): `laptop` is not referenced by any other cell visible in this
# notebook; presumably it was meant to select a machine-specific directory.
# Confirm before relying on it or removing it.
laptop = True

# Project-local path configuration. `data_pickles` and `prepared_data` are used
# by later cells but never defined in the notebook, so they presumably come from
# this star import - TODO confirm and import them by name.
from path import *

In [3]:
def draw_sample(df, art, num = 1):
    """Bootstrap `num` respondent rows from `df` and flatten them into one list.

    Parameters
    ----------
    df : pandas.DataFrame
        Respondent rows for one article. Needs the columns 'eval_col',
        'eval_likert', 'Q_Ideology' and 'pol_knowledge', plus 'tot_crt'
        whenever `art` is 199 or lower.
    art : number
        Article number. For values above 199 every CRT entry is returned as NaN.
    num : int, default 1
        How many rows to draw (with replacement).

    Returns
    -------
    list
        Five back-to-back segments of length `num`, in this order:
        'eval_col' values, 'eval_likert' values, numeric ideology,
        'pol_knowledge' values, 'tot_crt' values.

    Raises
    ------
    KeyError
        If an ideology label is not one of the eight known labels.
    """
    # Ideology labels on a signed scale: liberal < 0 < conservative.
    # "Haven't thought much about it" is scored like a moderate (for now).
    ideology_scale = {
        'Extremely Liberal': -3,
        'Liberal': -2,
        'Slightly Liberal': -1,
        'Moderate: Middle of the road': 0,
        "Haven't thought much about it": 0,
        'Slightly Conservative': 1,
        'Conservative': 2,
        'Extremely Conservative': 3,
    }

    # Work on a copy so the CRT overwrite below never touches the caller's frame.
    drawn = df.sample(n = num, replace=True).copy()

    flat = []
    for column in ('eval_col', 'eval_likert'):
        flat += list(drawn.loc[:, column])

    flat += [ideology_scale[label] for label in list(drawn.loc[:, 'Q_Ideology'])]
    flat += list(drawn.loc[:, 'pol_knowledge'])

    # Articles numbered above 199 get the CRT score blanked out.
    if art > 199:
        drawn['tot_crt'] = np.nan

    flat += list(drawn.loc[:, 'tot_crt'])

    return flat

In [4]:
# Test-set collections (general, non-COVID, COVID); later cells iterate over
# them and use each element as an article key.
# Files are opened with `with` so every handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by
# this project's own pipeline.
with open(data_pickles + "test_set.p", "rb") as pickle_file:
    test_set = pickle.load(pickle_file)

with open(data_pickles + "test_set_noncovid.p", "rb") as pickle_file:
    test_set_noncovid = pickle.load(pickle_file)

with open(data_pickles + "test_set_covid.p", "rb") as pickle_file:
    test_set_covid = pickle.load(pickle_file)

In [5]:
# Per-article records keyed by article. Later cells read entry 0 of each record
# (compared against "FM") and unpack each record into four fields.
# Files are opened with `with` so every handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by
# this project's own pipeline.
with open(data_pickles + 'article_dic.p', "rb") as pickle_file:
    article_dic = pickle.load(pickle_file)

with open(data_pickles + 'article_dic_covid.p', "rb") as pickle_file:
    article_dic_covid = pickle.load(pickle_file)

with open(data_pickles + 'article_dic_noncovid.p', "rb") as pickle_file:
    article_dic_noncovid = pickle.load(pickle_file)

In [6]:
# Per-article fact-checker values keyed by article; later cells append them to
# each simulated row.
# Files are opened with `with` so every handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by
# this project's own pipeline.
with open(data_pickles + 'article_all.p', "rb") as pickle_file:
    article_all = pickle.load(pickle_file)

with open(data_pickles + 'article_all_covid.p', "rb") as pickle_file:
    article_all_covid = pickle.load(pickle_file)

with open(data_pickles + 'article_all_noncovid.p', "rb") as pickle_file:
    article_all_noncovid = pickle.load(pickle_file)

# Fold the COVID and non-COVID entries into the general dictionary so a single
# lookup table serves every article.
article_all.update(article_all_covid)
article_all.update(article_all_noncovid)

In [7]:
# Per-article respondent data keyed by article; later cells pass each value to
# draw_sample as a DataFrame.
# Files are opened with `with` so every handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load pickles produced by
# this project's own pipeline.
with open(data_pickles + "response_dic.p", "rb") as pickle_file:
    response_dic = pickle.load(pickle_file)

with open(data_pickles + "response_dic_covid.p", "rb") as pickle_file:
    response_dic_covid = pickle.load(pickle_file)

with open(data_pickles + "response_dic_noncovid.p", "rb") as pickle_file:
    response_dic_noncovid = pickle.load(pickle_file)

# Fold the COVID and non-COVID entries into the general dictionary so a single
# lookup table serves every article.
response_dic.update(response_dic_covid)
response_dic.update(response_dic_noncovid)

In [8]:
# Split every test-set article into "false" and "not false".
# An article counts as false when entry 0 of its record equals "FM".
False_set = set()
Not_false_set = set()

# Each test set is classified with its own (not yet merged) article dictionary,
# in the order general -> COVID -> non-COVID.
test_sets_with_records = (
    (test_set, article_dic),
    (test_set_covid, article_dic_covid),
    (test_set_noncovid, article_dic_noncovid),
)

for article_ids, records in test_sets_with_records:
    for article in article_ids:
        destination = False_set if records[article][0] == "FM" else Not_false_set
        destination.add(article)

# Only now merge the dictionaries, so later cells can look up any article in one place.
article_dic.update(article_dic_covid)
article_dic.update(article_dic_noncovid)

# Lists are needed because np.random.choice draws from sequences.
False_list = list(False_set)
Not_false_list = list(Not_false_set)


In [9]:
# Column layout of one simulated row:
#   five blocks of `num_resp` per-respondent columns,
#   two blocks of `num_fc` per-fact-checker columns,
#   then the article number and the target column(s).

num_resp = 60

num_fc = 6

types = ['resp_cat_', "resp_veracity_", "ideology_resp_", "pol_know_", "crt_", "fc_cat_", "fc_veracity_"]

# The first five prefixes repeat once per respondent, the last two once per fact checker.
block_sizes = [num_resp] * 5 + [num_fc] * 2

col_list_df = [
    prefix + str(position)
    for prefix, block_size in zip(types, block_sizes)
    for position in range(block_size)
]

col_list_df.append("article_num")

col_list_sets = ["mode"]

col_list_df.extend(col_list_sets)


In [10]:
## Simulated test data - 98% Not False
np.random.seed(42)

percentage_true = .98

num_responses = 60

num_training_obs = len(test_set)*500

num_false_obs = int((1 - percentage_true)*num_training_obs)
num_notfalse_obs = int(percentage_true*num_training_obs)

all_sims = []

# All not-false rows are simulated first, then all false rows. The order matters
# because both phases draw from the same seeded random stream.
for article_pool, rows_wanted in ((Not_false_list, num_notfalse_obs),
                                  (False_list, num_false_obs)):

    for this_iter in range(rows_wanted):

        # pick one article from the pool
        the_article_num = np.random.choice(article_pool, size = 1, replace = False)[0]

        # respondent data for that article
        this_df = response_dic[the_article_num]

        # article record; only mode_est is used, the other fields are unpacked but ignored
        mode_est, _ , date, art_type = article_dic[the_article_num]

        # bootstrap a crowd of respondents
        resp_list = draw_sample(this_df, the_article_num, num = num_responses)

        # fact checker information
        fc_list = article_all[the_article_num]

        # row layout: respondents, fact checkers, article number, target
        this_sim = [*resp_list, *fc_list, int(the_article_num), mode_est]

        all_sims.append(this_sim)


# Shuffle so the two classes are interleaved, then renumber the rows.
test_data_98 = (
    pd.DataFrame(all_sims, columns = col_list_df)
    .sample(frac = 1)
    .reset_index(drop = True)
)

test_data_98.to_csv(prepared_data + "test_data_98.csv", index = False)

In [11]:
## Simulated test data - 90% Not False
np.random.seed(42)

percentage_true = .90

num_responses = 60

num_training_obs = len(test_set)*500

# Row counts per class. `1 - .90` evaluates to 0.09999999999999998, so truncating
# `(1 - percentage_true) * num_training_obs` with int() silently dropped one false
# row. Round the not-false count and derive the false count by subtraction so the
# two always add up to num_training_obs.
num_notfalse_obs = int(round(percentage_true*num_training_obs))
num_false_obs = num_training_obs - num_notfalse_obs

all_sims = []

# All not-false rows are simulated first, then all false rows. The order matters
# because both phases draw from the same seeded random stream.
for article_pool, rows_wanted in ((Not_false_list, num_notfalse_obs),
                                  (False_list, num_false_obs)):

    for this_iter in range(rows_wanted):

        # pick one article from the pool
        the_article_num = np.random.choice(article_pool, size = 1, replace = False)[0]

        # respondent data for that article
        this_df = response_dic[the_article_num]

        # article record; only mode_est is used, the other fields are unpacked but ignored
        mode_est, _ , date, art_type = article_dic[the_article_num]

        # bootstrap a crowd of respondents
        resp_list = draw_sample(this_df, the_article_num, num = num_responses)

        # fact checker information
        fc_list = article_all[the_article_num]

        # row layout: respondents, fact checkers, article number, target
        this_sim = [*resp_list, *fc_list, int(the_article_num), mode_est]

        all_sims.append(this_sim)


# Shuffle so the two classes are interleaved, then renumber the rows.
test_data_90 = (
    pd.DataFrame(all_sims, columns = col_list_df)
    .sample(frac = 1)
    .reset_index(drop = True)
)

test_data_90.to_csv(prepared_data + "test_data_90.csv", index = False)

In [12]:
## Simulated test data - 75% Not False
np.random.seed(42)

percentage_true = .75

num_responses = 60

num_training_obs = len(test_set)*500

num_false_obs = int((1 - percentage_true)*num_training_obs)
num_notfalse_obs = int(percentage_true*num_training_obs)

all_sims = []

# All not-false rows are simulated first, then all false rows. The order matters
# because both phases draw from the same seeded random stream.
for article_pool, rows_wanted in ((Not_false_list, num_notfalse_obs),
                                  (False_list, num_false_obs)):

    for this_iter in range(rows_wanted):

        # pick one article from the pool
        the_article_num = np.random.choice(article_pool, size = 1, replace = False)[0]

        # respondent data for that article
        this_df = response_dic[the_article_num]

        # article record; only mode_est is used, the other fields are unpacked but ignored
        mode_est, _ , date, art_type = article_dic[the_article_num]

        # bootstrap a crowd of respondents
        resp_list = draw_sample(this_df, the_article_num, num = num_responses)

        # fact checker information
        fc_list = article_all[the_article_num]

        # row layout: respondents, fact checkers, article number, target
        this_sim = [*resp_list, *fc_list, int(the_article_num), mode_est]

        all_sims.append(this_sim)


# Shuffle so the two classes are interleaved, then renumber the rows.
test_data_75 = (
    pd.DataFrame(all_sims, columns = col_list_df)
    .sample(frac = 1)
    .reset_index(drop = True)
)

test_data_75.to_csv(prepared_data + "test_data_75.csv", index = False)

In [13]:
## Simulated test data - 10% Not False
np.random.seed(42)

percentage_true = .1

num_responses = 60

num_training_obs = len(test_set)*500

num_false_obs = int((1 - percentage_true)*num_training_obs)
num_notfalse_obs = int(percentage_true*num_training_obs)

all_sims = []

# All not-false rows are simulated first, then all false rows. The order matters
# because both phases draw from the same seeded random stream.
for article_pool, rows_wanted in ((Not_false_list, num_notfalse_obs),
                                  (False_list, num_false_obs)):

    for this_iter in range(rows_wanted):

        # pick one article from the pool
        the_article_num = np.random.choice(article_pool, size = 1, replace = False)[0]

        # respondent data for that article
        this_df = response_dic[the_article_num]

        # article record; only mode_est is used, the other fields are unpacked but ignored
        mode_est, _ , date, art_type = article_dic[the_article_num]

        # bootstrap a crowd of respondents
        resp_list = draw_sample(this_df, the_article_num, num = num_responses)

        # fact checker information
        fc_list = article_all[the_article_num]

        # row layout: respondents, fact checkers, article number, target
        this_sim = [*resp_list, *fc_list, int(the_article_num), mode_est]

        all_sims.append(this_sim)


# Shuffle so the two classes are interleaved, then renumber the rows.
test_data_10 = (
    pd.DataFrame(all_sims, columns = col_list_df)
    .sample(frac = 1)
    .reset_index(drop = True)
)

test_data_10.to_csv(prepared_data + "test_data_10.csv", index = False)

### Generating high political-knowledge versions of these test sets

In [14]:
### High political-knowledge version of each test set

# Names of the per-respondent columns, one list per variable, for respondents 0-59.
respondent_positions = range(0, 60)

ideo_cols = ["ideology_resp_%d" % pos for pos in respondent_positions]
resp_cols = ["resp_cat_%d" % pos for pos in respondent_positions]
ver_cols = ["resp_veracity_%d" % pos for pos in respondent_positions]

pol_cols = ["pol_know_%d" % pos for pos in respondent_positions]
crt_cols = ["crt_%d" % pos for pos in respondent_positions]

# Number of eligible respondents kept per row.
crowd_size_long = 10

def select_crowd_pol(input_array, num_size = 10, threshold = 4):
    """Flag the first `num_size` entries of `input_array` that reach `threshold`.

    Parameters
    ----------
    input_array : iterable
        Scores, one per respondent, scanned in order.
    num_size : int, default 10
        Maximum number of entries to flag.
    threshold : number, default 4
        Minimum score (inclusive) for an entry to be flagged.

    Returns
    -------
    list of bool
        One flag per input entry. True marks a kept entry; fewer than `num_size`
        flags are True when not enough entries reach the threshold.
    """
    keep_list = []
    kept_so_far = 0

    for score in input_array:

        # Once the crowd is full the score is not even looked at.
        is_kept = kept_so_far < num_size and bool(score >= threshold)

        keep_list.append(is_kept)

        if is_kept:
            kept_so_far += 1

    return keep_list

# The four simulated test sets and the file stem used for each derived CSV.
files = [test_data_98, test_data_90, test_data_75, test_data_10]
names = ["test_data_98", "test_data_90", "test_data_75", "test_data_10"]

# Output layout: crowd_size_long columns each for category, veracity and ideology.
# It does not depend on the data set, so it is built once.
col_list = []
col_list.extend(resp_cols[:crowd_size_long])
col_list.extend(ver_cols[:crowd_size_long])
col_list.extend(ideo_cols[:crowd_size_long])


for data_num, data in enumerate(files):

    # One keep-list per row: True for the first crowd_size_long respondents whose
    # pol_know value reaches select_crowd_pol's default threshold.
    mask1 = data[pol_cols].apply(select_crowd_pol, num_size = crowd_size_long, axis = 1)

    df_list_pol = []

    for number, series in data.iterrows():

        mask_list1 = mask1[number]

        df_row_pol = []

        for source_cols in (resp_cols, ver_cols, ideo_cols):

            kept_values = list(series[source_cols][mask_list1])

            # A row can have fewer than crowd_size_long eligible respondents.
            # Pad every segment to full width so veracity and ideology values
            # stay in their own columns instead of sliding left.
            kept_values.extend([np.nan] * (crowd_size_long - len(kept_values)))

            df_row_pol.extend(kept_values)

        df_list_pol.append(df_row_pol)

    pol_knowledge_df = pd.DataFrame(df_list_pol, columns = col_list )
    pol_knowledge_df['mode'] = data['mode']

    # NOTE(review): unlike the base CSVs this export also writes the index column
    # (no index = False). Left unchanged in case downstream readers expect it.
    pol_knowledge_df.to_csv(prepared_data + names[data_num] + "_highpol.csv" )

In [15]:
# Quick check of the file name produced for the first high-knowledge export.
names = ["test_data_" + str(share) for share in (98, 90, 75, 10)]
names[0] + "_highpol.csv"

'test_data_98_highpol.csv'