Author of code: William Godel 

Date: 07/02

Purpose: take in the prepared data and subset each crowd to its high-political-knowledge respondents. The same subsetting is done for high-CRT crowds, but that data is ultimately not used. 

Data IN: 

train_data_large.csv

val_data_large.csv

train_data_large_covid.csv

val_data_large_covid.csv

train_data_large_noncovid.csv

val_data_large_noncovid.csv

test_data_large.csv

test_data_large_covid.csv

test_data_large_noncovid.csv

Data OUT:

pol_knowledge_df.p

pol_knowledge_df_test.p

pol_knowledge_df_val.p

pol_knowledge_df_train_orig.p


Machine: my laptop or iMac

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter
laptop = True
import pickle

import logging
logging.basicConfig(filename='ML.log',level=logging.DEBUG)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV



from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from functions import count_mode, bayes_probs, bayes_binary

from sklearn.model_selection import train_test_split

from path import *

# Qualified crowds

In [2]:
# Data import
#
# Every split (train / val / test) is read from three CSVs -- the base file
# plus the "covid" and "noncovid" files -- and stacked row-wise into a single
# frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append / Series.append were removed in
# pandas 2.0.
# `prepared_data` is presumably the input-directory prefix supplied by
# `from path import *` -- verify against path.py.
train_data = pd.read_csv(prepared_data + "train_data_large.csv")
val_data = pd.read_csv(prepared_data + "val_data_large.csv")

train_data_covid = pd.read_csv(prepared_data + "train_data_large_covid.csv")
val_data_covid = pd.read_csv(prepared_data + "val_data_large_covid.csv")

train_data_noncovid = pd.read_csv(prepared_data + "train_data_large_noncovid.csv")
val_data_noncovid = pd.read_csv(prepared_data + "val_data_large_noncovid.csv")


# Training rows only; kept separately for the "Pure Train" section.
train_data_large_orig = pd.concat(
    [train_data, train_data_covid, train_data_noncovid], ignore_index = True
)
train_target_large_orig = train_data_large_orig['mode'] == 'FM'

val_data_large = pd.concat(
    [val_data, val_data_covid, val_data_noncovid], ignore_index = True
)
val_target_large = val_data_large['mode'] == 'FM'

# Train + val combined: training rows first, validation rows after, so the
# first len(train_data_large_orig) rows match train_data_large_orig.
train_data_large = pd.concat(
    [train_data_large_orig, val_data_large], ignore_index = True
)
train_target_large = train_data_large['mode'] == 'FM'


test_data = pd.read_csv(prepared_data + "test_data_large.csv")
test_data_covid = pd.read_csv(prepared_data + "test_data_large_covid.csv")
test_data_noncovid = pd.read_csv(prepared_data + "test_data_large_noncovid.csv")

test_data_large = pd.concat(
    [test_data, test_data_covid, test_data_noncovid], ignore_index = True
)
test_target_large = test_data_large['mode'] == 'FM'


### Train + Val combined

In [3]:

# Column-name families: each family has one column per respondent slot,
# with slots numbered 0 through 59.
respondent_slots = range(60)

# Per-respondent ideology, response category and response veracity columns.
ideo_cols = ["ideology_resp_%d" % slot for slot in respondent_slots]
resp_cols = ["resp_cat_%d" % slot for slot in respondent_slots]
ver_cols = ["resp_veracity_%d" % slot for slot in respondent_slots]

# Per-respondent scores that the crowd-selection masks are computed from.
pol_cols = ["pol_know_%d" % slot for slot in respondent_slots]
crt_cols = ["crt_%d" % slot for slot in respondent_slots]


# Number of respondents kept in each qualified crowd.
crowd_size_long = 10

def select_crowd_pol(input_array, num_size = 10, threshold = 4):
    """Flag the first ``num_size`` entries whose score meets ``threshold``.

    Parameters
    ----------
    input_array : iterable of scores, visited in order.
    num_size : maximum number of entries to flag as kept.
    threshold : minimum score (inclusive) for an entry to qualify.

    Returns
    -------
    list of bool, one flag per entry of ``input_array``. An entry is True
    when its score is >= ``threshold`` and fewer than ``num_size`` earlier
    entries were already flagged. If fewer than ``num_size`` entries
    qualify, the list simply contains fewer True values.
    """
    slots_left = num_size
    flags = []

    for score in input_array:
        # The comparison is skipped entirely once the crowd is full.
        chosen = bool(slots_left > 0 and score >= threshold)
        flags.append(chosen)

        if chosen:
            slots_left -= 1

    return flags
    
    
# Build the qualified crowds for train + val combined.
#
# mask1 / mask2 hold, for every row, a list of 60 booleans marking the first
# `crowd_size_long` respondents whose political-knowledge score is >= 4 (the
# select_crowd_pol default) or whose CRT score is >= 2.
mask1 = train_data_large[pol_cols].apply(select_crowd_pol, num_size = crowd_size_long, axis = 1)
mask2 = train_data_large[crt_cols].apply(select_crowd_pol, num_size = crowd_size_long, threshold = 2, axis = 1)

# Column families copied for each selected respondent, in output order.
crowd_families = (resp_cols, ver_cols, ideo_cols)

df_list_pol = []
df_list_crt = []
# Index labels of the rows that contributed a CRT crowd, so the labels can be
# matched to the right rows afterwards.
crt_rows_kept = []

for number, series in train_data_large.iterrows():

    mask_list1 = mask1[number]
    mask_list2 = mask2[number]

    df_row_pol = []
    for family in crowd_families:
        picked = list(series[family][mask_list1])
        # Pad short crowds with NaN so every family always fills exactly
        # crowd_size_long slots; without this, a row with fewer qualifying
        # respondents would shift veracity/ideology values under the wrong
        # column names.
        df_row_pol.extend(picked + [np.nan] * (crowd_size_long - len(picked)))

    df_list_pol.append(df_row_pol)

    # Rows with no qualifying CRT respondent are left out of the CRT frame.
    if any(mask_list2):

        df_row_crt = []
        for family in crowd_families:
            picked = list(series[family][mask_list2])
            df_row_crt.extend(picked + [np.nan] * (crowd_size_long - len(picked)))

        df_list_crt.append(df_row_crt)
        crt_rows_kept.append(number)

col_list = [name for family in crowd_families for name in family[:crowd_size_long]]

pol_knowledge_df = pd.DataFrame(df_list_pol, columns = col_list )
pol_knowledge_df['mode'] = train_data_large['mode']

crt_df = pd.DataFrame(df_list_crt, columns = col_list )
# Take the labels of the rows that were actually kept; slicing the first
# len(crt_df) labels would be wrong as soon as one row is skipped.
crt_df['mode'] = train_data_large.loc[crt_rows_kept, 'mode'].to_numpy()


eval_types = ['in_robust_mode', 'no_false','all_true_set', 'any_true_set', 'no_true', 'all_false_set','any_false_set']

for this_type in eval_types:

    pol_knowledge_df[this_type] = train_data_large[this_type]
    crt_df[this_type] = train_data_large.loc[crt_rows_kept, this_type].to_numpy()

In [4]:
# Save the high-political-knowledge crowds for train + val combined.
# The context manager closes the file handle even if pickling fails.
with open(data_pickles + "pol_knowledge_df.p", "wb") as pickle_file:
    pickle.dump(pol_knowledge_df, pickle_file)
# The CRT crowds are built but not saved (the data is not used downstream):
#pickle.dump(crt_df, open(data_pickles + "crt_df.p", "wb"))

### Test

In [5]:
# Build the qualified crowds for the test split.
#
# crowd_size_long is reused from the cell that defines the column lists
# rather than being reset here, so all splits share one crowd size.
# Each mask entry is a list of 60 booleans marking the first
# `crowd_size_long` respondents whose political-knowledge score is >= 4 (the
# select_crowd_pol default) or whose CRT score is >= 2.
mask1_test = test_data_large[pol_cols].apply(select_crowd_pol, num_size = crowd_size_long, axis = 1)
mask2_test = test_data_large[crt_cols].apply(select_crowd_pol, num_size = crowd_size_long, threshold = 2, axis = 1)

# Column families copied for each selected respondent, in output order.
crowd_families = (resp_cols, ver_cols, ideo_cols)

df_list_pol_test = []
df_list_crt_test = []
# Index labels of the rows that contributed a CRT crowd, so the labels can be
# matched to the right rows afterwards.
crt_rows_kept_test = []

for number, series in test_data_large.iterrows():

    mask_list1 = mask1_test[number]
    mask_list2 = mask2_test[number]

    df_row_pol = []
    for family in crowd_families:
        picked = list(series[family][mask_list1])
        # Pad short crowds with NaN so every family always fills exactly
        # crowd_size_long slots; without this, a row with fewer qualifying
        # respondents would shift veracity/ideology values under the wrong
        # column names.
        df_row_pol.extend(picked + [np.nan] * (crowd_size_long - len(picked)))

    df_list_pol_test.append(df_row_pol)

    # Rows with no qualifying CRT respondent are left out of the CRT frame.
    if any(mask_list2):

        df_row_crt = []
        for family in crowd_families:
            picked = list(series[family][mask_list2])
            df_row_crt.extend(picked + [np.nan] * (crowd_size_long - len(picked)))

        df_list_crt_test.append(df_row_crt)
        crt_rows_kept_test.append(number)

col_list = [name for family in crowd_families for name in family[:crowd_size_long]]

pol_knowledge_df_test = pd.DataFrame(df_list_pol_test, columns = col_list )
pol_knowledge_df_test['mode'] = test_data_large['mode']

crt_df_test = pd.DataFrame(df_list_crt_test, columns = col_list )
# Take the labels of the rows that were actually kept; slicing the first
# len(crt_df_test) labels would be wrong as soon as one row is skipped.
crt_df_test['mode'] = test_data_large.loc[crt_rows_kept_test, 'mode'].to_numpy()


eval_types = ['in_robust_mode', 'no_false','all_true_set', 'any_true_set', 'no_true', 'all_false_set','any_false_set']

for this_type in eval_types:

    pol_knowledge_df_test[this_type] = test_data_large[this_type]
    crt_df_test[this_type] = test_data_large.loc[crt_rows_kept_test, this_type].to_numpy()


In [6]:
# Save the high-political-knowledge crowds for the test split.
# The context manager closes the file handle even if pickling fails.
with open(data_pickles + "pol_knowledge_df_test.p", "wb") as pickle_file:
    pickle.dump(pol_knowledge_df_test, pickle_file)
# The CRT crowds are built but not saved (the data is not used downstream):
#pickle.dump(crt_df_test, open(data_pickles + "crt_df_test.p", "wb"))

### Val

In [7]:
# Build the qualified crowds for the validation split.
#
# crowd_size_long is reused from the cell that defines the column lists
# rather than being reset here, so all splits share one crowd size.
# The masks get validation-specific names instead of reusing the test-split
# names, so the test masks are not silently overwritten.
# NOTE(review): the CRT threshold here is 1, whereas the train + val and test
# sections use 2 -- confirm which value is intended (CRT crowds are not saved).
# NOTE(review): unlike the train + val and test sections, no evaluation-flag
# columns (in_robust_mode, no_false, ...) are copied here -- confirm this is
# intended.
mask1_val = val_data_large[pol_cols].apply(select_crowd_pol, num_size = crowd_size_long, axis = 1)
mask2_val = val_data_large[crt_cols].apply(select_crowd_pol, num_size = crowd_size_long, threshold = 1, axis = 1)

# Column families copied for each selected respondent, in output order.
crowd_families = (resp_cols, ver_cols, ideo_cols)

df_list_pol_val = []
df_list_crt_val = []

for number, series in val_data_large.iterrows():

    mask_list1 = mask1_val[number]
    mask_list2 = mask2_val[number]

    df_row_pol = []
    df_row_crt = []

    for family in crowd_families:
        picked_pol = list(series[family][mask_list1])
        picked_crt = list(series[family][mask_list2])
        # Pad short crowds with NaN so every family always fills exactly
        # crowd_size_long slots; without this, a row with fewer qualifying
        # respondents would shift veracity/ideology values under the wrong
        # column names.
        df_row_pol.extend(picked_pol + [np.nan] * (crowd_size_long - len(picked_pol)))
        df_row_crt.extend(picked_crt + [np.nan] * (crowd_size_long - len(picked_crt)))

    df_list_pol_val.append(df_row_pol)
    # Every row is kept here (no empty-crowd skip), so the CRT frame stays
    # row-aligned with val_data_large.
    df_list_crt_val.append(df_row_crt)

col_list = [name for family in crowd_families for name in family[:crowd_size_long]]

pol_knowledge_df_val = pd.DataFrame(df_list_pol_val, columns = col_list )
pol_knowledge_df_val['mode'] = val_data_large['mode']

crt_df_val = pd.DataFrame(df_list_crt_val, columns = col_list )
crt_df_val['mode'] = val_data_large['mode']



In [8]:
# Save the high-political-knowledge crowds for the validation split.
# The context manager closes the file handle even if pickling fails.
with open(data_pickles + "pol_knowledge_df_val.p", "wb") as pickle_file:
    pickle.dump(pol_knowledge_df_val, pickle_file)
# The CRT crowds are built but not saved (the data is not used downstream):
#pickle.dump(crt_df_val, open(data_pickles + "crt_df_val.p", "wb"))

### Pure Train

In [9]:
 
# Build the qualified crowds for the training rows only (no validation rows).
#
# NOTE(review): the CRT threshold here is 1, whereas the train + val and test
# sections use 2 -- confirm which value is intended (CRT crowds are not saved).
# NOTE(review): unlike the train + val and test sections, no evaluation-flag
# columns (in_robust_mode, no_false, ...) are copied here -- confirm this is
# intended.
mask1 = train_data_large_orig[pol_cols].apply(select_crowd_pol, num_size = crowd_size_long, axis = 1)
mask2 = train_data_large_orig[crt_cols].apply(select_crowd_pol, num_size = crowd_size_long, threshold = 1, axis = 1)

# Column families copied for each selected respondent, in output order.
crowd_families = (resp_cols, ver_cols, ideo_cols)

df_list_pol = []
df_list_crt = []

for number, series in train_data_large_orig.iterrows():

    mask_list1 = mask1[number]
    mask_list2 = mask2[number]

    df_row_pol = []
    df_row_crt = []

    for family in crowd_families:
        picked_pol = list(series[family][mask_list1])
        picked_crt = list(series[family][mask_list2])
        # Pad short crowds with NaN so every family always fills exactly
        # crowd_size_long slots; without this, a row with fewer qualifying
        # respondents would shift veracity/ideology values under the wrong
        # column names.
        df_row_pol.extend(picked_pol + [np.nan] * (crowd_size_long - len(picked_pol)))
        df_row_crt.extend(picked_crt + [np.nan] * (crowd_size_long - len(picked_crt)))

    df_list_pol.append(df_row_pol)
    # Every row is kept here (no empty-crowd skip), so the CRT frame stays
    # row-aligned with train_data_large_orig.
    df_list_crt.append(df_row_crt)

col_list = [name for family in crowd_families for name in family[:crowd_size_long]]

# Labels come from the frame the crowds were built from. The combined
# train_data_large only matches because its first rows are the training rows.
pol_knowledge_df_orig = pd.DataFrame(df_list_pol, columns = col_list )
pol_knowledge_df_orig['mode'] = train_data_large_orig['mode']

crt_df_orig = pd.DataFrame(df_list_crt, columns = col_list )
crt_df_orig['mode'] = train_data_large_orig['mode']



In [10]:
# Save the high-political-knowledge crowds for the training rows only.
# The context manager closes the file handle even if pickling fails.
with open(data_pickles + "pol_knowledge_df_train_orig.p", "wb") as pickle_file:
    pickle.dump(pol_knowledge_df_orig, pickle_file)
# The CRT crowds are built but not saved (the data is not used downstream).
# If this is re-enabled, the frame for this split is crt_df_orig:
#pickle.dump(crt_df_orig, open(data_pickles + "crt_df_train_orig.p", "wb"))