## Import

In [1]:
import os
os.system('pip install tasknet pygrove xpflow setGPU')

0

In [2]:
import pandas as pd
import numpy as np
import tasknet as tn
from tqdm.notebook import tqdm
from dataclasses import dataclass
tqdm.pandas()
import random
from datasets import DatasetDict
import datasets
from torch.utils.data import Dataset
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoTokenizer
import inspect
import torch
import transformers
from types import MethodType
os.environ["WANDB_DISABLED"] = "true"
os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
import tensorflow as tf
from collections import Counter

import funcy as fc

In [3]:
tf.config.list_physical_devices('GPU')

[]

In [4]:
import importlib
importlib.reload(tn)

<module 'tasknet' from 'c:\\Users\\Quent\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\tasknet\\__init__.py'>

# Data Preprocessing

## Preprocessing Methods

In [5]:
@dataclass
class DataCollatorForPolls:
    tokenizer: None=None
    tokenizer_kwargs: None=None
    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        pad_args=inspect.signature(self.tokenizer.pad).parameters.keys()
        batch = self.tokenizer.pad(flattened_features, **fc.project(self.tokenizer_kwargs,pad_args))
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.stack(labels).float() #### Allows one value per choice
        return batch

In [6]:
def get_demographics(x):
    parenthesized=[s[s.find("(")+1:s.rfind(")")]  for s in x.options.keys()]
    parenthesized=[s for s in parenthesized if s]
    parenthesized_count=pd.Series.value_counts(parenthesized)
    return list(parenthesized_count[parenthesized_count>1].index)

def split_questions(x):
    demographics=get_demographics(x)

    l=[]
    for d in demographics:
        l+=[fc.project(x.options, [x for x in x.options if f'({d})' in x])] #if row has multiple demographics its repeated in dataset?
    if l:
        x.options=l
        x['split']=True
    else:
        x.options=[x.options]
        x['split']=False
    return x

def extract_demographic(x):
    demographics = get_demographics(x)
    x['demographic']=''
    if not demographics:
        return x
    if len(demographics)>1:
        print('failure:',demographics, x.options)
        return x
    demographic=demographics[0]
    x['options'] = {k.replace(f'({demographic})',''):v for (k,v) in x.options.items()}
    x['demographic']=demographic
    return x

# def remove_result(x):
#   newDict={}
#   for key,value in x.items():
#     if "result" not in key.lower():
#       newDict[key] = value
#   return newDict

# def normalize_votes(item):
#   labels = item["options"]
#   normalizationConstant = sum(labels.values())
#   if(normalizationConstant != 0):
#     newDict = {}
#     for (key,value) in item["options"].items():
#       newDict[key] = round(Decimal((value/normalizationConstant)),2)
#     item["options"] = newDict
#     return item
  
num_choices=6
choices_names=[f'choice_{i}' for i in range(num_choices)]
mode='proba'

def preprocess(x):
    choices = (list(x.options.keys())+['']*num_choices)[:num_choices]
    for i,n in enumerate(choices_names):
        x[n]=choices[i]
    scores = np.array(list(x.options.values())+[-1]*num_choices)[:num_choices]*1.0
    if mode=='binary':
        x['label'] = np.random.choice(np.flatnonzero(scores == scores.max()))
    else:
        scores[scores!=-1]= scores[scores!=-1]/scores[scores!=-1].sum()
        x['label'] = scores
    del x['options']
    x['task']=0
    return x

def apply_question_template(df):
  #we need to take into account that the input to the model is Demographic + Question + Answer
  # we thus need to create a question template that seems natural
  # This template is only necessary if the input is not already in a question format
  demographic = df["demographic"]
  category = df["demographic_category"]
  subject = df["input"]
  result = any(item in subject for item in ["?","How","Why","Where","When","Who","What"])
  sentence = f" {demographic}: {subject} ?".lower()
  if(not result):
    if(category == "age"):
        sentence = f" {demographic} years old: which of these options would you take in regard to the subject '{subject} ?'".lower()  
    else:
        sentence = f" {demographic}: which of these options would you take in regard to the subject '{subject} ?'".lower()  
  df['question'] = sentence
  return df

def apply_question_template_without_demo(df):
  #we need to take into account that the input to the model is Demographic + Question + Answer
  # we thus need to create a question template that seems natural
  # This template is only necessary if the input is not already in a question format
  subject = df["input"]
  result = any(item in subject for item in ["?","How","Why","Where","When","Who","What"])
  sentence = f" {subject} ?".lower()
  if(not result):
    sentence = f"which of these options would you take in regard to the subject '{subject} ?'".lower()  
  df['question'] = sentence
  return df

def filter_nb_answers(data):
  dataset = []
  keys = list(data[["input","demographic","year"]].value_counts().keys())
  for (input,demographic,year) in keys:
      results = data.groupby(["input","demographic","year"]).get_group((input,demographic,year))
      if( len(results["label"]) > 1 and len(results["label"]) <= 6):
        options = {}
        for i in range(len(results["choice"])):
          options[list(results["choice"])[i]] = list(results["label"])[i]

        dataset.append({"input":input,"demographic":demographic,"year":year,"options":options})
  return pd.DataFrame(dataset) 





def choices(x):
    choices = (x["options"]+['']*num_choices)[:num_choices]
    scores = np.array(list(x.labels)+[-1]*num_choices)[:num_choices]*1.0
    for i,n in enumerate(choices_names):
        x[n]=choices[i]  
    x["labels"] = scores
    return x

def filter_data(x):
    options = x["options"]
    if(1 < len(options) <7):
        return x

def get_split(x):
    rnd=random.Random(x).random()
    if rnd<0.95:
        return 'train'
    if rnd<0.975:
        return 'validation'
    else:
        return 'test'

## Reddit

In [62]:
url = 'https://sileod.s3.us-west-004.backblazeb2.com/r_poll_all.json'
df_reddit= pd.read_json('https://sileod.s3.us-west-004.backblazeb2.com/r_poll_all.json')

In [63]:
df_reddit['inputs']=df_reddit.title
len(df_reddit)

312306

In [64]:
df_reddit['options']=df_reddit.options.map(lambda x:{k:v for k,v in x.items() if k!='Results'})
df_reddit['options'].sample(frac=1.0)
df_reddit=df_reddit[df_reddit['options'].map(len)>1]
len(df_reddit)
df_reddit['n_answers']=df_reddit.options.map(dict.values).map(sum)
df_reddit=df_reddit[df_reddit.n_answers>20]
len(df_reddit)

307568

In [65]:
if 'dem' not in url:
    df_reddit=df_reddit.progress_apply(split_questions,axis=1).explode('options')
    df_reddit=df_reddit.progress_apply(extract_demographic,axis=1)
    #df_reddit['inputs']=df_reddit.demographic+': '+df_reddit.title
df_reddit_demographics=df_reddit[df_reddit.demographic.map(len).map(bool)]
df_reddit_demographics.head(3), len(df_reddit_demographics)

  0%|          | 0/307568 [00:00<?, ?it/s]

  0%|          | 0/317731 [00:00<?, ?it/s]

failure: ['165 cm', "165 cm) - 6' (183 cm"] {'< 5\' 5" (165 cm), Male': 14, '< 5\' 5" (165 cm), Female': 34, '5\' 5" (165 cm) - 6\' (183 cm), Male': 106, '5\' 5" (165 cm) - 6\' (183 cm), Female': 36}
failure: ['183 cm', "165 cm) - 6' (183 cm"] {'5\' 5" (165 cm) - 6\' (183 cm), Male': 106, '5\' 5" (165 cm) - 6\' (183 cm), Female': 36, "> 6' (183 cm), Male": 65, "> 6' (183 cm), Female": 4}


(        id                                              title  \
 1   yvae7l  Do You Believe Your Congressional District is ...   
 1   yvae7l  Do You Believe Your Congressional District is ...   
 65  yuipdn                            Does body count matter?   
 
                     options     created  \
 1   {'Yes ': 35, 'No ': 17}  1668455219   
 1     {'Yes ': 7, 'No ': 8}  1668455219   
 65  {'Yes ': 13, 'No ': 33}  1668382874   
 
                                                inputs  n_answers  split  \
 1   Do You Believe Your Congressional District is ...        134   True   
 1   Do You Believe Your Congressional District is ...        134   True   
 65                            Does body count matter?        168   True   
 
    demographic  
 1            D  
 1            R  
 65      Female  ,
 20069)

In [66]:
counter = [x for x in Counter(df_reddit_demographics["demographic"]).keys() if Counter(df_reddit_demographics["demographic"])[x] >= 10]
df_reddit_demographics = df_reddit_demographics[df_reddit_demographics["demographic"].isin(counter)]

In [67]:
df_reddit_demographics_normalized = df_reddit_demographics.progress_apply(preprocess,axis=1)

  0%|          | 0/11561 [00:00<?, ?it/s]

  scores[scores!=-1]= scores[scores!=-1]/scores[scores!=-1].sum()


In [68]:
if mode=='binary':
    df_reddit_demographics_normalized=df_reddit_demographics_normalized[df_reddit_demographics_normalized.label<num_choices]
else:
    df_reddit_demographics_normalized=df_reddit_demographics_normalized[df_reddit_demographics_normalized.label.map(len)==num_choices]
len(df_reddit_demographics_normalized)

11561

In [69]:
df_reddit_demographics_normalized.dropna

<bound method DataFrame.dropna of             id                                              title     created  \
1       yvae7l  Do You Believe Your Congressional District is ...  1668455219   
65      yuipdn                            Does body count matter?  1668382874   
65      yuipdn                            Does body count matter?  1668382874   
66      yui5re                      are your nails long or short?  1668381574   
66      yui5re                      are your nails long or short?  1668381574   
...        ...                                                ...         ...   
307851  gq6x1r  Should women go to jail for false rape accusat...  1590393452   
307851  gq6x1r  Should women go to jail for false rape accusat...  1590393452   
308670  gkzer2                               Lying about your age  1589653023   
309958  gdk0w1                          Your preference in comics  1588625918   
311887  fsruvs                                 political ideology  15857121

In [70]:
df_reddit_demographics_normalized[df_reddit_demographics_normalized.label.progress_map(lambda x: any(np.isnan(a) for a in x))]
df_reddit_demographics_normalized['split']=df_reddit_demographics_normalized.title.map(get_split)

  0%|          | 0/11561 [00:00<?, ?it/s]

### Normalize demographic

#### demographic libraries

In [71]:
stop_words = ["able","about","above","abroad","according","accordingly","across","actually","adj","after","afterwards","again","against","ago","ahead","ain't","all","allow","allows","almost","alone","along","alongside","already","also","although","always","am","amid","amidst","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","a's","aside","ask","asking","associated","at","available","away","awfully","back","backward","backwards","be","became","because","become","becomes","becoming","been","before","beforehand","begin","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","came","can","cannot","cant","can't","caption","cause","causes","certain","certainly","changes","clearly","c'mon","co","co.","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","c's","currently","dare","daren't","definitely","described","despite","did","didn't","different","directly","do","does","doesn't","doing","done","don't","down","downwards","during","each","edu","eg","eight","eighty","either","else","elsewhere","end","ending","enough","entirely","especially","et","etc","even","ever","evermore","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","fairly","far","farther","few","fewer","fifth","first","five","followed","following","follows","for","forever","former","formerly","forth","forward","found","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","had","hadn't","half","happens","hardly","has","hasn't","have","haven't","having","he","he'd","he'll","hello","help","hence","her","here","hereafter","hereby","herein","here's","hereupon","hers","herself","he's","hi","him","himself","his","hither","hopefully","how","howbeit","however","hundred","i'd","ie","if","ignored","i'll","i'm","immediate","in","inasmuch","inc","inc.","indeed","indicate","indicated","indicates","inner","inside","insofar","instead","into","inward","is","isn't","it","it'd","it'll","its","it's","itself","i've","just","k","keep","keeps","kept","know","known","knows","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","likewise","little","look","looking","looks","low","lower","ltd","made","mainly","make","makes","many","may","maybe","mayn't","me","mean","meantime","meanwhile","merely","might","mightn't","mine","minus","miss","more","moreover","most","mostly","mr","mrs","much","must","mustn't","my","myself","name","namely","nd","near","nearly","necessary","need","needn't","needs","neither","never","neverf","neverless","nevertheless","new","next","nine","ninety","no","nobody","non","none","nonetheless","noone","no-one","nor","normally","not","nothing","notwithstanding","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","one's","only","onto","opposite","or","other","others","otherwise","ought","oughtn't","our","ours","ourselves","out","outside","over","overall","own","particular","particularly","past","per","perhaps","placed","please","plus","possible","presumably","probably","provided","provides","que","quite","qv","rather","rd","re","really","reasonably","recent","recently","regarding","regardless","regards","relatively","respectively","right","round","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","shan't","she","she'd","she'll","she's","should","shouldn't","since","six","so","some","somebody","someday","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","take","taken","taking","tell","tends","th","than","thank","thanks","thanx","that","that'll","thats","that's","that've","the","their","theirs","them","themselves","then","thence","there","thereafter","thereby","there'd","therefore","therein","there'll","there're","theres","there's","thereupon","there've","these","they","they'd","they'll","they're","they've","thing","things","think","third","thirty","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","till","to","together","too","took","toward","towards","tried","tries","truly","try","trying","t's","twice","two","un","under","underneath","undoing","unfortunately","unless","unlike","unlikely","until","unto","up","upon","upwards","us","use","used","useful","uses","using","usually","v","value","various","versus","very","via","viz","vs","want","wants","was","wasn't","way","we","we'd","welcome","well","we'll","went","were","we're","weren't","we've","what","whatever","what'll","what's","what've","when","whence","whenever","where","whereafter","whereas","whereby","wherein","where's","whereupon","wherever","whether","which","whichever","while","whilst","whither","who","who'd","whoever","whole","who'll","whom","whomever","who's","whose","why","will","willing","wish","with","within","without","wonder","won't","would","wouldn't","yes","yet","you","you'd","you'll","your","you're","yours","yourself","yourselves","you've","zero","a","how's","i","when's","why's","b","c","d","e","f","g","h","j","l","m","n","o","p","q","r","s","t","u","uucp","w","x","y","z","I","www","amount","bill","bottom","call","computer","con","couldnt","cry","de","describe","detail","due","eleven","empty","fifteen","fifty","fill","find","fire","forty","front","full","give","hasnt","herse","himse","interest","itse”","mill","move","myse”","part","put","show","side","sincere","sixty","system","ten","thick","thin","top","twelve","twenty","abst","accordance","act","added","adopted","affected","affecting","affects","ah","announce","anymore","apparently","approximately","aren","arent","arise","auth","beginning","beginnings","begins","biol","briefly","ca","date","ed","effect","et-al","ff","fix","gave","giving","heres","hes","hid","home","id","im","immediately","importance","important","index","information","invention","itd","keys","kg","km","largely","lets","line","'ll","means","mg","million","ml","mug","na","nay","necessarily","nos","noted","obtain","obtained","omitted","ord","owing","page","pages","poorly","possibly","potentially","pp","predominantly","present","previously","primarily","promptly","proud","quickly","ran","readily","ref","refs","related","research","resulted","resulting","results","run","sec","section","shed","shes","showed","shown","showns","shows","significant","significantly","similar","similarly","slightly","somethan","specifically","state","states","stop","strongly","substantially","successfully","sufficiently","suggest","thered","thereof","therere","thereto","theyd","theyre","thou","thoughh","thousand","throug","til","tip","ts","ups","usefully","usefulness","'ve","vol","vols","wed","whats","wheres","whim","whod","whos","widely","words","world","youd","youre"]

In [72]:
import requests
import re

r = requests.get("https://raw.githubusercontent.com/dariusk/corpora/master/data/religion/religions.json")
religions = r.json()
beliefs = []
for major in [item for item in religions.values()]:
    majors = [m for m in major]
    beliefs.extend(majors)
    try:
        for minor in major.values():
            for religion in minor:
                beliefs.append(religion)
    except:
        next

In [73]:
r = requests.get("https://raw.githubusercontent.com/Imagin-io/country-nationality-list/master/countries.json")
dictionary = r.json()
df_dictionary = pd.DataFrame(dictionary)
def nation(item):
    item["en_short_name"] = ""+item["en_short_name"].lower()+""
    item["alpha_2_code"] = ""+item["alpha_2_code"].lower()+""
    item["alpha_3_code"] = ""+item["alpha_3_code"].lower()+""
    nationality = re.split(', | or',item["nationality"].lower())
    item["nationality"] = nationality
    return item
df_dictionary=df_dictionary.apply(nation,axis=1)

In [74]:
nationalities = []
for item in df_dictionary["nationality"].values:
    for label in item:
        nationalities.append(label)

countries = []
countries.extend(df_dictionary["en_short_name"].values)
countries.extend(df_dictionary["alpha_2_code"].values)
countries.extend(df_dictionary["alpha_3_code"].values)

#### labels

In [75]:
def normalize_nationality(x):
    male = ["man","guy"]
    label = x["demographic"].lower()
    if (2 <= len(label) <= 3) and label not in male:
      if any(token.replace(" ","").lower() in label.split(" ") for token in nationalities):
        token = [token for token  in nationalities if token.replace(" ","").lower() in label.split(" ")]
        if("not" in label) | ("non" in label):
              label = "not "+token[0]
        else:
              label = token[0]
    elif any(token.lower() in label.split(" ") for token in nationalities):
        token = [token for token  in nationalities if token.lower() in label.split(" ")]
        if("not" in label) | ("non" in label):
            label = "not "+token[0]
        else:
            label = token[0]
    x["demographic"] = label
    return x

def normalize_countries(x):
    male = ["man","guy","men"]
    label = x["demographic"].lower()
    if (2 <= len(label) <= 3) and label not in male:
      if any(token.replace(" ","").lower() in label.split(" ") for token in countries):
        token = [token for token  in countries if token.replace(" ","").lower() in label.split(" ")]
        if("not" in label) | ("non" in label):
              label = "not "+token[0]
        else:
              label = token[0]
    elif any(token.lower() in label.split(" ") for token in countries):
        token = [token for token  in countries if token.lower() in label.split(" ")]
        if("not" in label) | ("non" in label):
            label = "not "+token[0]
        else:
            label = token[0]
    x["demographic"] = label
    return x

def normalize_genders(x):
    label = x["demographic"].lower()
    female= ["f","female","woman","chick","girl"]
    male = ["m","male","man","guy","men"]
    if any(token.lower() in female for token in label.split()):
        if ("not" in label) | ("non" in label):
            label = "not female"
        else:
            label = "female"
    if any(token in male for token in label.split()):
        if("not" in label) | ("non" in label):
            label = "not male"
        else:
            label = "male"
    x["demographic"] = label
    return x

def normalize_politics(x):
    label = x["demographic"].lower()
    right = ["d","democratic","right","r",'right-leaning']
    left = ["republican","republicans","left","left-leaning"]
    if any(token.lower() in label.split(" ") for token in right):
        if("not" in label) | ("non" in label):
            label = "not democratic"
        else: 
            label = "democratic"
    elif any(token.lower() in label.split(" ") for token in left):
        if("not" in label) | ("non" in label):
            label = "not republican"
        else: 
            label = "republican"
    x["demographic"] = label
    return x

def remove_nan(x):
  labels = x["label"]
  if any(np.isnan(labels)):
      pass
  else:
      return x


In [76]:
df_reddit_demographics_normalized_labels=df_reddit_demographics_normalized.apply(normalize_nationality,axis=1)
df_reddit_demographics_normalized_labels=df_reddit_demographics_normalized_labels.apply(normalize_genders,axis=1)
df_reddit_demographics_normalized_labels=df_reddit_demographics_normalized_labels.apply(normalize_countries,axis=1)
df_reddit_demographics_normalized_labels=df_reddit_demographics_normalized_labels.apply(normalize_politics,axis=1)
df_reddit_demographics_normalized_labels = df_reddit_demographics_normalized_labels.progress_apply(remove_nan,axis=1)
df_reddit_demographics_normalized_labels.dropna()

  0%|          | 0/11561 [00:00<?, ?it/s]

Unnamed: 0,id,title,created,inputs,n_answers,split,demographic,choice_0,choice_1,choice_2,choice_3,choice_4,choice_5,label,task
1,yvae7l,Do You Believe Your Congressional District is ...,1.668455e+09,Do You Believe Your Congressional District is ...,134.0,train,democratic,Yes,No,,,,,"[0.4666666666666667, 0.5333333333333333, -1.0,...",0.0
65,yuipdn,Does body count matter?,1.668383e+09,Does body count matter?,168.0,train,female,Yes,No,,,,,"[0.2826086956521739, 0.717391304347826, -1.0, ...",0.0
65,yuipdn,Does body count matter?,1.668383e+09,Does body count matter?,168.0,train,male,Yes,No,,,,,"[0.5737704918032787, 0.4262295081967213, -1.0,...",0.0
66,yui5re,are your nails long or short?,1.668382e+09,are your nails long or short?,631.0,test,female,Long,Short,,,,,"[0.23026315789473684, 0.7697368421052632, -1.0...",0.0
66,yui5re,are your nails long or short?,1.668382e+09,are your nails long or short?,631.0,test,male,Long,Short,,,,,"[0.06680584551148225, 0.9331941544885177, -1.0...",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307851,gq6x1r,Should women go to jail for false rape accusat...,1.590393e+09,Should women go to jail for false rape accusat...,408.0,train,other,Yes,No,,,,,"[0.8, 0.2, -1.0, -1.0, -1.0, -1.0]",0.0
307851,gq6x1r,Should women go to jail for false rape accusat...,1.590393e+09,Should women go to jail for false rape accusat...,408.0,train,male,Yes,No,,,,,"[0.9072847682119205, 0.09271523178807947, -1.0...",0.0
308670,gkzer2,Lying about your age,1.589653e+09,Lying about your age,165.0,train,male,Yes,No,,,,,"[0.42276422764227645, 0.5772357723577236, -1.0...",0.0
309958,gdk0w1,Your preference in comics,1.588626e+09,Your preference in comics,97.0,train,comment,Multiple,Other,,,,,"[0.44, 0.56, -1.0, -1.0, -1.0, -1.0]",0.0


### Reco

In [77]:
df_reddit_demographics_normalized_labels_demo = df_reddit_demographics_normalized_labels.copy()
df_reddit_demographics_normalized_labels_no_demo = df_reddit_demographics_normalized_labels.copy()

In [78]:
try:
    del df_reddit_demographics_normalized_labels['options']
except:
    pass
    
df_reddit_demographics_normalized_labels_demo['labels']=df_reddit_demographics_normalized_labels_demo['label']
del df_reddit_demographics_normalized_labels_demo['label']

df_reddit_demographics_normalized_labels_no_demo['labels']=df_reddit_demographics_normalized_labels_no_demo['label']
del df_reddit_demographics_normalized_labels_no_demo['label']


df_reddit_demographics_normalized_labels_demo['inputs']=df_reddit_demographics_normalized_labels.demographic+': '+df_reddit_demographics_normalized_labels.title
dataset_reddit_demographics_plbrt=DatasetDict({k:datasets.Dataset.from_pandas(df_reddit_demographics_normalized_labels_demo[df_reddit_demographics_normalized_labels_demo['split']==k]) for k in {'train','test','validation'}})


df_reddit_demographics_normalized_labels_no_demo['inputs']=df_reddit_demographics_normalized_labels.title
dataset_reddit__plbrt=DatasetDict({k:datasets.Dataset.from_pandas(df_reddit_demographics_normalized_labels_no_demo[df_reddit_demographics_normalized_labels_no_demo['split']==k]) for k in {'train','test','validation'}})

In [79]:
reco = tn.MultipleChoice(
    dataset_reddit__plbrt,
    s1="inputs",
    y="labels",
    num_labels = num_choices,
    choices=choices_names,
    data_collator=DataCollatorForPolls()
)

tasks_reddit__plbrt = [reco]

In [80]:
tasks_reddit__plbrt[0].dataset["validation"]["inputs"]

['When’s the last time you saw a medical doctor?',
 'Have you finished a book this year?',
 'Have you finished a book this year?',
 'Should the requirement to drink and gamble be based on responsibility and knowledge instead of age?',
 'Should the requirement to drink and gamble be based on responsibility and knowledge instead of age?',
 "Do you wan't a general war?",
 "Do you wan't a general war?",
 'Would an affair who another person cheats with be more likely to also cheat in their own relationship, if they had one in the future?',
 'Would an affair who another person cheats with be more likely to also cheat in their own relationship, if they had one in the future?',
 'If being Naked was more common in public spaces, would humans in general, work out more?',
 'If being Naked was more common in public spaces, would humans in general, work out more?',
 'Who should pay for dinner? (Pick the best out of these for yoir gender, I cannot fit in a results/other option.)',
 'Who should pay f

In [81]:
reco = tn.MultipleChoice(
    dataset_reddit_demographics_plbrt,
    s1="inputs",
    y="labels",
    num_labels = num_choices,
    choices=choices_names,
    data_collator=DataCollatorForPolls()
)

tasks_reddit_demographics_plbrt = [reco]

In [82]:
tasks_reddit_demographics_plbrt[0].dataset["validation"]["inputs"]

['usa: When’s the last time you saw a medical doctor?',
 'male: Have you finished a book this year?',
 'female: Have you finished a book this year?',
 'non-american: Should the requirement to drink and gamble be based on responsibility and knowledge instead of age?',
 'american: Should the requirement to drink and gamble be based on responsibility and knowledge instead of age?',
 "america: Do you wan't a general war?",
 "europe: Do you wan't a general war?",
 'male: Would an affair who another person cheats with be more likely to also cheat in their own relationship, if they had one in the future?',
 'female: Would an affair who another person cheats with be more likely to also cheat in their own relationship, if they had one in the future?',
 'female: If being Naked was more common in public spaces, would humans in general, work out more?',
 'male: If being Naked was more common in public spaces, would humans in general, work out more?',
 'male: Who should pay for dinner? (Pick the be

## WVS

In [83]:
df_wvs = pd.read_json("https://raw.githubusercontent.com/QuentinTilman/PollBERT-paper/master/WVMCQ7/WVMCQ7-Normalized_labels_percentage_wise.json")

In [84]:
df_wvs = df_wvs.progress_apply(filter_data,axis=1)
df_wvs = df_wvs.dropna()
pd.DataFrame(df_wvs)
df_wvs= pd.DataFrame([item for item in df_wvs.values if item[1].lower() not in ["-5","-4","-3","-2","-1","no answer","don't know"] ],columns= ["year","demographic","input","demographic_category","options","labels"])

  0%|          | 0/209542 [00:00<?, ?it/s]

In [85]:
df_wvs['split']=df_wvs.input.progress_map(get_split)

  0%|          | 0/179181 [00:00<?, ?it/s]

In [86]:
df_wvs = pd.DataFrame(df_wvs)
df_wvs

Unnamed: 0,year,demographic,input,demographic_category,options,labels,split
0,2017.0,18,Aims of country: first choice,age,"[A high level of economic growth, Making sure ...","[0.4291417166, 0.1397205589, 0.3233532934, 0.1...",train
1,2017.0,18,Aims of country: second choice,age,"[A high level of economic growth, Making sure ...","[0.2611336032, 0.2064777328, 0.3198380567, 0.2...",train
2,2017.0,18,Aims of respondent: first choice,age,"[maintaining order in the nation, Giving peopl...","[0.3605577689, 0.24103585660000001, 0.14541832...",train
3,2017.0,18,Aims of respondent: second choice,age,"[maintaining order in the nation, Giving peopl...","[0.2851405622, 0.2489959839, 0.218875502000000...",train
4,2017.0,18,Believe in: God,age,"[No, Yes]","[0.3611691023, 0.6388308977]",train
...,...,...,...,...,...,...,...
179176,2022.0,male,Vote in elections: local level,gender,"[Always, Usually, Never, Not allowed to vote]","[0.5963541667000001, 0.24804687500000003, 0.14...",test
179177,2022.0,male,Which party would you vote for: first choice (...,gender,"[No right to vote, Other, Independent Candidate]","[0.36250000000000004, 0.125, 0.5125000000000001]",train
179178,2022.0,male,Willingness to fight for country,gender,"[no, yes]","[0.22697899840000002, 0.7730210016]",train
179179,2022.0,male,Work is a duty towards society,gender,"[Strongly agree, Agree, Neither agree nor disa...","[0.3264913406, 0.3688261706, 0.1610006414, 0.1...",train


### Add choice columns to dataset

In [87]:
df_wvs=df_wvs.progress_apply(choices,axis=1)

  0%|          | 0/179181 [00:00<?, ?it/s]

In [88]:
if mode=='binary':
    df_wvs=df_wvs[df_wvs.labels<num_choices]
else:
    df_wvs=df_wvs[df_wvs.labels.map(len)==num_choices]
len(df_wvs)

179181

### Apply template for uniform questions


In [89]:
df_demo = df_wvs.progress_apply(apply_question_template,axis=1)
df_no_demo =  df_wvs.progress_apply(apply_question_template_without_demo,axis=1)

  0%|          | 0/179181 [00:00<?, ?it/s]

  0%|          | 0/179181 [00:00<?, ?it/s]

### Training, Validation and Test Sets

In [90]:
df_demo[df_demo.labels.progress_map(lambda x: any(np.isnan(a) for a in x))]
df_no_demo[df_no_demo.labels.progress_map(lambda x: any(np.isnan(a) for a in x))]

  0%|          | 0/179181 [00:00<?, ?it/s]

  0%|          | 0/179181 [00:00<?, ?it/s]

Unnamed: 0,year,demographic,input,demographic_category,options,labels,split,choice_0,choice_1,choice_2,choice_3,choice_4,choice_5,question


In [91]:
def get_split(x):
    rnd=random.Random(x).random()
    if rnd<0.95:
        return 'train'
    if rnd<0.975:
        return 'validation'
    else:
        return 'test'
    
df_demo['split']=df_demo.input.map(get_split)
try:
    del df_demo['options']
except:
    pass
dataset=DatasetDict({k:datasets.Dataset.from_pandas(df_demo[df_demo['split']==k]) for k in {'train','test','validation'}})


df_no_demo['split']=df_demo['split']
try:
    del df_no_demo['options']
except:
    pass
dataset=DatasetDict({k:datasets.Dataset.from_pandas(df_no_demo[df_no_demo['split']==k]) for k in {'train','test','validation'}})

In [92]:
reco = tn.MultipleChoice(
    dataset,
    s1="question",
    y="labels",
    num_labels = num_choices,
    choices=choices_names,
    data_collator=DataCollatorForPolls()
)

tasks_wvs_demo = [reco]

In [93]:
reco = tn.MultipleChoice(
    dataset,
    s1="question",
    y="labels",
    num_labels = num_choices,
    choices=choices_names,
    data_collator=DataCollatorForPolls()
)

tasks_wvs_no_demo = [reco]

# Load Model

In [7]:
from transformers.models.roberta.modeling_roberta import *
  
        
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
        num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
        `input_ids` above)
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

    flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
    flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
    flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
    flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
    flat_inputs_embeds = (
        inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
        if inputs_embeds is not None
        else None
    )

    outputs = self.roberta(
        flat_input_ids,
        position_ids=flat_position_ids,
        token_type_ids=flat_token_type_ids,
        attention_mask=flat_attention_mask,
        head_mask=head_mask,
        inputs_embeds=flat_inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)

    logits = self.classifier(pooled_output)

    reshaped_logits = logits.view(-1, num_choices)


    m=nn.Softmax(dim=1)
    reshaped_logits=m(reshaped_logits)
    loss = None
    if labels is not None:
        loss_fct = self.loss
        loss = loss_fct(reshaped_logits, labels)
    if not return_dict:
        output = (reshaped_logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output
    
    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
from types import MethodType 

In [8]:
from statistics import mean
def compute_metrics_old(element):
    f_distance = lambda x,y: np.abs((x-y)).max(axis=0).mean()
    y_pred,y_true = element
    distance = f_distance(y_pred,y_true)
    return distance
    
def crop(element):
  y_pred,y_true = element
  mask = y_true == -1
  return (y_pred[~mask],y_true[~mask])


def compute_metrics(self,batch):
  meta = {"name": self.name, "size": len(batch.predictions), "index": self.index}
  distance = mean([compute_metrics_old(crop(element)) for element in zip(batch.predictions,batch.label_ids)])
  return {**meta, 'distance':distance}


def mse_loss(input, target, ignored_index=-1):
    mask = target == ignored_index
    out = (input[~mask]-target[~mask])**2
    return out.mean()**.5

def bce_loss(input, target, ignored_index=-1):
  loss = nn.BCELoss()
  mask = target == ignored_index
  out = loss(input[~mask],target[~mask])
  return out

# Analysis

## Global Demographic Effect

In this part we study the effect of the demographic attribute by checking whether we can sse variations on the predictions of MCQ without Attribute and with Attribute

## Local Demographic Effect

In this part we study the effect of the demographic attribute by checking whether there is a noticeable difference in the prediction between different demographic attributes.

### Reddit

In [98]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("roberta-large", use_fast=True)

model = AutoModelForMultipleChoice.from_pretrained("Qubix/RoBERTa-PollBERT350K_full_model")

In [245]:
def predict(x):
    input = f'{x["demographic"]} : {x["title"]}'
    labels = [l for l in x["label"] if l != -1]
    choices = []
    question = []
    for i in range(len(labels)):
        index = f'choice_'+str(i)
        choices.append(x[index])
        question.append(input)
    encoding = tokenizer(question, choices, return_tensors="pt", padding=True)
    outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=torch.tensor(labels).unsqueeze(0))
    return x



In [121]:
question = "democrat: Do You Believe Your Congressional District is gerrymandered?"
choice1 = "yes"
choice2 = "no"
choice3 = ""
encoding = tokenizer([question,question], [choice1,choice2], return_tensors="pt", padding=True)
outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()})
outputs.logits.softmax(dim=-1).tolist()

[[0.36183103919029236, 0.6381690502166748]]

In [246]:
df_test_predict = df_reddit_demographics_normalized_labels[df_reddit_demographics_normalized_labels["split"] == "train"].progress_apply(predict,axis=1)

  0%|          | 0/11037 [00:00<?, ?it/s]

In [249]:
df_test_predict["bce_loss"].sum()/len(df_test_predict)

tensor(0.6813)

### WVMCQ7

### Reddit

### WVMCQ7