# Sampling statements on common sense platform

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy
import json
import polars as pl
import os
from tqdm import tqdm
import ast

import matplotlib

# Set default font to Arial
matplotlib.rcParams["font.family"] = "Arial"
matplotlib.rcParams["font.sans-serif"] = "Arial"

## Load statements, ratings and human data

In [2]:
# All answers
print("=" * 80)
print("LOADING ANSWERS\n")
base_path = "../answers"

# Get the name of all files in the directory
files = filter(lambda s: s.endswith(".csv"), sorted(os.listdir(base_path)))
files = list(map(lambda s: os.path.join(base_path, s), files))
print("Information about survey answers are in the following files:")
for file in files:
    print("  -", file)

df_answers = pd.concat([pd.read_csv(f) for f in files])
df_answers["createdAt"] = pd.to_datetime(df_answers["createdAt"])

print("\nSummary")
print(f"- Raw number of answers: {df_answers.shape[0]:,}")
print(f"- Number of unique session IDs: {df_answers['sessionId'].unique().shape[0]:,}")

LOADING ANSWERS

Information about survey answers are in the following files:
  - ../answers/answers_1.csv
  - ../answers/answers_2.csv

Summary
- Raw number of answers: 798,957
- Number of unique session IDs: 71,082


In [3]:
print("=" * 80)
print("LOADING STATEMENTS\n")

df_statements = pl.read_csv("../statements/statements_1.csv")

print("Number of statements:", df_statements.shape[0])
print("Examples:")
df_statements.head(5)

LOADING STATEMENTS

Number of statements: 10110
Examples:


id,statement,statementSource,origLanguage,published,statementMedian,createdAt,updatedAt,statementCategory,parentId,statement_zh,statement_ru,statement_pt,statement_ja,statement_hi,statement_fr,statement_es,statement_bn,statement_ar
i64,str,str,str,i64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str
1,"""1 plus 1 is 2""","""category response""","""En""",0,1.0,"""2023-04-08 12:21:33""","""2023-04-08 12:21:33""","""Mathematics and logic""",,"""1 加 1 等于 2。""","""1 плюс 1 равно 2.""","""1 mais 1 é igual a 2.""","""1 足す 1 は 2 です。""","""1 प्लस 1 2 के बराबर होता है।""","""1 plus 1 est égal à 2.""","""1 más 1 es igual a 2.""","""1 প্লাস 1 সমান 2।""","""1 زائد 1 يساوي 2."""
2,"""5 is alot bigger than 1""","""category response""","""En""",0,1.0,"""2023-04-08 12:21:33""","""2023-04-08 12:21:33""","""Mathematics and logic""",,"""5 明显大于 1。""","""5 значительно больше 1.""","""5 é significativamente maior d…","""5 は 1 よりもかなり大きいです。""","""5, 1 से काफी बड़ा है।""","""5 est nettement plus grand que…","""5 es significativamente mayor …","""5 1 এর চেয়ে উল্লেখযোগ্যভাবে ব…","""5 أكبر بكثير من 1."""
3,"""a balanced diet and regular ex…","""category response""","""En""",0,1.0,"""2023-04-08 12:21:33""","""2023-04-08 12:21:33""","""Health and fitness""",,"""为了保持健康，需要均衡饮食和定期运动。""","""Для поддержания здоровья необх…","""Para manter uma boa saúde, é p…","""健康を維持するには、バランスの取れた食事と定期的な運動が必要…","""अच्छे स्वास्थ्य को बनाए रखने क…","""Pour rester en bonne santé, il…","""Para mantener una buena salud,…","""ভাল স্বাস্থ্য বজায় রাখতে, একজ…","""للحفاظ على صحة جيدة، يحتاج الم…"
4,"""a ball is round""","""Concept Net""","""En""",0,1.0,"""2023-04-08 12:21:33""","""2023-04-08 12:21:33""","""Natural and physical sciences""",,"""球是圆的。""","""Мяч круглый.""","""Uma bola é redonda.""","""ボールは丸い。""","""एक गेंद गोल होती है।""","""Une balle est ronde.""","""La pelota es redonda.""","""একটি বল বৃত্তাকার।""","""الكرة مستديرة."""
5,"""a baton twirler doesn't want a…","""Concept Net""","""En""",0,1.0,"""2023-04-08 12:21:33""","""2023-04-08 12:21:33""","""Human activities""",,"""旋转警棍的人不想手指骨折。""","""Вертящий дубинку не захочет сл…","""Um girador de bastões não gost…","""バトントワラーなら指を骨折したくはないでしょう。""","""एक बैटन ट्विरलर एक टूटी हुई उं…","""Un joueur de matraque ne voudr…","""Un tirador de bastones no quer…","""একজন ব্যাটন টুয়ারলার ভাঙা আঙু…","""لن يرغب جهاز تدوير العصا في أن…"


In [4]:
# Individual data
print("=" * 80)
print("LOADING INDIVIDUALS' DATA\n")
base_path = "../individuals"

# Get the name of all files in the directory
files = filter(lambda s: s.endswith(".csv"), sorted(os.listdir(base_path)))
files = list(map(lambda s: os.path.join(base_path, s), files))
print("Information about survey respondents are in the following files:")
for file in files:
    print("  -", file)

df_individuals = pd.concat([pd.read_csv(f) for f in files])
df_individuals["createdAt"] = pd.to_datetime(df_individuals["createdAt"])

print("\nSummary")
print(f"- Raw number of rows: {df_individuals.shape[0]:,}")
print("\nNumber of unique session IDs")
print(
    f" - CRT: {df_individuals[df_individuals['informationType'] == 'CRT'].shape[0]:,}"
)
print(
    f" - RME: {df_individuals[df_individuals['informationType'] == 'rmeTen'].shape[0]:,}"
)
print(
    f" - Demo: {df_individuals[(df_individuals['informationType'] == 'demographics')
                               | (df_individuals['informationType'] == 'demographicsLongInternational')].shape[0]:,}"
)

LOADING INDIVIDUALS' DATA

Information about survey respondents are in the following files:
  - ../individuals/individuals_1.csv
  - ../individuals/individuals_2.csv

Summary
- Raw number of rows: 126,107

Number of unique session IDs
 - CRT: 43,549
 - RME: 41,658
 - Demo: 40,900


In [6]:
df_individuals_crt = df_individuals[df_individuals["informationType"] == "CRT"].copy(
    deep=True
)
df_individuals_rme = df_individuals[df_individuals["informationType"] == "rmeTen"].copy(
    deep=True
)
df_individuals_demo = df_individuals[
    (df_individuals["informationType"] == "demographics")
    | (df_individuals["informationType"] == "demographicsLongInternational")
].copy(deep=True)

In [5]:
triplets_plus_answers_path = "../demo_matches/triplet_results_plus_answer.csv"
matched_triplets = pd.read_csv(triplets_plus_answers_path)
matched_triplets = matched_triplets[["answers", "crt", "rme", "demo"]].set_index("answers")
matched_triplets.head(5)

Unnamed: 0_level_0,crt,rme,demo
answers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFKsbOAjFdyRPaipeIWVAPIq_PorPuRV,PFbkwQgyw7bJMoznb3B5mVwzgoQ3XlTa,HsNOfJ9KgLpMskaX0Qj38MndXRn8h7jP,lVy_LGMJny22Nheyn8Kjxea3ryzfjNOb
5pYTxD4k98-hz59OXFd3rKZViR4XFE2g,Q_gANCBosmXXUloBlgaXL-AsPnycLaz7,zG14BGZbKOw0i34YTgkagT-2PRrGgFXh,UrLPQomWxyLG-cW2pM3Oob6L2Qwtl1sz
6oXWHWLsTPuZGkHld6P5c38e2MRa7E_k,Xnuej7VtARxsjnROIMvpL1RijdftyVt0,4se5Vw_T7lnY7hNrDX0m8fhCT9-L9FOn,a-98rOxglX3JG6anZcIgKpnfcyCIxcR3
M4QNHDNQgCuwJykO9rkVwjv7g4WH0of9,HdZtfJKkbGcyVE0EWP_ZI_71I5L5UsuC,yiSGKEjfnEx54tfWB98e4mhIOtxX2e8R,zH8Bjr8lCUJNGY5pVFdc_E_QMlbQYh2g
SNrx_UHsLbFssRvQJlQz9F-hxFuAWuyB,wHbB6GwzoF4hnwla39SiPC_s8sPql_xe,TYX1jHdP-3PfW1BB6CjybhmaEko7WiKF,DgvR0M12UketmCL-D4T5yw4MBcr9XVQB


Get country for each answer session.

In [19]:
df_individuals_demo["country_reside"] = df_individuals_demo["experimentInfo"]\
    .map(lambda s: json.loads(s)["responses"]["country_reside"])

In [24]:
demo_to_country = df_individuals_demo[["userSessionId", "country_reside"]
                                      ].drop_duplicates().set_index("userSessionId").dropna()

In [25]:
demo_to_country

Unnamed: 0_level_0,country_reside
userSessionId,Unnamed: 1_level_1
FkNLAgvZVY9w_7ZdMSlLg7FGlbutttf2,United States
l4JyCjo2Mq0huM9-Z78JRvxO3bERBKFH,Bahamas
ZT_ckkMnCIWKaeFQtEjEdiIpFjx2xki3,United States
VPgzPGNjKA4_OQ5VJvvEKyeoAe2DpZM0,Bahamas
rv0R1OCQRzmAUCRLNO4_R8NaT1met4S1,United States
...,...
09dB2c_qQCEkBgvI-8oByiRVATCch2me,United States
Ajm3mtWnCgeiWTfPdm8nUMy85wkWpyIH,United Kingdom
ywa6saqJt3H4RAN-qWkFt5t3nRa59PMm,Australia
4rD_z06rLonM8ddg410gZZhwZGUppIjb,United States


Only keep ratings from users whose country of residence we know.

In [33]:
triplets_with_country = pd.merge(matched_triplets, demo_to_country, left_on="demo", right_index=True)

In [35]:
triplets_with_country

Unnamed: 0_level_0,crt,rme,demo,country_reside
answers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFKsbOAjFdyRPaipeIWVAPIq_PorPuRV,PFbkwQgyw7bJMoznb3B5mVwzgoQ3XlTa,HsNOfJ9KgLpMskaX0Qj38MndXRn8h7jP,lVy_LGMJny22Nheyn8Kjxea3ryzfjNOb,Canada
5pYTxD4k98-hz59OXFd3rKZViR4XFE2g,Q_gANCBosmXXUloBlgaXL-AsPnycLaz7,zG14BGZbKOw0i34YTgkagT-2PRrGgFXh,UrLPQomWxyLG-cW2pM3Oob6L2Qwtl1sz,United Kingdom
6oXWHWLsTPuZGkHld6P5c38e2MRa7E_k,Xnuej7VtARxsjnROIMvpL1RijdftyVt0,4se5Vw_T7lnY7hNrDX0m8fhCT9-L9FOn,a-98rOxglX3JG6anZcIgKpnfcyCIxcR3,United Kingdom
M4QNHDNQgCuwJykO9rkVwjv7g4WH0of9,HdZtfJKkbGcyVE0EWP_ZI_71I5L5UsuC,yiSGKEjfnEx54tfWB98e4mhIOtxX2e8R,zH8Bjr8lCUJNGY5pVFdc_E_QMlbQYh2g,Brazil
SNrx_UHsLbFssRvQJlQz9F-hxFuAWuyB,wHbB6GwzoF4hnwla39SiPC_s8sPql_xe,TYX1jHdP-3PfW1BB6CjybhmaEko7WiKF,DgvR0M12UketmCL-D4T5yw4MBcr9XVQB,United Kingdom
...,...,...,...,...
lqFOduZ3XXIjWn62ADVLLNrgjGuaw8sy,lqFOduZ3XXIjWn62ADVLLNrgjGuaw8sy,lqFOduZ3XXIjWn62ADVLLNrgjGuaw8sy,lqFOduZ3XXIjWn62ADVLLNrgjGuaw8sy,United States
k0Iws7FihONCsSg5ih87GpDtz7_a1thp,k0Iws7FihONCsSg5ih87GpDtz7_a1thp,k0Iws7FihONCsSg5ih87GpDtz7_a1thp,k0Iws7FihONCsSg5ih87GpDtz7_a1thp,Australia
nxa-71HdGqtWVVr5tgtx4zgOy00wf-lJ,nxa-71HdGqtWVVr5tgtx4zgOy00wf-lJ,nxa-71HdGqtWVVr5tgtx4zgOy00wf-lJ,nxa-71HdGqtWVVr5tgtx4zgOy00wf-lJ,United Kingdom
F3kO3xCeCFHaYHxQK7fdVlPwt_PzX4vm,F3kO3xCeCFHaYHxQK7fdVlPwt_PzX4vm,F3kO3xCeCFHaYHxQK7fdVlPwt_PzX4vm,F3kO3xCeCFHaYHxQK7fdVlPwt_PzX4vm,New Zealand


In [45]:
answers_with_country = pd.merge(
    df_answers,
    triplets_with_country,
    left_on="sessionId",
    right_index=True,
    how="inner"
)

In [47]:
answers_with_country

Unnamed: 0,id,I_agree,I_agree_reason,others_agree,others_agree_reason,perceived_commonsense,clarity,origLanguage,sessionId,createdAt,updatedAt,statement_number,statementId,clientVersion,crt,rme,demo,country_reside
3375,3376,1,It's obvious,1,I think most people have good judgement with r...,1,,en,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,2023-07-18 09:33:03,2023-07-18 09:33:03,1653,1653,,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,Belgium
3376,3377,1,It's obvious,1,I think most people have good judgement with r...,1,,en,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,2023-07-18 09:33:11,2023-07-18 09:33:11,806,806,,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,Belgium
3377,3378,1,It's obvious,1,I don't know,0,,en,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,2023-07-18 09:33:50,2023-07-18 09:33:50,4161,4161,,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,Belgium
3378,3379,1,It's obvious,1,I think most people have good judgement with r...,0,,en,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,2023-07-18 09:33:55,2023-07-18 09:33:55,831,831,,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,Belgium
3379,3380,1,It's obvious,1,I think most people have good judgement with r...,1,,en,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,2023-07-18 09:34:42,2023-07-18 09:34:42,3794,3794,,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,nT7bs-XHJvtATColxhCUmKidpQZk0Ur0,Belgium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307897,798721,1,It's my opinion,1,I think most people have good judgement with r...,0,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:37:13,2025-08-22 21:37:13,8995,8995,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States
307898,798722,1,It's something I learned,1,I think most people have good judgement with r...,0,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:37:28,2025-08-22 21:37:28,9401,9401,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States
307899,798723,1,It's my opinion,1,I think most people have good judgement with r...,1,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:37:54,2025-08-22 21:37:54,9658,9658,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States
307900,798724,1,It's something I learned,1,I think most people have good judgement with r...,0,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:38:09,2025-08-22 21:38:09,9775,9775,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States


In [48]:
# Only statements ID from 8,860
answers_with_country = answers_with_country[answers_with_country["statementId"] >= 8860]
answers_with_country

Unnamed: 0,id,I_agree,I_agree_reason,others_agree,others_agree_reason,perceived_commonsense,clarity,origLanguage,sessionId,createdAt,updatedAt,statement_number,statementId,clientVersion,crt,rme,demo,country_reside
35660,526484,1,It's something I learned,1,I think it's mostly a matter of opinion,0,removed,en,KA2Ad54iudlO6D-ogNyv3QcTViQu2Jnt,2024-10-01 22:18:41,2024-10-01 22:18:41,8876,8876,c172239,WHbUXp5N_HhRR3Ra4xMvUaiY9qHlQdUc,D46OjVl0YKAMb48SrEDzlnPqlWiR2RWu,czI3MG4Sa5QkONunMXOl_jCMjEe9CpBa,United Kingdom
35662,526486,1,It's obvious,1,I think it's mostly a matter of opinion,1,removed,en,KA2Ad54iudlO6D-ogNyv3QcTViQu2Jnt,2024-10-01 22:19:04,2024-10-01 22:19:04,8882,8882,c172239,WHbUXp5N_HhRR3Ra4xMvUaiY9qHlQdUc,D46OjVl0YKAMb48SrEDzlnPqlWiR2RWu,czI3MG4Sa5QkONunMXOl_jCMjEe9CpBa,United Kingdom
35663,526487,1,It's my opinion,1,I think it's mostly a matter of opinion,0,removed,en,dvqgem-G1utYWer-GZ4tiOJo1kPX2WyL,2024-10-01 22:19:09,2024-10-01 22:19:09,8882,8882,c172239,7Osig_10-UMUBeejV_FfatZbrwj6MHyz,M0k-uq7HU46uI1y63mOVbNvBIqXieyNu,gxcR2Ob6mDBRbIoIMEUcsoqP29YKdla0,United States
35665,526489,1,It's something I learned,0,I think it's mostly a matter of opinion,0,removed,en,KA2Ad54iudlO6D-ogNyv3QcTViQu2Jnt,2024-10-01 22:19:23,2024-10-01 22:19:23,8885,8885,c172239,WHbUXp5N_HhRR3Ra4xMvUaiY9qHlQdUc,D46OjVl0YKAMb48SrEDzlnPqlWiR2RWu,czI3MG4Sa5QkONunMXOl_jCMjEe9CpBa,United Kingdom
35670,526494,0,It's my opinion,0,I think it's mostly a matter of opinion,0,removed,en,KA2Ad54iudlO6D-ogNyv3QcTViQu2Jnt,2024-10-01 22:19:41,2024-10-01 22:19:41,8875,8875,c172239,WHbUXp5N_HhRR3Ra4xMvUaiY9qHlQdUc,D46OjVl0YKAMb48SrEDzlnPqlWiR2RWu,czI3MG4Sa5QkONunMXOl_jCMjEe9CpBa,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307897,798721,1,It's my opinion,1,I think most people have good judgement with r...,0,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:37:13,2025-08-22 21:37:13,8995,8995,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States
307898,798722,1,It's something I learned,1,I think most people have good judgement with r...,0,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:37:28,2025-08-22 21:37:28,9401,9401,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States
307899,798723,1,It's my opinion,1,I think most people have good judgement with r...,1,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:37:54,2025-08-22 21:37:54,9658,9658,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States
307900,798724,1,It's something I learned,1,I think most people have good judgement with r...,0,,en,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,2025-08-22 21:38:09,2025-08-22 21:38:09,9775,9775,e9fc4f3,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,1sp423JP0jb-W6UcqrRQvNnK4YdB3jVe,United States


In [51]:
I_agree = answers_with_country.pivot_table(
    index="statementId",
    columns="country_reside",
    values="I_agree",
    aggfunc="count",
    fill_value=0
).astype(int)

In [53]:
I_agree.to_csv("../sampling_by_country/country_statement_rating_frequency.csv")