# Compare Arab Barometer Metadata Versions

In [1]:
import json
from pathlib import Path
import pandas as pd
from collections import defaultdict

In [2]:
# Load both metadata files
with open('metadata_arabbarometer_v1_AH.json', 'r') as f:
    metadata_ah = json.load(f)

with open('metadata_arabbarometer_v1_OC.json', 'r') as f:
    metadata_oc = json.load(f)

## 1. Top-level Structure Comparison

In [3]:
print("AH Top-level keys:")
print(list(metadata_ah.keys()))
print("\nOC Top-level keys:")
print(list(metadata_oc.keys()))

AH Top-level keys:
['SAMPLING VARIABLES', 'SECTION I: CORE DEMOGRAPHICS', 'SECTION II: STATE OF THE ECONOMY', 'SECTION III: TRUST & GOVERNMENT PERFORMANCE', 'SECTION IV:  ENGAGEMENT & GOVERNANCE PREFERENCES', 'SECTION V: MIGRATION & IMMIGRATION', 'SECTION VI: IDENTITY & RELIGIOUS PRACTICE', 'SECTION VII: CLIMATE CHANGE & THE ENVIRONMENT', 'SECTION VIII: GENDER NORMS & ATTITUDES', 'SECTION IX:  MEDIA', 'SECTION X: INTERNATIONAL RELATIONS', 'SECTION XI: COUNTRY -SPECIFIC QUESTIONS', 'SECTION XII: DEMOGRAPHICS']

OC Top-level keys:
['demographics', 'economic_views', 'trust_institutions', 'social_values', 'political_attitudes', 'international_relations', 'religion', 'climate_change', 'gender_norms', 'media']


## 2. Extract All Variable Names

In [4]:
def get_all_variables(metadata_dict):
    """Extract all variable names from metadata structure"""
    variables = {}
    for category, vars_dict in metadata_dict.items():
        if isinstance(vars_dict, dict):
            for var_name in vars_dict.keys():
                if var_name not in variables:
                    variables[var_name] = []
                variables[var_name].append(category)
    return variables

ah_variables = get_all_variables(metadata_ah)
oc_variables = get_all_variables(metadata_oc)

print(f"Total variables in AH: {len(ah_variables)}")
print(f"Total variables in OC: {len(oc_variables)}")

Total variables in AH: 406
Total variables in OC: 690


## 3. Variable Comparison

In [5]:
# Variables in both files
common_variables = set(ah_variables.keys()) & set(oc_variables.keys())
print(f"Variables in both files: {len(common_variables)}")

# Variables only in AH
ah_only = set(ah_variables.keys()) - set(oc_variables.keys())
print(f"Variables only in AH: {len(ah_only)}")

# Variables only in OC
oc_only = set(oc_variables.keys()) - set(ah_variables.keys())
print(f"Variables only in OC: {len(oc_only)}")

Variables in both files: 392
Variables only in AH: 14
Variables only in OC: 298


In [6]:
# Show variables only in AH
print("Variables only in AH:")
for var in sorted(ah_only):
    print(f"  {var}: {ah_variables[var]}")

Variables only in AH:
  Q1012C_MOR: ['SECTION VI: IDENTITY & RELIGIOUS PRACTICE']
  Q1034: ['SECTION XII: DEMOGRAPHICS']
  Q104A_2: ['SECTION V: MIGRATION & IMMIGRATION']
  Q104B: ['SECTION V: MIGRATION & IMMIGRATION']
  Q130: ['SECTION II: STATE OF THE ECONOMY']
  Q201A_41_Gaza: ['SECTION III: TRUST & GOVERNMENT PERFORMANCE']
  Q412A: ['SECTION IX:  MEDIA']
  Q432: ['SECTION IX:  MEDIA']
  Q622C_IRQ: ['SECTION VIII: GENDER NORMS & ATTITUDES']
  Q622E_IRQ: ['SECTION VIII: GENDER NORMS & ATTITUDES']
  Q629: ['SECTION VIII: GENDER NORMS & ATTITUDES']
  QKUW34: ['SECTION XI: COUNTRY -SPECIFIC QUESTIONS']
  QKUW40: ['SECTION XI: COUNTRY -SPECIFIC QUESTIONS']
  QMOR7: ['SECTION XI: COUNTRY -SPECIFIC QUESTIONS']


In [7]:
# Show variables only in OC
print("Variables only in OC:")
for var in sorted(oc_only):
    print(f"  {var}: {oc_variables[var]}")

Variables only in OC:
  BLOCK_ID: ['demographics']
  COUNTRY: ['demographics']
  Q1012C_MOR_1: ['religion']
  Q1012C_MOR_2: ['demographics']
  Q1012C_MOR_3: ['demographics']
  Q1012C_MOR_4: ['demographics']
  Q1012C_MOR_90: ['demographics']
  Q1012C_MOR_98: ['demographics']
  Q1012C_MOR_99: ['demographics']
  Q1014A: ['demographics']
  Q1015: ['economic_views']
  Q1034_1: ['economic_views']
  Q1034_2: ['economic_views']
  Q1034_3: ['economic_views']
  Q1034_4: ['economic_views']
  Q1034_5: ['economic_views']
  Q1034_6: ['economic_views']
  Q1034_7: ['economic_views']
  Q1034_8: ['economic_views']
  Q1034_9: ['economic_views']
  Q1034_98: ['economic_views']
  Q1034_99: ['economic_views']
  Q104A_2_1: ['international_relations']
  Q104A_2_2: ['international_relations']
  Q104A_2_3: ['international_relations']
  Q104A_2_4: ['international_relations']
  Q104A_2_5: ['international_relations']
  Q104A_2_6: ['international_relations']
  Q104A_2_7: ['international_relations']
  Q104A_2_90: ['i

In [15]:
# split variable names Q1012C_MOR_1 -> Q1012C_MOR
def split_variable_name(var_name):
    parts = var_name.rsplit('_', 1)
    if len(parts) == 2 and parts[1].isdigit():
        return parts[0]
    return var_name

split_variable_names_oc = {split_variable_name(var) for var in oc_only}
split_variable_names_oc

{'BLOCK_ID',
 'COUNTRY',
 'Q1012C_MOR',
 'Q1014A',
 'Q1015',
 'Q1034',
 'Q104A_2',
 'Q104B',
 'Q130',
 'Q201A_41_GAZA',
 'Q412A',
 'Q432',
 'Q622C_IRQ',
 'Q622E_IRQ',
 'Q629',
 'Q867A',
 'Q867B',
 'Q872A',
 'Q872B',
 'Q872C',
 'Q872C_PAL',
 'Q873',
 'Q879',
 'Q881A',
 'Q881B',
 'Q882',
 'Q883',
 'Q884A',
 'Q884B',
 'Q885',
 'Q886',
 'QGAZA1',
 'QGAZA2',
 'QGAZA3A',
 'QGAZA3B',
 'QGAZA4A',
 'QGAZA4B',
 'QGAZA5A',
 'QGAZA5A2',
 'QGAZA5B',
 'QGAZA5C',
 'QGAZA6',
 'QKUW34',
 'QKUW40',
 'QMOR7',
 'STRATUM',
 'WT'}

In [17]:
# search for variable names split_variable_names_oc that don't exist in ah_variables
split_variable_names_ah = {split_variable_name(var) for var in ah_only}
split_variable_names_ah_missing = split_variable_names_oc - split_variable_names_ah
split_variable_names_ah_missing

{'BLOCK_ID',
 'COUNTRY',
 'Q1014A',
 'Q1015',
 'Q104A_2',
 'Q201A_41_GAZA',
 'Q867A',
 'Q867B',
 'Q872A',
 'Q872B',
 'Q872C',
 'Q872C_PAL',
 'Q873',
 'Q879',
 'Q881A',
 'Q881B',
 'Q882',
 'Q883',
 'Q884A',
 'Q884B',
 'Q885',
 'Q886',
 'QGAZA1',
 'QGAZA2',
 'QGAZA3A',
 'QGAZA3B',
 'QGAZA4A',
 'QGAZA4B',
 'QGAZA5A',
 'QGAZA5A2',
 'QGAZA5B',
 'QGAZA5C',
 'QGAZA6',
 'STRATUM',
 'WT'}

## 4. Changes made to add missiong variables from OC to v2_AH

In [None]:
# {'BLOCK_ID', -> no question
#  'COUNTRY', -> no question
#  'Q1014A', -> added question Q1014A to v2_AH
#  'Q1015', -> added question Q1015 to v2_AH
#  'Q104A_2', -> added question Q104A_1 to v2_AH (open-response question)
#  'Q201A_41_GAZA', -> already in v1_AH, (only with different casing)
#  'Q867A', -> added question Q867A to v2_AH
#  'Q867B', -> added question Q867B to v2_AH
#  'Q872A', -> added question Q872A to v2_AH
#  'Q872B', -> added question Q872B to v2_AH
#  'Q872C', -> added question Q872C to v2_AH
#  'Q872C_PAL', -> added question Q872C_PAL to v2_AH
#  'Q873', -> added question Q873 to v2_AH (+ rephrasing of question)
#  'Q879', -> added question Q879 to v2_AH
#  'Q881A', -> added question Q881A to v2_AH (+ rephrasing of question)
#  'Q881B', -> added question Q881B to v2_AH (+ rephrasing of question)
#  'Q882', -> added question Q882 to v2_AH
#  'Q883', -> added question Q883 to v2_AH
#  'Q884A', -> added question Q884A to v2_AH
#  'Q884B', -> added question Q884B to v2_AH (+ rephrasing of question)
#  'Q885', -> added question Q885 to v2_AH
#  'Q886', -> added question Q886 to v2_AH
#  'QGAZA1', -> added question QGAZA1 (merged to multi-select) to v2_AH
#  'QGAZA2', -> added question QGAZA2 to v2_AH
#  'QGAZA3A', -> added question QGAZA3A to v2_AH
#  'QGAZA3B', -> added question QGAZA3B to v2_AH
#  'QGAZA4A', -> added question QGAZA4A to v2_AH
#  'QGAZA4B', -> added question QGAZA4B to v2_AH
#  'QGAZA5A', -> added question QGAZA5A to v2_AH
#  'QGAZA5A2', -> added question QGAZA5A2 to v2_AH
#  'QGAZA5B', -> added question QGAZA5B to v2_AH
#  'QGAZA5C', -> added question QGAZA5C to v2_AH
#  'QGAZA6', -> added question QGAZA6 to v2_AH
#  'STRATUM', -> no question
#  'WT'} -> no question