# Extract Arab Barometer Metadata

## 1. Setup and Dependencies

In [1]:
import os
import json
from pathlib import Path
from typing import Dict, Optional
from openai import AzureOpenAI
import PyPDF2
from dotenv import load_dotenv
from pydantic import BaseModel, Field

load_dotenv()

True

## 2. Configure Azure OpenAI

In [2]:
# Azure OpenAI configuration
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

## 3. Load PDF Content

In [None]:
# Path to the Arab Barometer PDF
PDF_PATH = Path("data/ENG-Arab-Barometer-Wave-VIII-Questionnaire-RELEASE-FIN-NOV-2024-1.pdf") # TODO: update path

def extract_text_from_pdf(pdf_path, start_page=1):
    """Extract text content from a PDF file starting from a specific page.

    Args:
        pdf_path: Path to the PDF file
        start_page: Page number to start extraction from (1-indexed, default=1)
    """
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        total_pages = len(pdf_reader.pages)
        pages_to_extract = total_pages - start_page + 1
        print(f"Total pages: {total_pages}")
        print(f"Starting extraction from page {start_page} (extracting {pages_to_extract} pages)")

        for page_num in range(start_page - 1, total_pages):
            text += f"\n--- Page {page_num + 1} ---\n"
            text += pdf_reader.pages[page_num].extract_text()

    return text

pdf_text = extract_text_from_pdf(PDF_PATH)
print(f"\nExtracted {len(pdf_text)} characters from PDF")
print(f"\nFirst 500 characters:\n{pdf_text[:500]}")

Total pages: 93
Starting extraction from page 1 (extracting 93 pages)

Extracted 143781 characters from PDF

First 500 characters:

--- Page 1 ---
2023-2024Arab Barometer Wa ve VIII
Questionnaire
--- Page 2 ---
  
 
Arab Barometer Wave VIII Source Questionnaire  
 
September 2024  
 
 
TABLE OF CONTENTS  
 
SAMPLING VARIABLES  4 
SECTION I  CORE DEMOGRAPHICS  5 
SECTION II  STATE OF THE ECONOMY  6 
SECTION III  TRUST & GOVERNMENT PERFORMANCE  11 
SECTION IV  ENGAGEMENT & GOVERNANCE PREFERENCES  17 
SECTION V  MIGRATION & IMMIGRATION  27 
SECTION VI  IDENTITY & RELIGIOUS PRACTICE  31 
SECTION VII  CLIMATE CHANGE & THE ENVIRO


## 4. Define Extraction Prompt

In [5]:
EXTRACTION_PROMPT = """Task: Convert survey questionnaire data to structured JSON format

Parse survey question documentation and convert each question following these rules:

Parsing rules:

1. Section (key): Use the exact section name as top-level key (e.g., "CORE DEMOGRAPHICS", "POLITICAL ATTITUDES")
2. Variable name (nested key): Use the exact variable name as provided (e.g., "Q100", "SEX2", "pxmltn")

3. Description: 
   - If a variable label is provided, extract it (remove any variable name prefix)
   - If not provided, infer a brief, descriptive summary from the question (e.g., "respondent's gender", "political party preference")
   - Keep it concise, typically 2-4 words

4. Question wording:
   - Remove survey artifacts like "Looking at this card...", "I'm going to read...", interviewer instructions, etc.
   - Rephrase into natural, direct question format as if asked in person-to-person conversation
   - Example: "Looking at this card, tell me how you feel about..." → "How do you feel about..."
   - Document significant changes in the "notes" field

5. Values mapping:
   - Map all numeric/categorical codes to their corresponding text labels
   - Skip variables marked as "String variable" or open-ended text fields - do NOT include these in the output
   - Preserve special codes (e.g., 9995=Other, 9999=Don't know, -1=Missing)

6. Likert scales with incomplete labels:
   - If scale has only endpoint labels (e.g., 0="Strongly oppose", 10="Strongly support"), collapse to 5-point scale
   - Assign neighboring values the same label to create natural gradations
   - Example for 0-10 scale:
     • 0-1 → "Strongly oppose"
     • 2-3 → "Somewhat oppose"  
     • 4-6 → "Neither oppose nor support"
     • 7-8 → "Somewhat support"
     • 9-10 → "Strongly support"
   - Document the collapse in "notes": "Collapsed from 11-point to 5-point scale"

7. Notes field (include only when applicable):
   - Significant rewording of question text
   - Scale collapses or category modifications
   - Any other transformations made for clarity

The output format should be:
{
  "SECTION NAME 1": {
    "VARIABLE_1": {
      "description": "...",
      "question": "...",
      "values": {...}
    },
    "VARIABLE_2": {
      "description": "...",
      "question": "...",
      "values": {...},
      "notes": "..." 
    }
  },
  "SECTION NAME 2": {
    ...
  }
}

---

Example output structure:

{
  "CORE DEMOGRAPHICS": {
    "AGE": {
      "description": "Respondent's age",
      "question": "How old are you?",
      "values": {
        "998": "Refused",
        "999": "Don't know",
        "-1": "Missing"
      }
    }
  },
  "POLITICAL ATTITUDES": {
    "pxmltn": {
      "description": "Political party identification",
      "question": "Which political party do you identify with most?",
      "values": {
        "1": "Democrat",
        "2": "Republican",
        "3": "Independent",
        "4": "Other party",
        "5": "No party",
        "99": "Refused"
      },
      "notes": "Removed survey artifact: 'Looking at this card'"
    },
    "Q_112": {
      "description": "Trust in government",
      "question": "How much do you trust the government?",
      "values": {
        "0": "Do not trust at all",
        "1": "Do not trust at all",
        "2": "Slightly trust",
        "3": "Slightly trust",
        "4": "Somewhat trust",
        "5": "Somewhat trust",
        "6": "Somewhat trust",
        "7": "Mostly trust",
        "8": "Mostly trust",
        "9": "Trust completely",
        "10": "Trust completely"
      },
      "notes": "Collapsed 11-point scale to 5 meaningful categories for natural language labeling"
    }
  }
}

---

Now, parse the following survey questionnaire content:"""

## 5. Extract Metadata

In [6]:
# function with chunking

def extract_metadata_with_chunking(text, chunk_size=50000):
    """
    Extract metadata from PDF text using chunking strategy.
    
    Args:
        text: The full PDF text to process
        chunk_size: Number of characters per chunk (default: 50000)
    
    Returns:
        Dictionary containing all extracted metadata merged from chunks
    """
    print(f"Text length: {len(text):,} characters")
    print(f"Chunk size: {chunk_size:,} characters")
    
    # Split text into chunks based on page markers to avoid breaking questions
    pages = text.split('\n--- Page ')
    
    chunks = []
    current_chunk = ""
    
    for page in pages:
        page_text = '\n--- Page ' + page if page != pages[0] else page
        
        # If adding this page exceeds chunk size, save current chunk and start new one
        if len(current_chunk) + len(page_text) > chunk_size and current_chunk:
            chunks.append(current_chunk)
            current_chunk = page_text
        else:
            current_chunk += page_text
    
    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk)
    
    print(f"Split into {len(chunks)} chunks")
    
    # Process each chunk
    all_metadata = {}
    
    for i, chunk in enumerate(chunks):
        print(f"\nProcessing chunk {i+1}/{len(chunks)} ({len(chunk):,} characters)...")
        
        messages = [
            {
                "role": "system",
                "content": "You are a survey data parsing assistant. Extract survey questions and convert them to structured JSON format following the provided rules exactly."
            },
            {
                "role": "user",
                "content": f"{EXTRACTION_PROMPT}\n\n{chunk}"
            }
        ]
        
        try:
            completion = client.chat.completions.create(
                model=AZURE_OPENAI_DEPLOYMENT,
                messages=messages,
                temperature=0.1,
                response_format={"type": "json_object"}
            )
            
            response_content = completion.choices[0].message.content
            chunk_metadata = json.loads(response_content)
            
            # Merge chunk metadata into all_metadata
            for section_name, section_data in chunk_metadata.items():
                if section_name not in all_metadata:
                    all_metadata[section_name] = {}
                
                if isinstance(section_data, dict):
                    all_metadata[section_name].update(section_data)
            
            # Count variables in this chunk
            chunk_var_count = sum(len(v) for v in chunk_metadata.values() if isinstance(v, dict))
            print(f"  Extracted {len(chunk_metadata)} sections with {chunk_var_count} variables")
            
        except Exception as e:
            print(f"  Error processing chunk {i+1}: {e}")
            import traceback
            traceback.print_exc()
    
    # Final statistics
    total_sections = len(all_metadata)
    total_variables = sum(len(section) for section in all_metadata.values() if isinstance(section, dict))
    
    print(f"Chunked extraction complete!")
    print(f"Total sections: {total_sections}")
    print(f"Total variables: {total_variables}")
    
    return all_metadata

# all_metadata_chunked = extract_metadata_with_chunking(pdf_text, chunk_size=50000)

In [None]:
# function without chunking

def extract_metadata_from_text(text):
    """Extract metadata from the entire PDF text using Azure OpenAI."""
    print("Processing entire PDF in one request...")
    
    messages = [
        {
            "role": "system",
            "content": "You are a survey data parsing assistant. Extract survey questions and convert them to structured JSON format following the provided rules exactly."
        },
        {
            "role": "user",
            "content": f"{EXTRACTION_PROMPT}\n\n{text}"
        }
    ]
    
    try:
        completion = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=messages,
            temperature=0.1,
            response_format={"type": "json_object"}
        )
        
        # Parse the JSON response
        response_content = completion.choices[0].message.content
        print(f"\nResponse preview (first 1000 chars):\n{response_content[:1000]}\n")
        
        metadata_dict = json.loads(response_content)
        print(f"Top-level keys (sections) in response: {list(metadata_dict.keys())[:10]}")
        
        # Count total variables across all sections
        total_vars = 0
        for section_name, section_data in metadata_dict.items():
            if isinstance(section_data, dict):
                total_vars += len(section_data)
        
        print(f"Successfully parsed {len(metadata_dict)} sections with {total_vars} total variables")
        
        # Show sample section
        if metadata_dict:
            first_section = list(metadata_dict.keys())[0]
            first_section_vars = list(metadata_dict[first_section].keys())[:3]
            print(f"Sample section '{first_section}' has variables: {first_section_vars}")
        
        return metadata_dict
    
    except Exception as e:
        print(f"Error processing PDF: {e}")
        import traceback
        traceback.print_exc()
        return {}

# Process the entire PDF
all_metadata = extract_metadata_from_text(pdf_text)

print(f"Extraction complete!")
print(f"Total sections: {len(all_metadata)}")
total_variables = sum(len(section) for section in all_metadata.values() if isinstance(section, dict))
print(f"Total variables: {total_variables}")

## 6. Preview Extracted Metadata

In [94]:
# Display first few variables
print("Sample of extracted metadata:\n")
for i, (var_name, var_data) in enumerate(list(all_metadata.items())[:3]):
    print(f"Variable: {var_name}")
    print(json.dumps(var_data, indent=2))
    print("\n" + "-"*50 + "\n")
    if i >= 2:
        break

Sample of extracted metadata:

Variable: SAMPLING VARIABLES
{
  "ID": {
    "description": "Respondent ID number",
    "question": "What is the respondent ID number?",
    "values": {}
  },
  "DATE": {
    "description": "Interview date",
    "question": "What is today\u2019s date?",
    "values": {}
  },
  "PSU": {
    "description": "Primary sampling unit",
    "question": "What is the primary sampling unit number?",
    "values": {}
  },
  "Q1": {
    "description": "Governorate",
    "question": "In which governorate do you live?",
    "values": {
      "1": "Country-specific governorate list",
      "98": "Don\u2019t know",
      "99": "Refused to answer"
    }
  },
  "Q1A_PAL": {
    "description": "Palestine area",
    "question": "Do you live in the West Bank or Gaza?",
    "values": {
      "1": "West Bank",
      "2": "Gaza"
    }
  },
  "Q13": {
    "description": "Urban or rural residence",
    "question": "Is the area where you live urban, rural, or a camp?",
    "values":

## 7. Save Metadata to JSON File

In [None]:
# Save to JSON file
output_path = Path("pulled_metadata_arabbarometer.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(all_metadata, f, indent=2, ensure_ascii=False)

print(f"Metadata saved to: {output_path}")

## 8. Validation

In [7]:
# read the saved JSON file and verify
with open("pulled_metadata_arabbarometer.json", 'r', encoding='utf-8') as f:
    loaded_metadata = json.load(f)

In [8]:
# view top-level keys section names
section_names = list(loaded_metadata.keys())
print(f"Sections in loaded metadata: {section_names}")

Sections in loaded metadata: ['SAMPLING VARIABLES', 'SECTION I: CORE DEMOGRAPHICS', 'SECTION II: STATE OF THE ECONOMY', 'SECTION III: TRUST & GOVERNMENT PERFORMANCE', 'SECTION IV:  ENGAGEMENT & GOVERNANCE PREFERENCES', 'SECTION V: MIGRATION & IMMIGRATION', 'SECTION VI: IDENTITY & RELIGIOUS PRACTICE', 'SECTION VII: CLIMATE CHANGE & THE ENVIRONMENT', 'SECTION VIII: GENDER NORMS & ATTITUDES', 'SECTION IX:  MEDIA', 'SECTION X: INTERNATIONAL RELATIONS', 'SECTION XI: COUNTRY -SPECIFIC QUESTIONS', 'SECTION XII: DEMOGRAPHICS']


In [9]:
# view variable names per section
section_variable_names = {}
for section, variables in loaded_metadata.items():
    variable_names = list(variables.keys())
    section_variable_names[section] = variable_names
    print(f"Section '{section}' has variables: {variable_names[:5]}...")

section_variable_names

Section 'SAMPLING VARIABLES' has variables: ['ID', 'DATE', 'PSU', 'Q1', 'Q1A_PAL']...
Section 'SECTION I: CORE DEMOGRAPHICS' has variables: ['Q1001', 'Q1001YEAR', 'Q1001APPROX', 'Q1002', 'Q1014C']...
Section 'SECTION II: STATE OF THE ECONOMY' has variables: ['Q2061A', 'Q2061A_KUW', 'Q262', 'Q101', 'Q102']...
Section 'SECTION III: TRUST & GOVERNMENT PERFORMANCE' has variables: ['Q103', 'Q201A_1', 'Q201A_2', 'Q201A_3', 'Q201A_5']...
Section 'SECTION IV:  ENGAGEMENT & GOVERNANCE PREFERENCES' has variables: ['Q404', 'Q501D', 'Q501E_2', 'Q266', 'Q552_1A']...
Section 'SECTION V: MIGRATION & IMMIGRATION' has variables: ['Q104', 'Q104B_TUN', 'Q104A_2', 'Q104B', 'Q104C']...
Section 'SECTION VI: IDENTITY & RELIGIOUS PRACTICE' has variables: ['Q1012', 'Q1012A_MUSLIM', 'Q1012A_CHRISTIAN', 'Q1020JO2', 'Q1012B2']...
Section 'SECTION VII: CLIMATE CHANGE & THE ENVIRONMENT' has variables: ['Q540_1', 'Q540_2', 'Q540B_1', 'Q540B_2', 'Q540B_3']...
Section 'SECTION VIII: GENDER NORMS & ATTITUDES' has varia

{'SAMPLING VARIABLES': ['ID', 'DATE', 'PSU', 'Q1', 'Q1A_PAL', 'Q13'],
 'SECTION I: CORE DEMOGRAPHICS': ['Q1001',
  'Q1001YEAR',
  'Q1001APPROX',
  'Q1002',
  'Q1014C',
  'Q1014D',
  'Q1014E'],
 'SECTION II: STATE OF THE ECONOMY': ['Q2061A',
  'Q2061A_KUW',
  'Q262',
  'Q101',
  'Q102',
  'Q114',
  'Q127_1A',
  'Q127_1B',
  'Q127_2A',
  'Q127_2B',
  'Q112_2',
  'Q112B',
  'Q129A_1',
  'Q129A_2',
  'Q130',
  'Q118_1',
  'Q118_2',
  'Q123',
  'Q123B',
  'Q123_KUW'],
 'SECTION III: TRUST & GOVERNMENT PERFORMANCE': ['Q103',
  'Q201A_1',
  'Q201A_2',
  'Q201A_3',
  'Q201A_5',
  'Q201A_41',
  'Q201A_41_WB',
  'Q201A_41_Gaza',
  'Q201A_7',
  'Q201A_31B',
  'Q201A_31C',
  'Q201B_6',
  'Q201B_4',
  'Q201B_13',
  'Q201B_12',
  'Q201B_14',
  'Q201B_15',
  'Q105A',
  'Q277_1',
  'Q277_2',
  'Q204A_3',
  'Q204A_1',
  'Q204A_2',
  'Q204A_4',
  'Q204A_5',
  'Q204A_6',
  'Q204A_7',
  'Q204A_8',
  'Q204A_9',
  'Q204A_10',
  'Q204_2',
  'Q204_3',
  'Q204_11',
  'Q204_20',
  'Q273C',
  'Q273D',
  'Q273C_K