In [9]:
# !pip install pyreadstat

## START

I saved all the 'cleaned' up datasets as a csv to the folder clean_data for ease of viewing.

Note: For years 1985 1986 and 1988 the data explaining each variable is somewhat corrupted, although historically variable numbers are consistent, so we could use the variable names from previous or following years if necessary. For now I have just saved them without variable names.

For years 1992 to 1997, as well as 2002 onwards the dataset type changes from .tab to .por data. I have not yet figured out how to clean and read that, though the package pyreadstat is apparently able to open .por data. I suspect one of the authors of papers that makes use of this dataset might have a cleaner version of the dataset and/or an explanation for how they cleaned it, so I am going to email some of them. Most of these years also dont have a .dic file or a .txt file as in previous years containing all the variable names, questions, and corresponding answer scales. For years 2004 onwards the data changes again, and there are filetypes .sav, .dta, and .ovz that I have not figured out how to read either! I suspect one of them would contain the variable names but no clue which because I cannot open them!

In 2004 specifically (and I think one or two years after as well) they split the dataset into an a and b section:
P1692a: Respondents national sample
P1692b: Respondents disadvantaged neighbourhoods
So that's something to keep in mind as well.

Once all the data is cleaned and in one format I think a next step would be to add it all to one big file so we can see for what questions there is consistent year-by-year data.

In [10]:
import pandas as pd
import pickle 
import os

year = 1981 ## alter year to check different files

### FOR TAB DATA
# Read the .tab file
tab_file = [f for f in os.listdir(f'./{year}') if f.endswith('.tab')]
if len(tab_file) != 1:
    raise ValueError('should be only one tab file in the current directory')
print(tab_file[0])
data_number = tab_file[0].split(".")[0]

p0742.tab


#### Old code

#### New Code

In [11]:
import re
from pathlib import Path

vardict = {}
scaledict = {}

current_var = None
collecting_values = False
value_lines = []

import re

def parse_dic_metadata(dic_path):
    with open(dic_path, "r", encoding="latin-1", errors="ignore") as f:
        lines = [line.rstrip("\n") for line in f]

    vardict = {}
    scaledict = {}
    current_var = None
    collecting_values = False
    value_lines = []

    # Regex patterns
    var_header = re.compile(r"^([A-Z]+\d+[A-Z]?)\s+.*?\s+(.+?)\s+\d+$")  # variable name + label
    value_line = re.compile(r"^\s*(-?\d+)\s*(?:[A-Z]+)?\s+(.*\S)")        # value line with optional letter code
    value_section_start = re.compile(r"\bValue\b.*\bLabel\b", re.IGNORECASE)

    for line in lines:
        line = line.strip()

        # Skip blank lines and page headers or file headers
        if not line or re.match(r'^\d{2} \w{3} \d{2}', line) or line.startswith("File:"):
            continue

        # Detect new variable
        m = var_header.match(line)
        if m:
            if current_var and value_lines:
                scaledict[current_var] = "\n".join(value_lines).strip()
                value_lines = []

            current_var = m.group(1).strip()
            vardict[current_var] = m.group(2).strip()
            collecting_values = False
            continue

        # Detect start of value section
        if value_section_start.search(line):
            collecting_values = True
            continue

        # Collect value lines
        if collecting_values:
            # Skip non-value lines inside the section
            if re.match(r'^(Print Format|Write Format|Missing Values):', line):
                continue

            # Stop collecting if next variable starts
            if var_header.match(line):
                collecting_values = False
                continue

            # Match value lines
            v = value_line.match(line)
            if v:
                val, lbl = v.groups()
                value_lines.append(f"{val} = {lbl.strip()}")
            else:
                # If line is continuation of previous label
                if value_lines:
                    value_lines[-1] += " " + line.strip()

    # Save last variable
    if current_var and value_lines:
        scaledict[current_var] = "\n".join(value_lines).strip()

    return vardict, scaledict



def parse_txt_metadata(txt_path):
    # Read the corresponding var names and scales for answers
    file1 = open(txt_path, "r")
    alltext = file1.read()
    varlabels = alltext.split("VAR LABELS")[1:-1]
    varlabels = " ".join(varlabels)
    valuelabels = alltext.split("VALUE LABELS")[1:-1]
    valuelabels = " ".join(valuelabels)
    file1.close()
    
    # Clean var names and scales text and turn into dictionary
    varnames = varlabels.split("/")
    varnames = [' '.join(item.split()) for item in varnames]
    vardict = {n.split(" ")[0]: " ".join(n.split(" ")[1:]) for n in varnames}

    varscales = valuelabels.split("/")
    varscales = [' '.join(item.split()) for item in varscales]
    scaledict = {n.split(" ")[0]: " ".join(n.split(" ")[1:]) for n in varscales}
    
    return vardict, scaledict



def find_dic(year):
    dat_dir = Path(str(year)) / "dat"
    if dat_dir.exists():
        dic_files = list(dat_dir.glob("*.dic"))
        if dic_files:
            return str(dic_files[0])   # return first .dic file

    year_dir = Path(str(year))
    if year_dir.exists():
        dic_files = list(year_dir.glob("*.dic"))
        if dic_files:
            return str(dic_files[0])
    return None

def find_txt(year):
    dat_dir = Path(str(year)) / "dat"
    if dat_dir.exists():
        dic_files = list(dat_dir.glob("p*.txt"))
        if dic_files:
            return str(dic_files[0])   # return first .dic file

    year_dir = Path(str(year))
    if year_dir.exists():
        dic_files = list(year_dir.glob("p*.txt"))
        if dic_files:
            return str(dic_files[0])
    return None

dic_path = find_dic(year)
txt_path = find_txt(year)

if dic_path:
    print("dic found")
    dic_path = f"{year}/dat/{data_number}.dic"
    with open(dic_path, "r", encoding="latin-1") as f:
        text = f.read()
    vardict, scaledict = parse_dic_metadata(dic_path)
elif txt_path:
    print("txt found")
    txt_path = f"{year}/dat/{data_number}.txt"
    vardict, scaledict = parse_txt_metadata(txt_path)
else:
    print("No .dic or .txt file found")

print("Variables parsed:", len(vardict))
print("Scales parsed:", len(scaledict))

#print(vardict["VAR003"])
#print(scaledict["VAR003"])

# Save variable name dictionary to separate folder clean_data
with open(f'clean_data/vardict_{year}.pkl', 'wb') as f:
    pickle.dump(vardict, f)
with open(f'clean_data/scaledict_{year}.pkl', 'wb') as f:
    pickle.dump(scaledict, f)

with open(f'clean_data/vardict_{year}.pkl', 'rb') as f:
    loaded_vardict = pickle.load(f)
with open(f'clean_data/scaledict_{year}.pkl', 'rb') as f:
    loaded_scaledict = pickle.load(f)


txt found
Variables parsed: 385
Scales parsed: 347


In [12]:
#print(vardict)
print(loaded_scaledict)

{'VAR001': '(1)MAN-JONGEN (2)VROUW-MEISJE', 'VAR003': '(1)GEHUWD 1E KEER (2)GEHUWD,EERDER (3)ONGEHUWD (4)GESCHEIDEN (5)WEDUWE--NAAR', 'VAR006': '(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-6)GEEN OPGAVE', 'VAR007': '(1)JA (2)NEEN', 'VAR008': '(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-6)GEEN OPGAVE (-5)VR NIET GEST', 'VAR009': '(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-3)WEET NIET (-6)GEEN OPGAVE (-5)VR NIET GEST', 'VAR010': '(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-3)WEET NIET (-6)GEEN OPGAVE (-5)VR NIET GEST', 'VAR012': '(1)MINDER DAN 15 (2)15 TOT 25 (3)25 OF MEER (-6)GEEN OPGAVE', 'VAR013': '(1)GEEN BEROEP (2)OVERHEID (3)LOONDIENST (4)MW GEZINSLID (5)ZELFSTANDIG (-6)GEEN OPGAVE (-5)VR NIET GEST', 'VAR014': '(1)HUISVROUW (2)WERKLOOS (4)GEPENSIONEERD (5)MILITAIR (6)ZOEK VH EERST (7)SCHOLIER-STUD (3)WAO-AAW (8)ANDERS (-6)GEEN OPGAVE (-5)VR NIET GEST', 'VAR015': '(1)

In [13]:
# Keep variable IDs as columns
df = pd.read_csv(f'{year}/{tab_file[0]}', sep='\t')

# Build MultiIndex tuples
multi_columns = []
for var in df.columns:
    var_id = var
    var_question = loaded_vardict.get(var, "")
    var_scale = loaded_scaledict.get(var, "")
    multi_columns.append((var_id, var_question, var_scale))

# Create the MultiIndex
df.columns = pd.MultiIndex.from_tuples(multi_columns,
                                       names=["Variable", "Question", "Scale"])

df.head()

Variable,RESPNR,VAR001,VAR002,VAR003,VAR006,VAR007,VAR008,VAR009,VAR010,VAR012,...,VAR921,VAR922,VAR923,VAR924,VAR925,VAR926,VAR927,VAR928,VAR929,OCT462
Question,Unnamed: 1_level_1,SEXE RESPONDENT,LEEFTIJD RESPONDENT,BURGERLIJKE STAAT,LAATST GENOTEN OPLEIDING,LAATST GENOTEN OPLEIDING VOLTOOID,LAATST VOLTOOIDE OPLEIDING,LAATST VOLTOOIDE OPLEIDING HOOFD VH GEZI,LAATST VOLTOOIDE OPLEIDING VADER,AANTAL UREN AAN HET WERK PER WEEK,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Scale,Unnamed: 1_level_2,(1)MAN-JONGEN (2)VROUW-MEISJE,Unnamed: 3_level_2,"(1)GEHUWD 1E KEER (2)GEHUWD,EERDER (3)ONGEHUWD (4)GESCHEIDEN (5)WEDUWE--NAAR",(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-6)GEEN OPGAVE,(1)JA (2)NEEN,(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-6)GEEN OPGAVE (-5)VR NIET GEST,(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-3)WEET NIET (-6)GEEN OPGAVE (-5)VR NIET GEST,(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-3)WEET NIET (-6)GEEN OPGAVE (-5)VR NIET GEST,(1)MINDER DAN 15 (2)15 TOT 25 (3)25 OF MEER (-6)GEEN OPGAVE,...,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1,1,23,3,5,1,-5,-5,7,3,...,0,0,0,0,0,0,0,0,0,1
1,2,1,54,3,7,2,4,-5,7,3,...,0,0,0,0,0,0,0,0,0,1
2,3,2,35,3,7,1,-5,-5,7,-6,...,0,0,0,0,0,0,0,0,0,3
3,4,2,55,1,5,1,-5,7,-6,3,...,0,0,0,0,0,0,0,0,0,0
4,6,2,73,5,1,1,-5,-5,1,3,...,0,0,0,0,0,0,0,0,0,2


In [14]:
# ---- FLATTEN MULTIINDEX FOR CSV OUTPUT ----
# Format: VAR003 | BURGERLIJKE STAAT | (1)... etc
df_flat = df.copy()
df_flat.columns = [' | '.join(filter(None, col)) for col in df.columns.to_list()]

# ---- SAVE TO CSV ----
output_path = f"clean_data/{year}.csv"
df_flat.to_csv(output_path, index=False)

print(f"Saved dataframe to {output_path}")

Saved dataframe to clean_data/1981.csv


In [15]:
for item in loaded_vardict:
    if "PROTEST" in loaded_vardict[item]:
        print(loaded_vardict[item])
    if "DEMONSTR" in loaded_vardict[item]:
        print(loaded_vardict[item])
    if "VRIJHEID" in loaded_vardict[item]:
        print(loaded_vardict[item])
    if "protest" in loaded_vardict[item]:
        print(loaded_vardict[item])
    if "demonstr" in loaded_vardict[item]:
        print(loaded_vardict[item])
    if "vrijheid" in loaded_vardict[item]:
        print(loaded_vardict[item])

VRIJHEID TOT DEMONSTREREN
VRIJHEID TOT DEMONSTREREN
VRIJHEID TOT OPEN KRITIEK OP KONINGSHUIS
VRIJHEID TOT STAKEN VOOR LOONSVERHOGING
VRIJHEID TOT WEIGERING MILITAIRE DIENST
VRIJHEID TOT BEZETTING VAN GEBOUWEN
VRIJHEID TOT OPENB SCHRIJVEN WAT MEN WIL
VRIJHEID TOT OPENB ZEGGEN WAT MEN WIL
WET ONRECHTV: ZITDEMONSTR,PROTESTOPT,ENZ
WET ONRECHTV: ZITDEMONSTR,PROTESTOPT,ENZ
POLITIE ARRESTEERT DEELNEMERS PROTEST
PROTESTANTEN IN DOEN EN LATEN VRIJLATEN


In [16]:
df.head()

Variable,RESPNR,VAR001,VAR002,VAR003,VAR006,VAR007,VAR008,VAR009,VAR010,VAR012,...,VAR921,VAR922,VAR923,VAR924,VAR925,VAR926,VAR927,VAR928,VAR929,OCT462
Question,Unnamed: 1_level_1,SEXE RESPONDENT,LEEFTIJD RESPONDENT,BURGERLIJKE STAAT,LAATST GENOTEN OPLEIDING,LAATST GENOTEN OPLEIDING VOLTOOID,LAATST VOLTOOIDE OPLEIDING,LAATST VOLTOOIDE OPLEIDING HOOFD VH GEZI,LAATST VOLTOOIDE OPLEIDING VADER,AANTAL UREN AAN HET WERK PER WEEK,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Scale,Unnamed: 1_level_2,(1)MAN-JONGEN (2)VROUW-MEISJE,Unnamed: 3_level_2,"(1)GEHUWD 1E KEER (2)GEHUWD,EERDER (3)ONGEHUWD (4)GESCHEIDEN (5)WEDUWE--NAAR",(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-6)GEEN OPGAVE,(1)JA (2)NEEN,(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-6)GEEN OPGAVE (-5)VR NIET GEST,(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-3)WEET NIET (-6)GEEN OPGAVE (-5)VR NIET GEST,(1)LO (2)LO+VAKOND. (3)VAO (4)VAO+VAKOND. (5)MO (6)MO-HBO+VAKOND (7)WO (-3)WEET NIET (-6)GEEN OPGAVE (-5)VR NIET GEST,(1)MINDER DAN 15 (2)15 TOT 25 (3)25 OF MEER (-6)GEEN OPGAVE,...,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1,1,23,3,5,1,-5,-5,7,3,...,0,0,0,0,0,0,0,0,0,1
1,2,1,54,3,7,2,4,-5,7,3,...,0,0,0,0,0,0,0,0,0,1
2,3,2,35,3,7,1,-5,-5,7,-6,...,0,0,0,0,0,0,0,0,0,3
3,4,2,55,1,5,1,-5,7,-6,3,...,0,0,0,0,0,0,0,0,0,0
4,6,2,73,5,1,1,-5,-5,1,3,...,0,0,0,0,0,0,0,0,0,2
