In [2]:
import zipfile
import pandas as pd
import io
import os

# Progress marker: confirms that the imports above succeeded before any data is processed.
print("Imports completed")

def parse_metadata_file(metadata_file):
    """
    Parse a DHS metadata file (.dcf, .map, etc.) to extract variable information.

    Only INI-style ``[Variable]`` sections are read. Inside such a section the
    ``Name``, ``Label``, ``Start`` and ``Len`` keys (case-insensitive) are
    collected; every other key is ignored. A block ends at the first blank
    line, at the next ``[...]`` section header, or at the end of the input.

    Args:
        metadata_file: Iterable of text lines (for example an open text file).

    Returns:
        A list of dicts, one per variable that has a name. Each dict holds
        "name" and, when present in the file, "label", "start" and "length".
        The list is sorted by "start"; variables without one sort as 0.

    Raises:
        ValueError: If a ``Start`` or ``Len`` value is not an integer.
    """
    variables = []
    # Dict being filled while inside a [Variable] block, otherwise None.
    current_variable = None

    def keep_if_named(candidate):
        # A block without a Name cannot be mapped to a column, so drop it.
        if candidate and "name" in candidate:
            variables.append(candidate)

    for line in metadata_file:
        line = line.strip()

        if line.startswith("["):
            # A section header also closes the block that was open: blocks
            # are not guaranteed to be separated by blank lines, and without
            # this flush the pending variable would be lost.
            keep_if_named(current_variable)
            current_variable = {} if line == "[Variable]" else None
        elif current_variable is not None:
            if "=" in line:
                key, value = line.split("=", 1)
                key = key.strip().lower()
                value = value.strip().strip('"')

                if key == "name":
                    current_variable["name"] = value
                elif key == "label":
                    current_variable["label"] = value
                elif key == "start":
                    current_variable["start"] = int(value)
                elif key == "len":
                    current_variable["length"] = int(value)
            elif line == "":
                # Empty line indicates end of variable block
                keep_if_named(current_variable)
                current_variable = None

    # Add the last variable if the file ends without a blank line
    keep_if_named(current_variable)

    # Sort variables by their start position
    variables.sort(key=lambda v: v.get("start", 0))

    return variables

def load_dat_with_metadata(dat_file, metadata_file):
    """
    Load a fixed-width .dat file using the layout from its metadata file.

    Args:
        dat_file: Path or file-like object holding the fixed-width data; it
            is passed straight to ``pandas.read_fwf``.
        metadata_file: Iterable of text lines, handed to
            ``parse_metadata_file`` to obtain the column layout.

    Returns:
        A pandas DataFrame with one column per variable whose position is
        known. If the metadata yields no usable variable, an empty DataFrame
        is returned instead of calling the parser with no columns.
    """
    variables = parse_metadata_file(metadata_file)

    # Prepare column specifications for pd.read_fwf
    colspecs = []
    names = []
    for var in variables:
        start = var.get("start")
        if start is None or start < 1:
            # Without a valid 1-based start the column cannot be located;
            # defaulting it would yield a negative offset and an empty slice.
            continue
        begin = start - 1  # Adjust for 0-based index
        length = var.get("length", 1)
        colspecs.append((begin, begin + length))
        names.append(var["name"])

    if not colspecs:
        # Nothing to slice out of the data file.
        return pd.DataFrame()

    # Load the .dat file as fixed-width format. The encoding only takes
    # effect when dat_file is a path or binary handle.
    df = pd.read_fwf(dat_file, colspecs=colspecs, names=names, encoding='latin1')

    return df

def save_to_csv(data_dict, output_folder):
    """
    Write each dataset in ``data_dict`` to its own CSV file.

    The output folder is created if it does not exist. Every entry is written
    to ``<output_folder>/<dataset_name>.csv`` without the index column, and a
    confirmation line is printed per file.
    """
    os.makedirs(output_folder, exist_ok=True)

    for dataset_name in data_dict:
        frame = data_dict[dataset_name]
        target_name = f"{dataset_name}.csv"
        csv_file_path = os.path.join(output_folder, target_name)
        frame.to_csv(csv_file_path, index=False)
        print(f"Saved {dataset_name} to {csv_file_path}")

def _find_metadata_member(member_names, dat_file):
    """Return the first archive member that looks like metadata for dat_file, or None."""
    metadata_extensions = ('.dct', '.dcf', '.map', '.sas', '.sps', '.do')
    dat_dir = os.path.dirname(dat_file)
    # Compare base names case-insensitively, in line with the
    # case-insensitive extension checks.
    dat_base = os.path.splitext(os.path.basename(dat_file))[0].lower()

    for member in member_names:
        if os.path.dirname(member) != dat_dir:
            continue
        member_base = os.path.splitext(os.path.basename(member))[0].lower()
        if member_base.startswith(dat_base) and member.lower().endswith(metadata_extensions):
            return member
    return None


def load_data_from_zip(zip_path, output_folder):
    """
    Load every .dat dataset found in a zip archive and save each one as CSV.

    For each ``.dat`` member, the first member in the same archive directory
    whose base name starts with the .dat base name and whose extension is one
    of .dct/.dcf/.map/.sas/.sps/.do is used as its metadata file. Datasets
    without such a companion are reported and skipped.

    Args:
        zip_path: Path of the zip archive to read.
        output_folder: Directory that receives one CSV per loaded dataset.

    Returns:
        A dict mapping the .dat base name (without extension) to its DataFrame.
    """
    print(f"Starting to load data from: {zip_path}")
    data_dict = {}

    with zipfile.ZipFile(zip_path, 'r') as z:
        print(f"Zip file opened: {zip_path}")

        # List the archive once; it is reused for every .dat file.
        member_names = z.namelist()

        # Find all .dat files
        dat_files = [f for f in member_names if f.lower().endswith('.dat')]

        for dat_file in dat_files:
            print(f"Found .dat file: {dat_file}")

            metadata_file = _find_metadata_member(member_names, dat_file)
            if not metadata_file:
                print(f"No corresponding metadata file found for: {dat_file}")
                continue

            print(f"Found corresponding metadata file: {metadata_file}")
            with z.open(dat_file) as dat_f, z.open(metadata_file) as meta_f:
                # Decode explicitly as latin-1: it accepts every byte value
                # and keeps the result independent of the platform's default
                # locale. An encoding given to read_fwf is not applied to a
                # handle that is already text.
                dat_text = io.TextIOWrapper(dat_f, encoding='latin1')
                meta_text = io.TextIOWrapper(meta_f, encoding='latin1')
                df = load_dat_with_metadata(dat_text, meta_text)

            key = os.path.splitext(os.path.basename(dat_file))[0]
            data_dict[key] = df
            print(f"Added to data_dict with key: {key}")

    save_to_csv(data_dict, output_folder)

    print(f"Finished loading data from: {zip_path}. Total datasets loaded: {len(data_dict)}")
    return data_dict

# Input archives to process, one DHS download per entry.
# NOTE(review): the inputs live under 'C:/dhs data' (with a space) while the
# output goes to 'C:/dhs_data' (with an underscore) - confirm this is intended.
zip_files = [
    'C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip',
    'C:/dhs data/ZM_2018_DHS_09242024_89_219681.zip',
]
output_folder = 'C:/dhs_data/processed_csv'

# Convert each archive in turn; every dataset found is written out as CSV.
for zip_file in zip_files:
    print(f"Processing zip file: {zip_file}")
    load_data_from_zip(zip_file, output_folder)

print("Script execution completed")


Imports completed
Processing zip file: C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip
Starting to load data from: C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip
Zip file opened: C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip
Examining file: ZMMS5AFLSR/ZMSR5A.DOC
Examining file: ZMTB5AFLSR/ZMSR5A.DOC
Examining file: ZMTB5AFLSR/ZMTB5AFLSR.SAS
Examining file: ZMTB5AFLSR/ZMSR5A.XLS
Examining file: ZMMS5AFLSR/
Examining file: ZMTB5AFLSR/ZMSR5A.MAP
Examining file: ZMTB5AFLSR/
Examining file: ZMMS5AFLSR/ZMMS5AFLSR.SAS
Examining file: ZMMS5AFLSR/ZMMS5AFLSR.DAT
Found .dat file: ZMMS5AFLSR/ZMMS5AFLSR.DAT
No corresponding metadata file found for: ZMMS5AFLSR/ZMMS5AFLSR.DAT
Examining file: ZMMS5AFLSR/ZMSR5A.FRQ
Examining file: ZMTB5AFLSR/ZMSR5A.FRQ
Examining file: ZMTB5AFLSR/ZMTB5AFLSR.DAT
Found .dat file: ZMTB5AFLSR/ZMTB5AFLSR.DAT
No corresponding metadata file found for: ZMTB5AFLSR/ZMTB5AFLSR.DAT
Examining file: ZMMS5AFLSR/ZMSR5A.XLS
Examining file: ZMTB5AFLSR/ZMTB5AFLSR.DCT
Examining