In [6]:
import zipfile
import pandas as pd
import io
import os
import csv

# Progress marker: reaching this line means the import statements above ran
# without raising.
print("Imports completed")


def load_data_from_zip(zip_path, *, encoding=None):
    """Load every ``.dat`` member of a zip archive into a pandas DataFrame.

    Each member whose name ends in ``.dat`` (case-insensitive) is parsed with
    the ``csv`` module as space-delimited text: spaces that directly follow a
    delimiter are skipped and empty rows are dropped. No type conversion is
    performed, so every parsed cell is a string and the columns are numbered
    positionally. Progress is reported with ``print``.

    Args:
        zip_path: Path to the zip archive to read.
        encoding: Text encoding used to decode the members. ``None`` (the
            default) keeps the interpreter's default text encoding, which is
            platform dependent; pass an explicit value such as ``"latin-1"``
            for reproducible results across machines.

    Returns:
        dict mapping each member's base name without extension (for example
        ``"dir/ABC.DAT"`` -> ``"ABC"``) to its DataFrame. Members in
        different folders that share a base name map to the same key; the
        one listed later in the archive wins.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
        UnicodeDecodeError: If a member cannot be decoded with the encoding
            in effect.
    """
    print(f"Starting to load data from: {zip_path}")
    data_dict = {}
    with zipfile.ZipFile(zip_path, 'r') as archive:
        print(f"Zip file opened: {zip_path}")
        for filename in archive.namelist():
            print(f"Examining file: {filename}")
            if not filename.lower().endswith('.dat'):
                continue
            print(f"Found .dat file: {filename}")

            with archive.open(filename) as raw:
                print(f"Reading file: {filename}")
                # newline='' hands line endings to the csv module untouched,
                # as its documentation requires; otherwise newlines embedded
                # in quoted fields would be rewritten by the text layer.
                # The wrapper is closed explicitly instead of being left to
                # the garbage collector.
                with io.TextIOWrapper(raw, encoding=encoding, newline='') as text:
                    reader = csv.reader(text, delimiter=' ', skipinitialspace=True)
                    rows = [row for row in reader if row]  # Skip empty rows

            # All values stay strings; pandas only lays out the rows/columns.
            df = pd.DataFrame(rows)
            print(f"File read into DataFrame. Shape: {df.shape}")

            # Use the filename (without folder or extension) as the key.
            key = os.path.splitext(os.path.basename(filename))[0]
            data_dict[key] = df
            print(f"Added to data_dict with key: {key}")
    print(f"Finished loading data from: {zip_path}. Total datasets loaded: {len(data_dict)}")
    return data_dict

# Zip archives to ingest, processed in the order listed
zip_files = [
    'C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip',
    'C:/dhs data/ZM_2018_DHS_09242024_89_219681.zip',
]
print(f"Zip files to process: {zip_files}")

# Merge the datasets from every archive into a single mapping; a dataset name
# that appears again in a later archive replaces the earlier entry.
all_data = {}
for zip_file in zip_files:
    print(f"Processing zip file: {zip_file}")
    for dataset_name, df in load_data_from_zip(zip_file).items():
        all_data[dataset_name] = df

print(f"Total datasets loaded across all zip files: {len(all_data)}")

# Print a short report for each dataset: shape, head, dtypes, describe()
for dataset_name in all_data:
    df = all_data[dataset_name]
    print(f"\nExploring dataset: {dataset_name}")
    print(f"Shape: {df.shape}")

    print("\nFirst few rows:")
    print(df.head())

    print("\nColumn data types:")
    print(df.dtypes)

    print("\nSummary statistics:")
    print(df.describe())

    print("-" * 50)

print("Script execution completed")

Imports completed
Zip files to process: ['C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip', 'C:/dhs data/ZM_2018_DHS_09242024_89_219681.zip']
Processing zip file: C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip
Starting to load data from: C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip
Zip file opened: C:/dhs data/ZM_2005_HIVSPA_09242024_814_219681.zip
Examining file: ZMMS5AFLSR/ZMSR5A.DOC
Examining file: ZMTB5AFLSR/ZMSR5A.DOC
Examining file: ZMTB5AFLSR/ZMTB5AFLSR.SAS
Examining file: ZMTB5AFLSR/ZMSR5A.XLS
Examining file: ZMMS5AFLSR/
Examining file: ZMTB5AFLSR/ZMSR5A.MAP
Examining file: ZMTB5AFLSR/
Examining file: ZMMS5AFLSR/ZMMS5AFLSR.SAS
Examining file: ZMMS5AFLSR/ZMMS5AFLSR.DAT
Found .dat file: ZMMS5AFLSR/ZMMS5AFLSR.DAT
Reading file: ZMMS5AFLSR/ZMMS5AFLSR.DAT
File read into DataFrame. Shape: (430, 61)
Added to data_dict with key: ZMMS5AFLSR
Examining file: ZMMS5AFLSR/ZMSR5A.FRQ
Examining file: ZMTB5AFLSR/ZMSR5A.FRQ
Examining file: ZMTB5AFLSR/ZMTB5AFLSR.DAT
Found .dat file: Z