In [2]:
import os
import pandas as pd

# Base directory that holds one sub-folder per DHS dataset; the CSV exports
# are written directly into this directory as well.
base_dir = r"C:\dhs data\datasets"

# List of dataset folders
datasets = ["ZMAR71DT", "ZMHR71DT", "ZMIR71DT", "ZMPR71DT", "ZMTB5ADTSR"]


def load_stata_frame(dta_path):
    """Read a Stata ``.dta`` file into a DataFrame, retrying once on failure.

    The first attempt only disables categorical conversion. If that raises,
    a second attempt additionally disables date conversion.

    Args:
        dta_path: Path of the ``.dta`` file to read.

    Returns:
        The loaded DataFrame, or ``None`` when both attempts fail (each
        failure is reported on stdout rather than raised).
    """
    try:
        df = pd.read_stata(dta_path, convert_categoricals=False)
        print("Successfully loaded dataset.")
        return df
    except Exception as e:
        print(f"Error loading or processing file: {str(e)}")
        print("Attempting to load without converting dtypes...")

    try:
        # read_stata has no `convert_dtype` keyword (passing it raises
        # TypeError), so the retry switches off the conversions that
        # read_stata does expose: dates and categoricals.
        df = pd.read_stata(
            dta_path, convert_dates=False, convert_categoricals=False
        )
        print("Successfully loaded dataset without converting dtypes.")
        return df
    except Exception as e:
        print(f"Error loading file without converting dtypes: {str(e)}")
        return None


def process_dataset(base_dir, dataset):
    """Locate the ``.dta`` file of one dataset folder and export it as CSV.

    Prints the folder contents, loads the first ``.dta`` file (alphabetical
    order, extension matched case-insensitively), prints a short summary and
    writes ``<dataset>_output.csv`` into ``base_dir``.

    Args:
        base_dir: Directory containing the dataset folders; also the
            destination of the CSV export.
        dataset: Name of the dataset folder inside ``base_dir``.

    Returns:
        Path of the written CSV file, or ``None`` if the folder is missing,
        holds no ``.dta`` file, or loading/exporting failed.
    """
    dataset_dir = os.path.join(base_dir, dataset)

    print(f"\nChecking dataset: {dataset}")
    print(f"Full path: {dataset_dir}")

    # isdir (not exists) so that a regular file with the dataset's name is
    # reported here instead of crashing os.listdir below.
    if not os.path.isdir(dataset_dir):
        print(f"Error: Directory does not exist: {dataset_dir}")
        return None

    # Sorted so the listing and the "first .dta file" choice are deterministic.
    files = sorted(os.listdir(dataset_dir))
    print("Files found in directory:")
    for file in files:
        print(f"  - {file}")

    dta_files = [f for f in files if f.lower().endswith('.dta')]
    if not dta_files:
        print(f"No .dta file found in {dataset_dir}")
        return None

    dta_file = dta_files[0]  # Use the first .dta file if multiple exist
    dta_path = os.path.join(dataset_dir, dta_file)
    print(f"Found .dta file: {dta_file}")

    df = load_stata_frame(dta_path)
    if df is None:
        return None

    # Print basic information about the dataset
    print(f"Number of rows: {len(df)}")
    print(f"Number of columns: {len(df.columns)}")
    print("First few rows:")
    print(df.head())

    # Export to CSV. Kept apart from loading so that an export failure is
    # reported as such and does not trigger a reload of the .dta file.
    csv_output = os.path.join(base_dir, f"{dataset}_output.csv")
    try:
        df.to_csv(csv_output, index=False)
    except Exception as e:
        print(f"Error exporting CSV: {str(e)}")
        return None
    print(f"CSV exported to: {csv_output}")
    return csv_output


# Process each dataset; a failure in one dataset does not stop the others.
for dataset in datasets:
    process_dataset(base_dir, dataset)

print("\nAll datasets processed.")


Checking dataset: ZMAR71DT
Full path: C:\dhs data\datasets\ZMAR71DT
Files found in directory:
  - ZMAR71FL.DCT
  - ZMAR71FL.DO
  - ZMAR71FL.DTA
  - ZMAR71FL.MAP
Found .dta file: ZMAR71FL.DTA
Successfully loaded dataset.
Number of rows: 25418
Number of columns: 10
First few rows:
   hivclust  hivnumb  hivline  hiv01  hiv02          hiv03    hiv05  \
0         1        1        2  K8Z3V   9850   hiv negative  1826955   
1         1        1        1  U6C4H   9843   hiv negative  1956200   
2         1        2        2  H7D9B   9851   hiv negative  1826955   
3         1        2        1  V0Z3Y   9881   hiv negative  1956200   
4         1        3        2  R7O7U   9848  hiv  positive  1826955   

           hiv06          hiv07          hiv08  
0   hiv negative            NaN            NaN  
1   hiv negative   hiv negative            NaN  
2   hiv negative            NaN            NaN  
3   hiv negative            NaN            NaN  
4  hiv  positive  hiv  positive  hiv  positive 