In [4]:
import pandas as pd
import os

# Folders whose CSV files are filtered and overwritten in place.
folder_paths = ["DP03"]

# Percent ("PE") columns whose estimate ("E") counterparts must NOT be kept,
# listed per folder.
exclude_columns_dp02 = ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE']
exclude_columns_dp03 = ['DP03_0025PE', 'DP03_0062PE', 'DP03_0063PE', 'DP03_0065PE', 'DP03_0067PE', 'DP03_0069PE', 'DP03_0071PE', 'DP03_0073PE', 'DP03_0086PE', 'DP03_0087PE', 'DP03_0088PE', 'DP03_0090PE', 'DP03_0091PE', 'DP03_0092PE', 'DP03_0093PE', 'DP03_0094PE']

# Folder name -> exclusion list. A lookup with an empty default replaces the
# if/elif chain, which left `exclude_columns` unbound (or stale from the
# previous folder) for any folder other than DP02/DP03.
exclude_columns_by_folder = {
    "DP02": exclude_columns_dp02,
    "DP03": exclude_columns_dp03,
}

# Five-character codes 00600-00999 (the rows this cell is meant to remove;
# the original comment describes them as Puerto Rico ZCTAs without data).
# The pattern is written against the shortened GEO_ID: the previous pattern
# expected the "8600000US" prefix, which has already been cut off by the time
# the filter runs, so it could never match and no row was ever removed.
excluded_zcta_pattern = r"00[6-9][0-9][0-9]$"

for folder_path in folder_paths:
    exclude_columns = exclude_columns_by_folder.get(folder_path, [])

    for file in os.listdir(folder_path):
        if not file.endswith(".csv"):  # Only CSV files are processed
            continue
        file_path = os.path.join(folder_path, file)

        # GEO_ID is read as text so an identifier is never parsed as a number.
        df = pd.read_csv(file_path, dtype={"GEO_ID": str})

        # The file is overwritten below with only "E" columns. Running this
        # cell a second time would therefore find no "PE" columns and reduce
        # the file to its first two columns, so such files are left untouched.
        percent_columns = [col for col in df.columns if col.endswith("PE")]
        if not percent_columns:
            print(f"Skipped (no 'PE' columns found, file looks already processed): {file}")
            continue

        # Keep only the last five characters of GEO_ID (the ZCTA code).
        df['GEO_ID'] = df['GEO_ID'].astype(str).str[-5:]

        # Keep the first two columns plus, for every "PE" column that is not
        # excluded, its "E" counterpart. Only the trailing "PE" is swapped for
        # "E", so a "PE" elsewhere in a column name is left alone.
        selected_columns = df.columns[:2].tolist() + [
            col[:-2] + "E" for col in percent_columns if col not in exclude_columns
        ]
        df_filtered = df[selected_columns]

        # Remove the rows whose shortened GEO_ID falls in 00600-00999.
        in_excluded_range = df_filtered["GEO_ID"].astype(str).str.match(excluded_zcta_pattern, na=False)
        df_filtered = df_filtered[~in_excluded_range]

        # Save the filtered data back to the same CSV
        df_filtered.to_csv(file_path, index=False)
        print(f"Processed: {file}")


Processed: ACSDP5Y2011.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed: ACSDP5Y2012.DP03-Data.csv
Processed: ACSDP5Y2013.DP03-Data.csv
Processed: ACSDP5Y2014.DP03-Data.csv
Processed: ACSDP5Y2015.DP03-Data.csv
Processed: ACSDP5Y2016.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed: ACSDP5Y2017.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed: ACSDP5Y2018.DP03-Data.csv
Processed: ACSDP5Y2019.DP03-Data.csv
Processed: ACSDP5Y2020.DP03-Data.csv
Processed: ACSDP5Y2021.DP03-Data.csv
Processed: ACSDP5Y2022.DP03-Data.csv
Processed: ACSDP5Y2023.DP03-Data.csv


In [None]:
# For every folder in `folder_paths`, list per survey year the columns that
# were not collected in that year, so the missing data can be analysed.
# (Author's note: the missing columns in DP03 relate to benefits.)
# NOTE(review): the recorded output lists "PE" columns, so this presumably has
# to run on files that still contain them - confirm the intended cell order.
for folder_path in folder_paths:
    print(folder_path)
    missing_columns_by_year = {}

    for file in os.listdir(folder_path):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(folder_path, file)

        # Only the second row is inspected, so just two rows are parsed
        # instead of the whole file. Reading everything as text keeps "(X)"
        # comparable as a string and avoids mixed-dtype inference.
        df = pd.read_csv(file_path, nrows=2, dtype=str)
        if len(df) < 2:
            # Previously df.iloc[1] raised IndexError on such a file.
            print(f"Skipped (fewer than two rows): {file}")
            continue

        # The year is the four characters after the first "Y" in the file
        # name, e.g. "ACSDP5Y2011.DP03-Data.csv" -> "2011".
        year = file.split('Y')[1][:4]

        second_row = df.iloc[1]

        # Columns holding "(X)" in the second row were not collected that year
        missing_columns = second_row[second_row == "(X)"].index.tolist()

        missing_columns_by_year[year] = missing_columns

    for year, columns in sorted(missing_columns_by_year.items()):
        print(f"Year {year}: Missing columns -> {columns}")

DP02


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Year 2011: Missing columns -> ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE', 'DP02_0070PE', 'DP02_0071PE', 'DP02_0072PE', 'DP02_0073PE', 'DP02_0074PE', 'DP02_0075PE', 'DP02_0076PE', 'DP02_0077PE']
Year 2012: Missing columns -> ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE']
Year 2013: Missing columns -> ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE', 'DP02_0150PE', 'DP02_0151PE', 'DP02_0152PE']
Year 2014: Missing columns -> ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE', 'DP02_0150PE', 'DP02_0151PE', 'DP02_0152PE']
Year 2015: Missing columns -> ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE', 'DP02_0150PE', 'DP02_0151PE', 'DP02_0152PE']
Year 2016: Missing columns -> ['DP02_0015PE', 'DP02_0016PE', 'DP0

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Year 2011: Missing columns -> ['DP03_0025PE', 'DP03_0062PE', 'DP03_0063PE', 'DP03_0065PE', 'DP03_0067PE', 'DP03_0069PE', 'DP03_0071PE', 'DP03_0073PE', 'DP03_0086PE', 'DP03_0087PE', 'DP03_0088PE', 'DP03_0090PE', 'DP03_0091PE', 'DP03_0092PE', 'DP03_0093PE', 'DP03_0094PE', 'DP03_0095PE', 'DP03_0096PE', 'DP03_0097PE', 'DP03_0098PE', 'DP03_0099PE', 'DP03_0100PE', 'DP03_0101PE', 'DP03_0102PE', 'DP03_0103PE', 'DP03_0104PE', 'DP03_0105PE', 'DP03_0106PE', 'DP03_0107PE', 'DP03_0108PE', 'DP03_0109PE', 'DP03_0110PE', 'DP03_0111PE', 'DP03_0112PE', 'DP03_0113PE', 'DP03_0114PE', 'DP03_0115PE', 'DP03_0116PE', 'DP03_0117PE', 'DP03_0118PE']
Year 2012: Missing columns -> ['DP03_0025PE', 'DP03_0062PE', 'DP03_0063PE', 'DP03_0065PE', 'DP03_0067PE', 'DP03_0069PE', 'DP03_0071PE', 'DP03_0073PE', 'DP03_0086PE', 'DP03_0087PE', 'DP03_0088PE', 'DP03_0090PE', 'DP03_0091PE', 'DP03_0092PE', 'DP03_0093PE', 'DP03_0094PE']
Year 2013: Missing columns -> ['DP03_0025PE', 'DP03_0062PE', 'DP03_0063PE', 'DP03_0065PE', 'DP03_0

In [9]:
import os
import pandas as pd

# Folder whose CSV files are rescaled and overwritten in place
folder_path = 'D'

# NOTE(review): this list is not referenced anywhere in this cell. It is kept
# unchanged only because other cells may read the global - confirm and remove.
columns_to_drop = ['DP02_0015PE', 'DP02_0016PE', 'DP02_0038PE', 'DP02_0039PE', 'DP02_0040PE', 'DP02_0041PE', 'DP02_0042PE', 'DP02_0043PE', 'DP02_0070PE', 'DP02_0071PE', 'DP02_0072PE', 'DP02_0073PE', 'DP02_0074PE', 'DP02_0075PE', 'DP02_0076PE', 'DP02_0077PE', 'DP02_0150PE', 'DP02_0151PE', 'DP02_0152PE', 'DP02_0003PE', 'DP02_0005PE', 'DP02_0007PE', 'DP02_0011PE', 'DP02_0017PE']

# Columns rescaled to the range 1..100, independently within each file
df_columns_to_normalize = ['DP03_0062E','DP03_0063E']


def rescale_min_max(values, lower=1, upper=100):
    """Linearly rescale a numeric Series so its minimum maps to ``lower`` and its maximum to ``upper``.

    Parameters
    ----------
    values : pandas.Series
        Numeric data. Missing values are ignored when the extremes are
        determined and remain missing in the result.
    lower, upper : number
        Bounds of the target range (defaults 1 and 100).

    Returns
    -------
    pandas.Series
        The rescaled values. If all non-missing values are identical there is
        no spread to scale, so they are set to ``lower`` instead of becoming
        NaN through a 0/0 division. A Series without any non-missing value is
        returned unchanged.
    """
    min_val = values.min()
    max_val = values.max()
    span = max_val - min_val
    if pd.isna(span):
        return values
    if span == 0:
        return values.where(values.isna(), float(lower))
    return lower + ((values - min_val) * (upper - lower)) / span


# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)

    df = pd.read_csv(file_path)

    for column in df_columns_to_normalize:
        # Only the columns being rescaled are converted to numbers (entries
        # that cannot be parsed become NaN). Converting the whole frame, as
        # before, turned every textual column into NaN before the file was
        # overwritten.
        numeric_values = pd.to_numeric(df[column], errors='coerce')
        df[column] = rescale_min_max(numeric_values)

    # Save the modified DataFrame back to the same file
    df.to_csv(file_path, index=False)

    print(f"Processed {filename}")

print("All files processed.")


Processed ACSDP5Y2011.DP03-Data.csv
Processed ACSDP5Y2012.DP03-Data.csv
Processed ACSDP5Y2013.DP03-Data.csv
Processed ACSDP5Y2014.DP03-Data.csv
Processed ACSDP5Y2015.DP03-Data.csv
Processed ACSDP5Y2016.DP03-Data.csv
Processed ACSDP5Y2017.DP03-Data.csv
Processed ACSDP5Y2018.DP03-Data.csv
Processed ACSDP5Y2019.DP03-Data.csv
Processed ACSDP5Y2020.DP03-Data.csv
Processed ACSDP5Y2021.DP03-Data.csv
Processed ACSDP5Y2022.DP03-Data.csv
Processed ACSDP5Y2023.DP03-Data.csv
All files processed.


In [10]:
import pandas as pd
import os

# Source folder providing the columns, and target folder whose files receive them
dp03_folder = "D"
data_dp03_folder = "data/DP03"

# Ensure the target folder exists
os.makedirs(data_dp03_folder, exist_ok=True)

# For every CSV in the source folder, copy its columns ending in 'E' into the
# file of the same name in the target folder and save that file in place.
for file in os.listdir(dp03_folder):
    if not file.endswith(".csv"):
        continue

    dp03_file_path = os.path.join(dp03_folder, file)
    data_file_path = os.path.join(data_dp03_folder, file)

    # Nothing to attach to when the target folder has no file of that name
    if not os.path.exists(data_file_path):
        print(f"File {file} does not exist in data/dp03")
        continue

    df_dp03 = pd.read_csv(dp03_file_path)
    df_data = pd.read_csv(data_file_path)

    # Every column whose name ends in 'E' is copied.
    # NOTE(review): this also matches names such as 'NAME' or '...PE' if the
    # source file contains them - confirm that is intended.
    columns_to_attach = [col for col in df_dp03.columns if col.endswith('E')]
    if not columns_to_attach:
        print(f"Nothing to attach, no column ending in 'E' in: {file}")
        continue

    # The assignment below pairs rows purely by position (both frames carry
    # the default row index from read_csv). With differing row counts the
    # values would be silently padded with NaN or cut off, and rows would most
    # likely be shifted against each other, so such a file is not modified.
    # Equal row counts are necessary but do not prove identical row order.
    if len(df_dp03) != len(df_data):
        print(
            f"Row count mismatch for {file}: "
            f"{len(df_dp03)} rows in {dp03_folder} vs {len(df_data)} rows in {data_dp03_folder}; file left unchanged"
        )
        continue

    # Attach the columns to the target frame
    df_data[columns_to_attach] = df_dp03[columns_to_attach]

    # Save the updated file back to the target folder
    df_data.to_csv(data_file_path, index=False)
    print(f"Processed and saved: {file}")


  interactivity=interactivity, compiler=compiler, result=result)


Processed and saved: ACSDP5Y2011.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed and saved: ACSDP5Y2012.DP03-Data.csv
Processed and saved: ACSDP5Y2013.DP03-Data.csv
Processed and saved: ACSDP5Y2014.DP03-Data.csv
Processed and saved: ACSDP5Y2015.DP03-Data.csv
Processed and saved: ACSDP5Y2016.DP03-Data.csv
Processed and saved: ACSDP5Y2017.DP03-Data.csv
Processed and saved: ACSDP5Y2018.DP03-Data.csv
Processed and saved: ACSDP5Y2019.DP03-Data.csv
Processed and saved: ACSDP5Y2020.DP03-Data.csv
Processed and saved: ACSDP5Y2021.DP03-Data.csv
Processed and saved: ACSDP5Y2022.DP03-Data.csv
Processed and saved: ACSDP5Y2023.DP03-Data.csv


In [13]:
import pandas as pd
import os

# Folder whose CSV files get their GEO_ID shortened in place.
# NOTE: despite the variable name, this points at the DP02 data folder.
data_dp03_folder = "data/dp02"

# NOTE(review): list of DP03 percent columns that is not referenced anywhere
# in this cell - no column is dropped here. It is kept unchanged only because
# other cells may read the global; confirm and remove.
columns_to_drop = [
    'DP03_0025PE', 'DP03_0062PE', 'DP03_0063PE', 'DP03_0065PE', 'DP03_0067PE', 'DP03_0069PE',
    'DP03_0071PE', 'DP03_0073PE', 'DP03_0086PE', 'DP03_0087PE', 'DP03_0088PE', 'DP03_0090PE',
    'DP03_0091PE', 'DP03_0092PE', 'DP03_0093PE', 'DP03_0094PE', 'DP03_0095PE', 'DP03_0096PE',
    'DP03_0097PE', 'DP03_0098PE', 'DP03_0099PE', 'DP03_0100PE', 'DP03_0101PE', 'DP03_0102PE',
    'DP03_0103PE', 'DP03_0104PE', 'DP03_0105PE', 'DP03_0106PE', 'DP03_0107PE', 'DP03_0108PE',
    'DP03_0109PE', 'DP03_0110PE', 'DP03_0111PE', 'DP03_0112PE', 'DP03_0113PE', 'DP03_0114PE',
    'DP03_0115PE', 'DP03_0116PE', 'DP03_0117PE', 'DP03_0118PE'
]

# Ensure the folder exists
os.makedirs(data_dp03_folder, exist_ok=True)

# Shorten GEO_ID to its last five characters in every CSV of the folder
for file in os.listdir(data_dp03_folder):
    if not file.endswith(".csv"):
        continue
    data_file_path = os.path.join(data_dp03_folder, file)

    # GEO_ID is read as text. With dtype inference, an ID that is already
    # shortened (e.g. 00601) can be parsed as the integer 601, and the leading
    # zeros would be lost when this cell is run again on its own output.
    df_data = pd.read_csv(data_file_path, dtype={'GEO_ID': str})

    # Keep only the last five characters of GEO_ID
    df_data['GEO_ID'] = df_data['GEO_ID'].astype(str).str[-5:]

    # Save the modified file back to the same location
    df_data.to_csv(data_file_path, index=False)
    print(f"Processed and saved: {file}")


  interactivity=interactivity, compiler=compiler, result=result)


Processed and saved: ACSDP5Y2011.DP02-Data.csv
Processed and saved: ACSDP5Y2012.DP02-Data.csv
Processed and saved: ACSDP5Y2013.DP02-Data.csv
Processed and saved: ACSDP5Y2014.DP02-Data.csv
Processed and saved: ACSDP5Y2015.DP02-Data.csv
Processed and saved: ACSDP5Y2016.DP02-Data.csv
Processed and saved: ACSDP5Y2017.DP02-Data.csv
Processed and saved: ACSDP5Y2018.DP02-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed and saved: ACSDP5Y2019.DP02-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed and saved: ACSDP5Y2020.DP02-Data.csv
Processed and saved: ACSDP5Y2021.DP02-Data.csv
Processed and saved: ACSDP5Y2022.DP02-Data.csv
Processed and saved: ACSDP5Y2023.DP02-Data.csv


In [7]:
import os
import pandas as pd

# Directory whose CSV files are reduced to two columns and overwritten in place
folder_path = 'D'

# Endings of the column names to retain; every other column is discarded
wanted_suffixes = ('62E', '63E')

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Entries that are not CSV files are left untouched
    if not filename.endswith('.csv'):
        continue

    df = pd.read_csv(file_path)

    # Select the columns whose name ends in one of the wanted suffixes
    dp03_columns = [col for col in df.columns if col.endswith(wanted_suffixes)]
    df_dp03 = df.loc[:, dp03_columns]

    # The reduced frame replaces the original file
    df_dp03.to_csv(file_path, index=False)

    print(f"Processed {filename}")

print("All files processed.")


  interactivity=interactivity, compiler=compiler, result=result)


Processed ACSDP5Y2011.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed ACSDP5Y2012.DP03-Data.csv
Processed ACSDP5Y2013.DP03-Data.csv
Processed ACSDP5Y2014.DP03-Data.csv
Processed ACSDP5Y2015.DP03-Data.csv
Processed ACSDP5Y2016.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed ACSDP5Y2017.DP03-Data.csv


  interactivity=interactivity, compiler=compiler, result=result)


Processed ACSDP5Y2018.DP03-Data.csv
Processed ACSDP5Y2019.DP03-Data.csv
Processed ACSDP5Y2020.DP03-Data.csv
Processed ACSDP5Y2021.DP03-Data.csv
Processed ACSDP5Y2022.DP03-Data.csv
Processed ACSDP5Y2023.DP03-Data.csv
All files processed.
