In [None]:
# Data import cell (written with help from Google Gemini)
# Decompresses the downloaded PM2.5 daily-average archive into a plain-text file.
import gzip
import os
import shutil

# Source archive, downloaded from https://www.epa.gov/hesc/rsig-related-downloadable-data-files
gzipped_file_path = '/content/2020_pm25_daily_average.txt.gz'
# Write the decompressed file next to the archive (same name without the ".gz" suffix).
# A bare relative filename would land in whatever the kernel's working directory is,
# which only matches the absolute /content/... path read by the load cell when the
# working directory happens to be /content.
uncompressed_file_path = os.path.splitext(gzipped_file_path)[0]

try:
    # Stream the decompressed bytes straight to disk; both files are opened in
    # binary mode so nothing is decoded or held fully in memory.
    with gzip.open(gzipped_file_path, 'rb') as f_in:
        with open(uncompressed_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Successfully unzipped '{gzipped_file_path}' to '{uncompressed_file_path}'")
except FileNotFoundError:
    # Failures are reported rather than raised, so check this cell's output
    # before running the cells that depend on the decompressed file.
    print(f"Error: The file '{gzipped_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred during decompression: {e}")

Successfully unzipped '/content/2020_pm25_daily_average.txt.gz' to '2020_pm25_daily_average.txt'


In [None]:
# Read the decompressed text file into a DataFrame.
# FIPS is read as a string so the codes are kept as text rather than parsed as numbers.
import pandas
data = pandas.read_csv(
    "/content/2020_pm25_daily_average.txt",
    dtype={"FIPS": str},
)
data

Unnamed: 0,Date,FIPS,Longitude,Latitude,pm25_daily_average(ug/m3),pm25_daily_average_stderr(ug/m3)
0,2020/01/01,01001020100,-86.49007,32.47718,9.473,5.5130
1,2020/01/01,01001020200,-86.47337,32.47434,9.971,5.6818
2,2020/01/01,01001020300,-86.46019,32.47543,9.735,5.5539
3,2020/01/01,01001020400,-86.44363,32.47200,9.869,5.5489
4,2020/01/01,01001020501,-86.42256,32.44786,10.053,5.3036
...,...,...,...,...,...,...
30662011,2020/12/31,56043000200,-107.68105,43.90472,1.807,1.1694
30662012,2020/12/31,56043000301,-107.95638,44.01437,2.355,1.5672
30662013,2020/12/31,56043000302,-107.94963,44.02720,2.326,1.4547
30662014,2020/12/31,56045951100,-104.57354,43.83987,2.058,1.4051


In [None]:
# Sanity check that the full dataset was loaded: count the distinct dates
# (2020 is a leap year, so 366 are expected).
# dropna=False keeps the count equal to the length of .unique(), which would include a missing value.
data['Date'].nunique(dropna=False)

366

In [None]:
# Annual average concentration per FIPS code: the mean of the daily values over every date in the file.
# NOTE(review): the FIPS codes displayed are 11 characters long, which looks like census-tract
# rather than block-group identifiers — confirm the geography level.
grouped = data.groupby("FIPS").agg({"pm25_daily_average(ug/m3)": "mean"})
grouped

Unnamed: 0_level_0,pm25_daily_average(ug/m3)
FIPS,Unnamed: 1_level_1
01001020100,8.969057
01001020200,8.998268
01001020300,9.035760
01001020400,9.065383
01001020501,9.141880
...,...
56043000200,4.383918
56043000301,4.677708
56043000302,4.696462
56045951100,4.743926


In [None]:
# Summary statistics of the annual averages, for comparison with the EJScreen technical document.
# From tech doc: "In EJScreen, the raw values for the PM 2.5 indicator range from 3.28858 to 25.6121.
# The percentiles for the PM 2.5 indicator range from 0 to 100% with a median at the 50th percentile,
# which corresponds to the raw value of 8.209934"
# NOTE(review): in the output recorded with this notebook, min and max agree with that range,
# while the 50% value (8.159527) differs from the quoted median — cause not determined here.
grouped.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,pm25_daily_average(ug/m3)
count,83776.0
mean,8.44832
std,2.054838
min,3.288574
25%,7.298417
50%,8.159527
75%,9.017068
max,25.612107
