In [1]:
import pandas as pd
import geopandas as gpd
import csv
import os
from datetime import datetime, timedelta
import glob
from string import Template
import requests
from urllib.request import urlopen
from io import StringIO

airnow_station_url="https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/today/Monitoring_Site_Locations_V2.dat"

base_url='http://jtimmer.digitalspacemail17.net/data/'
current_file='current.CSV'
pattern_file='yesterday_$filedate.CSV'

# yesterday_20241002.CSV

s3_bucket=os.getenv('PUBLIC_BUCKET', 'resilient-public')
s3_data_path='sd_apcd_air/source'
s3_output_path='sd_apcd_air/output'

so2_parameter = '28 SO2 Tr PPB'
h2s_parameter = '07 H2S PPB'

outputs=[
    {'parameter':"01 OZONE PPM", 'name':"01 OZONE PPM", 'file':"o2"},
    {'parameter':"28 SO2 Tr PPB", 'name':"S02 PPB", 'file':"s02"},
    {'parameter':"07 H2S PPB", 'name':"H2S PPM", 'file':"h2s2"},
    {'parameter':"11 PM2.5 �g/M3", 'name':"PM2.5 microg/M3", 'file':"pm25"},
    {'parameter':"PM10 STD", 'name':"PM10 STD", 'file':"pm10"},
    
]

In [2]:

def process_csv_files(file_paths):
    transformed_data = []

    with requests.Session() as s:
        for file_path in file_paths:
        # Read the date from the third row        
         
            response=s.get(file_path)
            data=response.text
            lines= data.splitlines()
            
            date_str = lines[2] # third line 0 base
            if ','  in date_str:
                date_str  = date_str.strip().split('),')[1]  # Get date from third row
                # Parse the date
            date = datetime.strptime(date_str.strip(), '%m/%d/%Y')
    
            hours_header = lines[3]  # First row with parameter names
            parameter_header=lines[4]  # Skip second row parmeters
            #next(csv_reader)  # Skip third row (date)
            #parameter_header = next(csv_reader)  # Fourth row with hour headers
            
            # Find the index where hour columns start
            hour_start_index = hours_header.index('0')
            #parameter_index = parameter_header.index('Parameter')
            #site_index = parameter_header.index('SiteName')
            parameter_index = 0
            site_index = 1
            # Process each row
            for row in lines[5:]:
                row = row.strip().split(',')
                if not row or row[0] == 'Parameter':  # Skip empty rows or new parameter headers
                    continue
                
                site_name = row[site_index]
                
                # Find the corresponding parameter
                # parameter = None
                # for i in range(len(hours_header)):
                #     if hours_header[i] and row[i]:
                #         parameter = hours_header[i]
                #         break
                if row[parameter_index] and len(row[parameter_index])>0 :
                    parameter = row[parameter_index]
                    
                if not parameter:
                    continue
                
                # Process each hour's result
                for hour in range(24):
                    result = row[hour_start_index + hour].strip()
                    if result and len(result)>0:
                        value=result
                        try:
                            qualifier=''
                            if '<=' in value:
                                value = value.replace('<=', '')
                                qualifier="<="
                            if '<' in value:
                                value = value.replace('<', '')
                                qualifier="<"
                            if '>' in value:
                                value = value.replace('>', '')
                                qualifier=">"
                            date_time = date + timedelta(hours=hour)
                            if len(value)>0:
                                value = float(value)
                                
                                transformed_data.append({
                                    'Parameter': parameter,
                                    'Site Name': site_name,
                                    'Date with time': date_time.isoformat(), #('%Y-%m-%d %H:%M'),
                                    'Result': value,
                                    'Qualifier': qualifier,
                                    'Original Value':result
                                })
                            else:
                                transformed_data.append({
                                    'Parameter': parameter,
                                    'Site Name': site_name,
                                    'Date with time': date_time.isoformat(), #('%Y-%m-%d %H:%M'),
                                    'Result': None,
                                    'Qualifier': qualifier,
                                    'Original Value':result
                                })
                        except ValueError:
                            print (f' "{result}" is not a float')
                            transformed_data.append({
                                    'Parameter': parameter,
                                    'Site Name': site_name,
                                    'Date with time': date_time.isoformat(), #('%Y-%m-%d %H:%M'),
                                    'Result': '',
                                    'Qualifier': '',
                                    'Original Value':result
                                })

    # Create DataFrame from transformed data
    output_df = pd.DataFrame(transformed_data)
    return output_df



In [3]:
def files_last_30(base_url='http://jtimmer.digitalspacemail17.net/data/', filepattern='yesterday_$filedate.CSV'):
    today = datetime.now()
    dates = [(today - timedelta(days=i)).strftime('%Y%m%d') for i in range(30)]
    template_string = Template(filepattern)
    filenames = [template_string.safe_substitute(filedate=s) for s in dates]
    filenames.append(current_file)
    urls = [f'{base_url}{f}' for f in filenames]
    return urls


In [4]:
def h2s_guidance(result):
    levels=[{ 'min':0, 'max':5, 'level':"green"},
            { 'min':5, 'max':30, 'level':"yellow"},
            { 'min':30, 'max':27000, 'level':"orange"},
            { 'min':27000, 'max': None, 'level':"purple"}]
    if pd.isna(result) or result == '':
        return 'white'
    else:
        result = float(result)
    for level in levels:
        if level['max'] is None:
            if result > level['min']:
                return level['level']
        elif result > level['min'] and result < level['max']:
            return level['level']

In [5]:
# List of file paths
#file_paths = ['../../../../data/apcd_sd/yesterday_20241203.csv', '../../../../data/apcd_sd/current.csv']

#file_paths = ['../../../../data/apcd_sd/current.csv']


#file_paths = glob.glob('../../../../data/apcd_sd/'+'/yesterday_*.csv')
file_paths = files_last_30()
print (file_paths)
# Process the files
output_df = process_csv_files(file_paths)
output_df['Icon']='circle'


# Save the output to a CSV file
output_df.to_csv('../../../../data/apcd_sd/out/apcd_output__all.csv', index=False)

so2 = output_df[output_df['Parameter'] == so2_parameter]
so2.to_csv('../../../../data/apcd_sd/out/apcd_output__so2_v1.csv', index=False)

h2s = output_df[output_df['Parameter'] == h2s_parameter]
h2s['level']= h2s['Result'].apply(lambda r: h2s_guidance (r))
h2s.to_csv('../../../../data/apcd_sd/out/apcd_output__h2s_v1.csv', index=False)

so2 = output_df[output_df['Parameter'] == so2_parameter]
so2.to_csv('../../../../data/apcd_sd/out/apcd_output__so2_v1.csv', index=False)

# Display the first few rows of the output
output_df.head(100)

['http://jtimmer.digitalspacemail17.net/data/yesterday_20250403.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250402.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250401.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250331.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250330.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250329.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250328.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250327.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250326.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250325.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250324.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250323.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250322.CSV', 'http://jtimmer.digitalspacemail17.net/data/yesterday_20250321.CSV', 'http://jtimmer.digitalspacemail1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  h2s['level']= h2s['Result'].apply(lambda r: h2s_guidance (r))


Unnamed: 0,Parameter,Site Name,Date with time,Result,Qualifier,Original Value,Icon
0,01 OZONE PPM,ALPINE,2025-04-02T00:00:00,0.045,,.045,circle
1,01 OZONE PPM,ALPINE,2025-04-02T01:00:00,0.043,,.043,circle
2,01 OZONE PPM,ALPINE,2025-04-02T02:00:00,,,C,circle
3,01 OZONE PPM,ALPINE,2025-04-02T03:00:00,,,C,circle
4,01 OZONE PPM,ALPINE,2025-04-02T04:00:00,0.04,,.040,circle
...,...,...,...,...,...,...,...
95,01 OZONE PPM,EL CAJON LES,2025-04-02T23:00:00,0.037,,.037,circle
96,01 OZONE PPM,KEARNY MESA,2025-04-02T00:00:00,0.045,,.045,circle
97,01 OZONE PPM,KEARNY MESA,2025-04-02T01:00:00,0.043,,.043,circle
98,01 OZONE PPM,KEARNY MESA,2025-04-02T02:00:00,,,C,circle


In [6]:
locations_df = pd.read_csv(airnow_station_url, sep='|', on_bad_lines='warn')
locations_df = locations_df[['SiteName', 'Latitude', 'Longitude','AgencyName' ]].drop_duplicates([ 'SiteName' ])
gs=gpd.GeoSeries.from_xy(locations_df['Longitude'],locations_df['Latitude'])
locations_gdf= gpd.GeoDataFrame(locations_df, 
    geometry=gs ,
    crs = 'EPSG:4326')
locations_gdf['SiteName'] = locations_gdf['SiteName'].str.upper()




In [7]:
sites_csv= """LongName,SiteName,Latitude,Longitude
San Ysidro Fire Station #29,SAN YSIDRO,  32.552825, -117.047369
Berry Elementary School (BES),NESTOR - BES, 32.567097, -117.090656
Imperial Beach Civic Center (ICF),IB CIVIC CTR, 32.576139,  -117.115361"""
# remove San Ysidro. Already have a row
sites_csv= """LongName,SiteName,Latitude,Longitude
Berry Elementary School (BES),NESTOR - BES, 32.567097, -117.090656
Imperial Beach Civic Center (ICF),IB CIVIC CTR, 32.576139,  -117.115361
El Cajon - Lexington Elementary School,EL CAJON LES, 32.789561,  -116.944222
"""
sites_df = pd.read_csv(StringIO(sites_csv), sep=',', on_bad_lines='warn')
geom = gpd.points_from_xy(sites_df.Longitude, sites_df.Latitude,)
sites_gdf = gpd.GeoDataFrame(sites_df, geometry=geom,crs = 'EPSG:4326')

locations2_gdf = pd.concat([locations_gdf, sites_gdf])
#locations2_gdf = locations_gdf.merge(sites_gdf,suffixes=('', '_y'), how='outer')
locations2_gdf

Unnamed: 0,SiteName,Latitude,Longitude,AgencyName,geometry,LongName
0,ISLAMABAD-OLD,33.723500,73.118220,U.S. Department of State Pakistan - Islamabad,POINT (73.11822 33.7235),
1,,0.000000,0.000000,U.S. Department of State Bosnia Herzegovina - ...,POINT (0 0),
2,ST. JOHN'S,47.560380,-52.711500,Newfoundland & Labrador DEC,POINT (-52.7115 47.56038),
8,CORNERBROOK,48.949400,-58.055600,Newfoundland & Labrador DEC,POINT (-58.0556 48.9494),
11,MOUNT PEARL,47.505130,-52.794800,Newfoundland & Labrador DEC,POINT (-52.7948 47.50513),
...,...,...,...,...,...,...
24272,DURHAM,35.902370,-78.888200,Village Green,POINT (-78.8882 35.90237),
24273,YEREVAN,40.165150,44.481710,U.S. Department of State Armenia - Yerevan,POINT (44.48171 40.16515),
0,NESTOR - BES,32.567097,-117.090656,,POINT (-117.09066 32.5671),Berry Elementary School (BES)
1,IB CIVIC CTR,32.576139,-117.115361,,POINT (-117.11536 32.57614),Imperial Beach Civic Center (ICF)


In [8]:
output_gdf = locations2_gdf.merge(output_df, how='inner',  left_on='SiteName', right_on='Site Name', suffixes=('', '_y'))
output_gdf.head(100)



Unnamed: 0,SiteName,Latitude,Longitude,AgencyName,geometry,LongName,Parameter,Site Name,Date with time,Result,Qualifier,Original Value,Icon
0,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,01 OZONE PPM,CHULA VISTA,2025-04-02T00:00:00,0.047,,.047,circle
1,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,01 OZONE PPM,CHULA VISTA,2025-04-02T01:00:00,0.045,,.045,circle
2,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,01 OZONE PPM,CHULA VISTA,2025-04-02T02:00:00,,,C,circle
3,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,01 OZONE PPM,CHULA VISTA,2025-04-02T03:00:00,,,C,circle
4,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,01 OZONE PPM,CHULA VISTA,2025-04-02T04:00:00,0.044,,.044,circle
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,11 PM2.5 µg/M3,CHULA VISTA,2025-04-02T23:00:00,8.3,,8.3,circle
96,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,12 VWDR °,CHULA VISTA,2025-04-02T00:00:00,286.0,,286,circle
97,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,12 VWDR °,CHULA VISTA,2025-04-02T01:00:00,285.0,,285,circle
98,CHULA VISTA,32.631229,-117.059074,San Diego APCD,POINT (-117.05907 32.63123),,12 VWDR °,CHULA VISTA,2025-04-02T02:00:00,293.0,,293,circle


In [9]:
output_gdf.to_csv('../../../../data/apcd_sd/out/apcd_output__all.csv', index=False)

so2 = output_gdf[output_gdf['Parameter'] == so2_parameter]
so2.to_csv('../../../../data/apcd_sd/out/apcd_output__so2.csv', index=False)

h2s = output_gdf[output_gdf['Parameter'] == h2s_parameter]
h2s.to_csv('../../../../data/apcd_sd/out/apcd_output__h2s.csv', index=False)


In [10]:
from foursquare.data_sdk import DataSDK, MediaType
import os
refresh_token=os.getenv("RC_FSQ_REFRESH_TOKEN")
client = DataSDK()

In [11]:
client.upload_dataframe(
            h2s ,
            dataset="829a3d09-626a-47c4-b028-49eb4b1be507",
            name='H2S',
            description="APCD H2S from  dataframe")

client.upload_dataframe(
            so2,
          
    dataset="c8594fe9-82bb-45cb-ab1b-05d8401874d4",
            name='S02',
            description="APCD S02 from  dataframe")

client.upload_dataframe(
            output_gdf,
           dataset="0a3cdf55-bd41-4b10-a04b-cc69e9e78ce9",
            name='APCD Air Quality',
            description="APCD Air Quality from  dataframe")

Compressing file
Uploading: 100%|██████████| 12.1k/12.1k [00:00<00:00, 14.6kB/s]
Compressing file
Uploading: 100%|██████████| 4.67k/4.67k [00:00<00:00, 9.88kB/s]
Compressing file
Uploading: 100%|██████████| 359k/359k [00:00<00:00, 400kB/s] 


Dataset(id=UUID('0a3cdf55-bd41-4b10-a04b-cc69e9e78ce9'), name='APCD Air Quality', type=<DatasetType.MANAGED: 'managed'>, created_at=datetime.datetime(2025, 2, 13, 17, 35, 55, tzinfo=TzInfo(UTC)), updated_at=datetime.datetime(2025, 4, 3, 22, 22, 8, tzinfo=TzInfo(UTC)), description='APCD Air Quality from  dataframe', is_valid=True, data_connector=None, metadata=DatasetMetadata(media_type='text/csv', size=5798501, source=None, tileset_data_url=None, tileset_metadata_url=None, image_url=None, metadata_url=None, data_status=None))

Lookin at altest

In [5]:
lastest_base_df = process_csv_files([f'{base_url}{current_file}'])

 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "M" is not a float
 "M" is not a float
 "M" is not a float
 "M" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "M" is not a float
 "M" is not a float
 "M" is not a float
 "M" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float
 "C" is not a float


In [11]:
latest_h2s_df = lastest_base_df[lastest_base_df['Parameter'] == h2s_parameter]
latest_h2s_df['level'] = latest_h2s_df['Result'].apply(lambda r: h2s_guidance(r))
latest_h2s_df.groupby(['Parameter', 'Site Name', ], as_index=False).tail(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_h2s_df['level'] = latest_h2s_df['Result'].apply(lambda r: h2s_guidance(r))


Unnamed: 0,Parameter,Site Name,Date with time,Result,Qualifier,Original Value,level
246,07 H2S PPB,IB CIVIC CTR,2025-03-25T12:00:00,0.9,,0.9,green
259,07 H2S PPB,NESTOR - BES,2025-03-25T12:00:00,1.0,,1.0,green
