In [1]:
import pandas as pd
import numpy as np

In [5]:
cities = ['New York', 'Jacksonville', 'Dallas', 'San Antonio', 'San Diego', 'Houston', 'Phoenix', 'Philadelphia', 'Los Angeles', 'Seattle']

In [2]:
AirQuality = pd.read_csv('/Users/rileychisholm/Downloads/US_AQI.csv')

In [3]:
AirQuality

Unnamed: 0.1,Unnamed: 0,CBSA Code,Date,AQI,Category,Defining Parameter,Number of Sites Reporting,city_ascii,state_id,state_name,lat,lng,population,density,timezone
0,0,10140,2022-01-01,21,Good,PM2.5,2,Aberdeen,WA,Washington,46.9757,-123.8094,16571.0,588.0,America/Los_Angeles
1,1,10140,2022-01-02,12,Good,PM2.5,2,Aberdeen,WA,Washington,46.9757,-123.8094,16571.0,588.0,America/Los_Angeles
2,2,10140,2022-01-03,18,Good,PM2.5,2,Aberdeen,WA,Washington,46.9757,-123.8094,16571.0,588.0,America/Los_Angeles
3,3,10140,2022-01-04,19,Good,PM2.5,2,Aberdeen,WA,Washington,46.9757,-123.8094,16571.0,588.0,America/Los_Angeles
4,4,10140,2022-01-05,17,Good,PM2.5,2,Aberdeen,WA,Washington,46.9757,-123.8094,16571.0,588.0,America/Los_Angeles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5617320,5718366,49740,1980-12-27,52,Moderate,CO,1,Yuma,AZ,Arizona,32.5995,-114.5491,137612.0,311.0,America/Phoenix
5617321,5718367,49740,1980-12-28,52,Moderate,CO,1,Yuma,AZ,Arizona,32.5995,-114.5491,137612.0,311.0,America/Phoenix
5617322,5718368,49740,1980-12-29,24,Good,CO,1,Yuma,AZ,Arizona,32.5995,-114.5491,137612.0,311.0,America/Phoenix
5617323,5718369,49740,1980-12-30,14,Good,CO,1,Yuma,AZ,Arizona,32.5995,-114.5491,137612.0,311.0,America/Phoenix


In [4]:
# Drops unnecessary columns
columns_to_drop = ['Unnamed: 0', 'density', 'population', 'CBSA Code', 'Number of Sites Reporting', 'state_name', 'lat', 'lng', 'timezone']
AirQuality.drop(columns=columns_to_drop, inplace=True)


In [6]:
# Renames columns
AirQuality.rename(columns={
    'city_ascii': 'City',
    'Defining Parameter': 'Main Pollutant'
}, inplace=True)


In [31]:
# Filters the dataset to include only the specified cities
AirQ = AirQuality[AirQuality['City'].isin(cities)]

In [32]:
# Checks for missing values
missing_values = AirQ.isnull().sum()
print(missing_values)

Date              0
AQI               0
Category          0
Main Pollutant    0
City              0
state_id          0
density           0
dtype: int64


In [33]:
AirQ

Unnamed: 0,Date,AQI,Category,Main Pollutant,City,state_id,density
6001,2022-01-01,53,Moderate,PM2.5,Dallas,TX,1522.0
6002,2022-01-02,34,Good,Ozone,Dallas,TX,1522.0
6003,2022-01-03,38,Good,Ozone,Dallas,TX,1522.0
6004,2022-01-04,41,Good,Ozone,Dallas,TX,1522.0
6005,2022-01-05,41,Good,NO2,Dallas,TX,1522.0
...,...,...,...,...,...,...,...
5603470,1980-12-27,52,Moderate,CO,Seattle,WA,3414.0
5603471,1980-12-28,76,Moderate,CO,Seattle,WA,3414.0
5603472,1980-12-29,97,Moderate,CO,Seattle,WA,3414.0
5603473,1980-12-30,101,Unhealthy for Sensitive Groups,CO,Seattle,WA,3414.0


In [34]:
AirQ.info()

<class 'pandas.core.frame.DataFrame'>
Index: 155729 entries, 6001 to 5603474
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Date            155729 non-null  object 
 1   AQI             155729 non-null  int64  
 2   Category        155729 non-null  object 
 3   Main Pollutant  155729 non-null  object 
 4   City            155729 non-null  object 
 5   state_id        155729 non-null  object 
 6   density         155729 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 9.5+ MB


In [35]:
AirQ['Category'].value_counts()

Category
Moderate                          68117
Good                              56704
Unhealthy for Sensitive Groups    18610
Unhealthy                          7552
Very Unhealthy                     4598
Hazardous                           148
Name: count, dtype: int64

In [36]:
AirQ['Main Pollutant'].value_counts()

Main Pollutant
Ozone    72763
PM2.5    37436
NO2      23940
CO       15376
PM10      6214
Name: count, dtype: int64

In [37]:
# Converts the 'Date' column to datetime
AirQ['Date'] = pd.to_datetime(AirQ['Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AirQ['Date'] = pd.to_datetime(AirQ['Date'])


In [38]:
# Sets the 'Date' column as the index
AirQ.set_index('Date', inplace=True)

In [40]:
def mode(series):
    return series.mode()[0]

In [41]:
# Groups by City and resamples by month
monthly_data = AirQ.groupby('City').resample('M').agg({
    'AQI': 'mean',
    'Category': lambda x: x.mode()[0],  
    'Main Pollutant': lambda x: x.mode()[0],  
    'state_id': 'first',  
    'density': 'first' 
}).reset_index()

In [42]:
# Reformats 'Date' column
monthly_data['Date'] = monthly_data['Date'].dt.to_period('M').dt.to_timestamp()

In [43]:
monthly_data

Unnamed: 0,City,Date,AQI,Category,Main Pollutant,state_id,density
0,Dallas,1980-01-01,38.258065,Good,NO2,TX,1522.0
1,Dallas,1980-02-01,53.344828,Good,Ozone,TX,1522.0
2,Dallas,1980-03-01,73.548387,Moderate,Ozone,TX,1522.0
3,Dallas,1980-04-01,99.533333,Unhealthy for Sensitive Groups,Ozone,TX,1522.0
4,Dallas,1980-05-01,101.096774,Unhealthy for Sensitive Groups,Ozone,TX,1522.0
...,...,...,...,...,...,...,...
5066,Seattle,2021-10-01,43.774194,Good,Ozone,WA,3414.0
5067,Seattle,2021-11-01,43.966667,Good,PM2.5,WA,3414.0
5068,Seattle,2021-12-01,50.096774,Good,PM2.5,WA,3414.0
5069,Seattle,2022-01-01,58.870968,Moderate,PM2.5,WA,3414.0


In [44]:
monthly_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5071 entries, 0 to 5070
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   City            5071 non-null   object        
 1   Date            5071 non-null   datetime64[ns]
 2   AQI             5071 non-null   float64       
 3   Category        5071 non-null   object        
 4   Main Pollutant  5071 non-null   object        
 5   state_id        5071 non-null   object        
 6   density         5071 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 277.4+ KB


In [45]:
# Checks for missing values
monthly_missing_values = monthly_data.isnull().sum()
print(monthly_missing_values)

City              0
Date              0
AQI               0
Category          0
Main Pollutant    0
state_id          0
density           0
dtype: int64


In [46]:
monthly_data['Category'].value_counts()

Category
Moderate                          2611
Good                              1930
Unhealthy for Sensitive Groups     298
Very Unhealthy                     119
Unhealthy                          112
Hazardous                            1
Name: count, dtype: int64

In [47]:
monthly_data['City'].value_counts()

City
New York        509
San Diego       509
Houston         508
Phoenix         508
Jacksonville    507
Philadelphia    507
San Antonio     507
Dallas          506
Seattle         506
Los Angeles     504
Name: count, dtype: int64

In [48]:
monthly_data['Main Pollutant'].value_counts()

Main Pollutant
Ozone    2611
PM2.5    1227
NO2       627
CO        501
PM10      105
Name: count, dtype: int64

In [49]:
# Save to a new CSV file
# monthly_data.to_csv('/Users/rileychisholm/Downloads/US_AQI_monthly.csv', index=False)