In [2]:
from google.cloud import bigquery
import pandas as pd

In [3]:
# Create the BigQuery client that every later cell uses to talk to the service.
# NOTE(review): no project or credentials are passed here, so they are presumably
# picked up from the environment — confirm before running elsewhere.
client = bigquery.Client()

In [5]:
# Reference the public OpenAQ dataset. The reference is built directly with
# DatasetReference(project, dataset_id) because Client.dataset() is deprecated
# in google-cloud-bigquery (since 1.24); the resulting object is equivalent.
data_ref = bigquery.DatasetReference("bigquery-public-data", "openaq")
# Fetch the dataset object itself; kept under the name `data` for later cells.
data = client.get_dataset(data_ref)

# Materialise the table listing once so it can be reused, then print every table id.
tables = list(client.list_tables(data_ref))
for listed_table in tables:
    print(listed_table.table_id)

global_air_quality


In [6]:
# Build a reference to 'global_air_quality' (the only table printed by the listing
# cell above), then fetch the table object; its row count and schema are inspected
# in later cells.
table_ref = data_ref.table('global_air_quality')
table = client.get_table(table_ref)

In [7]:
# Number of rows in the table (shown as the cell output).
table.num_rows

23086

In [14]:
# SQL for the first analysis: every column of every measurement whose country
# code is 'US'. The string is only defined here; later cells submit it.
query = """
    SELECT *
    FROM `bigquery-public-data.openaq.global_air_quality`
    WHERE country='US'
    """

In [17]:
# Byte budget (20 * 10^6); a later cell uses it as the billing cap.
TWENTY_MB = 20 * 10**6

# Submit the query with dry_run enabled so the job reports how many bytes it
# would process.
dry_run_config = bigquery.QueryJobConfig()
dry_run_config.dry_run = True
dry_run_query_job = client.query(query, job_config=dry_run_config)

size_message = "This query will process {} bytes."
print(size_message.format(dry_run_query_job.total_bytes_processed))

This query will process 2215620 bytes.


In [20]:
# Job configuration that caps the bytes billed at TWENTY_MB; it is reused by the
# exercise cells further down.
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = TWENTY_MB

# Run the current `query` under that cap and load the result into a DataFrame.
safe_query_job = client.query(query, job_config=safe_config)
job_post_poll = safe_query_job.to_dataframe()

In [19]:
# Column definitions of the table: each field's name, type, mode and description.
table.schema

[SchemaField('location', 'STRING', 'NULLABLE', 'Location where data was measured', ()),
 SchemaField('city', 'STRING', 'NULLABLE', 'City containing location', ()),
 SchemaField('country', 'STRING', 'NULLABLE', 'Country containing measurement in 2 letter ISO code', ()),
 SchemaField('pollutant', 'STRING', 'NULLABLE', 'Name of the Pollutant being measured. Allowed values: PM25, PM10, SO2, NO2, O3, CO, BC', ()),
 SchemaField('value', 'FLOAT', 'NULLABLE', 'Latest measured value for the pollutant', ()),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'The datetime at which the pollutant was measured, in ISO 8601 format', ()),
 SchemaField('unit', 'STRING', 'NULLABLE', 'The unit the value was measured in coded by UCUM Code', ()),
 SchemaField('source_name', 'STRING', 'NULLABLE', 'Name of the source of the data', ()),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', 'Latitude in decimal degrees. Precision >3 decimal points.', ()),
 SchemaField('longitude', 'FLOAT', 'NULLABLE', 'Longitude in d

In [21]:
# Number of rows returned for the US query.
job_post_poll.shape[0]

3709

In [23]:
# Mean of the `value` column over all rows of the US result.
# NOTE(review): the rows cover several pollutants and at least two units (ppm and
# µg/m³ both appear in the data), so this single average is not physically
# meaningful — consider grouping by pollutant and unit.
job_post_poll["value"].mean()

3.6840586411431837

In [24]:
# Preview the first five rows of the US result.
job_post_poll.head(n=5)

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,Portsmouth,Boston-Cambridge-Quincy,US,no2,7.0,2020-02-26 13:00:00+00:00,µg/m³,DEFRA,43.07537,-70.74802,1.0
1,Aberdeen,BROWN,US,no2,13.0,2020-03-16 15:00:00+00:00,µg/m³,DEFRA,45.468597,-98.494064,1.0
2,Aberdeen,BROWN,US,o3,39.0,2020-03-16 17:00:00+00:00,µg/m³,DEFRA,45.468597,-98.494064,8.0
3,Freeport South Avenue I C1012,Houston,US,so2,0.0001,2016-03-06 18:00:00+00:00,ppm,Texas,28.96443,-95.35483,1.0
4,Lake Jackson C1016,Houston,US,no2,-0.0004,2016-03-06 18:00:00+00:00,ppm,Texas,29.043758,-95.472946,1.0


In [25]:
## Exercise: countries reporting in ppm, and measurements with a value of exactly 0

In [28]:
# Exercise query: which countries have at least one measurement reported in ppm?
# DISTINCT collapses the result to one row per country code.
query = """
        SELECT DISTINCT country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE unit = 'ppm'
        """

# Reuses safe_config from the earlier cell, so the same bytes-billed cap applies.
safe_query_job = client.query(query, job_config=safe_config)
new_df = safe_query_job.to_dataframe()

In [29]:
# Row count of the most recent exercise result held in new_df.
new_df.shape[0]

21

In [30]:
# Exercise query: all columns of every measurement whose value is exactly 0.
# NOTE(review): this rebinds `query`, `safe_query_job` and `new_df`, replacing the
# previous exercise's result — cells that read new_df now depend on which of the
# two cells ran last.
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE value = 0
        """

# Reuses safe_config from the earlier cell, so the same bytes-billed cap applies.
safe_query_job = client.query(query, job_config=safe_config)
new_df = safe_query_job.to_dataframe()

In [31]:
# Row count of the most recent exercise result held in new_df.
new_df.shape[0]

854

In [32]:
# Preview the first five rows of the current new_df.
new_df.head(n=5)

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,Victoria Memorial - WBSPCB,Kolkata,IN,pm25,0.0,2017-10-16 20:45:00+00:00,µg/m³,CPCB,22.572645,88.36389,0.25
1,"Rabindra Bharati University, Kolkata - WBSPCB",Kolkata,IN,so2,0.0,2017-10-28 14:30:00+00:00,µg/m³,CPCB,22.627874,88.3804,0.25
2,"Końskie, MOBILNA",Końskie,PL,pm10,0.0,2018-12-21 13:00:00+00:00,µg/m³,GIOS,51.189526,20.408892,
3,"Końskie, MOBILNA",Końskie,PL,pm25,0.0,2018-12-21 13:00:00+00:00,µg/m³,GIOS,51.189526,20.408892,
4,NałęczówMOB,Nałęczów,PL,bc,0.0,2019-08-27 04:00:00+00:00,µg/m³,GIOS,51.28493,22.210241,


In [33]:
# Count how many rows of new_df each country contributes, most frequent first.
new_df["country"].value_counts()

US    292
CL    116
ES     86
PT     55
IN     49
AU     48
TR     29
AT     24
FR     23
CA     22
TH     15
BE     14
IL     14
MX      9
DE      7
IT      6
PE      6
PL      5
GB      5
DK      5
RU      3
BR      3
BA      2
AR      2
IS      2
NO      2
CN      2
KE      2
HU      1
MT      1
FI      1
MK      1
MN      1
AD      1
Name: country, dtype: int64