### 1. Getting Started

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from google.cloud import bigquery

%config InlineBackend.figure_format = 'retina'
%load_ext google.cloud.bigquery

#### 1-1. Functions

In [2]:
def view_tables(dataset):
    """Print the ID of every table in ``dataset``, one per line.

    Uses the module-level BigQuery ``client``; ``dataset`` is passed
    straight through to ``client.list_tables``. Nothing is returned.
    """
    table_ids = [entry.table_id for entry in client.list_tables(dataset)]
    for table_id in table_ids:
        print(table_id)

def get_schema(table):
    """Return the schema of ``table``.

    Looks the table up with the module-level BigQuery ``client`` and
    returns its ``schema`` attribute unchanged (nothing is printed here).
    """
    table_info = client.get_table(table)
    return table_info.schema

def query_to_df(query):
    """Run ``query`` and return its complete result as a pandas DataFrame.

    The query is submitted through the module-level BigQuery ``client``;
    ``result()`` is called on the job before converting the rows.
    """
    job = client.query(query)
    rows = job.result()
    return rows.to_dataframe()

def view_dataframe(table, n):
    """Return up to ``n`` rows of ``table`` as a pandas DataFrame.

    Rows are read with the module-level BigQuery ``client`` via
    ``list_rows``, with ``n`` passed as ``max_results``.
    """
    preview_rows = client.list_rows(table, max_results=n)
    return preview_rows.to_dataframe()

In [3]:
## Path to the service-account key file (JSON) used for authentication
JSON_FILE_NAME = '####.json'

## BigQuery client authenticated with that service-account key;
## the helper functions in this notebook use it as a global
client = bigquery.Client.from_service_account_json(JSON_FILE_NAME)

In [4]:
## List the tables contained in the public OpenAQ dataset
dataset = "bigquery-public-data.openaq"
view_tables(dataset)

global_air_quality


### 2. View Table Schema

In [5]:
## Inspect the column definitions of the global air quality table
table = "bigquery-public-data.openaq.global_air_quality"
get_schema(table)

[SchemaField('location', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('city', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('country', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('pollutant', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('value', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('unit', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('source_name', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('longitude', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('averaged_over_in_hours', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('location_geom', 'GEOGRAPHY', 'NULLABLE', None, (), None)]

In [6]:
## View first n rows of table
## The bare table name is the last dot-separated part of the fully-qualified ID.
## (A fixed offset such as `table[35:]` cuts into the name and prints
## "air_quality" instead of "global_air_quality".)
print(f"> Table: {table.rsplit('.', 1)[-1]}")
view_dataframe(table, 3)

> Table: air_quality


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Płock, ul. Reja",Płock,PL,bc,0.3,2022-05-09 18:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
1,"Legnica, al. Rzeczypospolitej",Legnica,PL,bc,0.67909,2022-05-16 05:00:00+00:00,µg/m³,GIOS,1.0,51.204503,16.180513,POINT(51.204503 1)
2,"Włocławek, ul. Okrzei",Włocławek,PL,bc,2.64,2022-04-29 06:00:00+00:00,µg/m³,GIOS,1.0,52.658467,19.059314,POINT(52.658467 1)


### 3. SELECT, FROM & WHERE

#### 3-1. Query: Select items from the city column where the country column is equal to 'US'

In [7]:
## City of every row whose country is 'US'
query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

## Run the query and show three randomly sampled rows of the result
query_to_df(query).sample(3)

Unnamed: 0,city
1050437,Salt Lake City
404109,Vallejo-Fairfield
941883,Columbus


In [8]:
## Row counts per city, keeping the three most frequent
(
    query_to_df(query)['city']
    .value_counts()
    .head(3)
)

Phoenix-Mesa-Scottsdale             39414
Los Angeles-Long Beach-Santa Ana    27479
Riverside-San Bernardino-Ontario    26887
Name: city, dtype: int64

In [9]:
## Number of distinct (non-null) city names returned by the query
print(f"Number of cities: {query_to_df(query)['city'].nunique(dropna=True)}")

Number of cities: 826


#### 3-2. Query: Select items from the city and country columns where the country column is equal to 'US'

In [10]:
## City and country of every row whose country is 'US'
query = """
        SELECT city, country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

## Run the query and show three randomly sampled rows of the result
query_to_df(query).sample(3)

Unnamed: 0,city,country
327327,UINTAH,US
703535,Charleston-North Charleston,US
206375,Athens,US


#### 3-3. Query: Select all columns where the country column is equal to 'US'

In [11]:
## Every column of every row whose country is 'US'
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

## Run the query and show the first three rows of the result
query_to_df(query).head(3)

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,Seattle-10th & Welle,Seattle-Tacoma-Bellevue,US,bc,0.6,2022-04-09 03:00:00+00:00,µg/m³,AirNow,1.0,47.597222,-122.319722,POINT(47.597222 1)
1,E Providence,Providence-New Bedford-Fall River,US,bc,0.18,2022-05-07 01:00:00+00:00,µg/m³,AirNow,1.0,41.840302,-71.361702,POINT(41.840302 1)
2,Portland Humboldt Sc,Portland-Vancouver-Beaverton,US,bc,0.06,2022-05-07 12:00:00+00:00,µg/m³,AirNow,1.0,45.558081,-122.670985,POINT(45.558081 1)


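#### 3-4. Query: Select the score and title columns from the Hacker News `full` table where the type column is equal to "job", limited to 3 rows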

In [12]:
## Score and title of Hacker News items of type "job", capped at three rows
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        LIMIT 3
        """

## Run the query and display the full (three-row) result
query_to_df(query)

Unnamed: 0,score,title
0,1,Work at Socialcam to become the master of iOS ...
1,1,Go in Production? Streak (YC S11) hiring Go en...
2,1,Interviewstreet (YC S11) looking for marketing...
