In [1]:
# importing the bigquery library from google.cloud
from google.cloud import bigquery

In [2]:
# create a BigQuery client object
# remember: every later request for data in this notebook goes through this client
client = bigquery.Client()

Using Kaggle's public dataset BigQuery integration.


In [3]:
# now that the client is initialised,
# let's point at the data in the 'bigquery-public-data' project;
# we will use the 'openaq' dataset (data regarding air quality) to perform some SQL operations.
# remember that 'bigquery-public-data' is the project under which the 'openaq' dataset is present.
# The reference is built with the DatasetReference constructor because
# Client.dataset() is deprecated in google-cloud-bigquery (since 1.24.0).
dataset_ref = bigquery.DatasetReference('bigquery-public-data', 'openaq')

In [5]:
# fetch the dataset itself (not a table) using the dataset reference
dataset = client.get_dataset(dataset_ref)

In [10]:
# collect every table listed for the dataset, then show each table's id
tables = [*client.list_tables(dataset)]

for table_item in tables:
    print(table_item.table_id)

global_air_quality


In [11]:
# so we have one table present in our 'openaq' dataset

In [14]:
# now we want to fetch the information of the table 'global_air_quality':
# first build a reference to the table from the dataset reference,
# then ask the client to fetch the table that the reference points to

table_ref = dataset_ref.table('global_air_quality')

table = client.get_table(table_ref)

In [16]:
# inspect the table's schema: one SchemaField per column (name, type, mode, description)
table.schema

[SchemaField('location', 'STRING', 'NULLABLE', 'Location where data was measured', ()),
 SchemaField('city', 'STRING', 'NULLABLE', 'City containing location', ()),
 SchemaField('country', 'STRING', 'NULLABLE', 'Country containing measurement in 2 letter ISO code', ()),
 SchemaField('pollutant', 'STRING', 'NULLABLE', 'Name of the Pollutant being measured. Allowed values: PM25, PM10, SO2, NO2, O3, CO, BC', ()),
 SchemaField('value', 'FLOAT', 'NULLABLE', 'Latest measured value for the pollutant', ()),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'The datetime at which the pollutant was measured, in ISO 8601 format', ()),
 SchemaField('unit', 'STRING', 'NULLABLE', 'The unit the value was measured in coded by UCUM Code', ()),
 SchemaField('source_name', 'STRING', 'NULLABLE', 'Name of the source of the data', ()),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', 'Latitude in decimal degrees. Precision >3 decimal points.', ()),
 SchemaField('longitude', 'FLOAT', 'NULLABLE', 'Longitude in d

In [19]:
# preview the first 5 rows of the table 'global_air_quality' as a pandas DataFrame

client.list_rows(table, max_results = 5).to_dataframe()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,co,910.0,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
1,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,no2,131.87,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
2,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,o3,15.57,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
3,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,pm25,45.62,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
4,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,so2,4.49,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25


## Use of BigQuery with SQL

In [20]:
# let's suppose we want to fetch the 'value' column from the 'global_air_quality' table of the 'openaq' dataset,
# which is present in the 'bigquery-public-data' project, keeping only the rows where city is 'Bengaluru'

query = """ select value from `bigquery-public-data.openaq.global_air_quality` where city = 'Bengaluru' """

### Submitting the Query to the dataset

In [22]:
# submit the SQL text to BigQuery through our client and keep the returned query job
query_job = client.query(query)

In [23]:
# convert the query job's result into a pandas DataFrame and store it in our variable
values = query_job.to_dataframe()

In [26]:
# look at the first few rows of the query result
values.head()

Unnamed: 0,value
0,910.0
1,131.87
2,15.57
3,45.62
4,4.49


In [27]:
# the query result is an ordinary pandas DataFrame, so pandas filtering works on it directly;
# note the bound is inclusive (value >= 50) even though the variable name says "greater than"
values_greater_than_50 = values.loc[values['value'].ge(50)]

In [28]:
# display the filtered rows (the original row index is kept, so index labels have gaps)
values_greater_than_50

Unnamed: 0,value
0,910.0
1,131.87
5,840.0
6,166.55
10,1510.0
12,218.44
14,330.0
16,64.75
18,970.0
22,620.0


In [47]:
# select the 'city' column for every row whose country code is 'IN'
query = """ select city from `bigquery-public-data.openaq.global_air_quality` where country = 'IN' """

In [48]:
# submit the new query; this rebinds query_job, replacing the job from the earlier query
query_job = client.query(query)

In [49]:
# load the query result into a pandas DataFrame
cities = query_job.to_dataframe()

In [54]:
# count how many rows each city has and show the 10 most frequent cities
cities['city'].value_counts().head(10)

Delhi        247
Bengaluru     44
Kolkata       41
Patna         39
Faridabad     30
Lucknow       29
Jaipur        24
Mumbai        24
Noida         23
Hyderabad     21
Name: city, dtype: int64

### Selecting the whole table with a SQL query and then working with it as a pandas DataFrame

In [55]:
# no column list and no where clause: select every column of every row in the table
query = """select * from `bigquery-public-data.openaq.global_air_quality` """

In [56]:
# submit the query to BigQuery through the client
query_job = client.query(query)

In [57]:
# load the full query result into a pandas DataFrame
df = query_job.to_dataframe()

In [58]:
# (number of rows, number of columns) of the DataFrame
df.shape

(21249, 11)

In [59]:
# look at the first few rows of the full table
df.head()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours
0,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,co,910.0,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
1,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,no2,131.87,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
2,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,o3,15.57,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
3,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,pm25,45.62,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25
4,"BTM Layout, Bengaluru - KSPCB",Bengaluru,IN,so2,4.49,2018-02-22 03:00:00+00:00,µg/m³,CPCB,12.912811,77.60922,0.25


In [60]:
# summary statistics for the numeric columns
# NOTE(review): in the output below 'value' ranges from -335582.0 to 825736.0 and
# 'averaged_over_in_hours' has a lower count than the other columns (missing values);
# the raw data presumably contains bad readings, so confirm and clean before analysing
df.describe()

Unnamed: 0,value,latitude,longitude,averaged_over_in_hours
count,21249.0,21249.0,21249.0,19821.0
mean,72.954365,36.787785,-0.570696,1.604831
std,6178.069224,20.290061,64.390133,3.995803
min,-335582.0,-72.0117,-161.767,0.25
25%,1.5,32.81148,-16.883057,1.0
50%,9.0,42.006,7.457463,1.0
75%,28.762,48.67807,20.790007,1.0
max,825736.0,78.90669,153.402,59.0
