### 1. Getting Started

In [1]:
## Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from google.cloud import bigquery

%config InlineBackend.figure_format = 'retina'
%load_ext google.cloud.bigquery

## FUNCTIONS
def view_tables(dataset):
    """Print the ID of every table in `dataset`, one per line.

    The table listing is fetched through the notebook-level `client`, which
    must already be defined when this function is called. Nothing is returned.
    """
    table_ids = [entry.table_id for entry in client.list_tables(dataset)]
    for table_id in table_ids:
        print(table_id)

def get_schema(table):
    """Return the `schema` of `table`, looked up through the notebook-level `client`."""
    table_info = client.get_table(table)
    return table_info.schema

def query_to_df(query):
    """Run the SQL string `query` with the notebook-level `client` and return
    its result converted with `to_dataframe()`.
    """
    job = client.query(query)
    rows = job.result()
    return rows.to_dataframe()

def view_dataframe(table, n):
    """Return a preview of `table` converted with `to_dataframe()`.

    `n` is passed to `list_rows` as `max_results`, capping how many rows are
    requested. Uses the notebook-level `client`.
    """
    preview_rows = client.list_rows(table, max_results=n)
    return preview_rows.to_dataframe()

In [2]:
## Path to the JSON service-account key file used for authentication
## NOTE(review): '####.json' looks like a redacted placeholder -- point it at a real key file
JSON_FILE_NAME='####.json'

## BigQuery client built from that key file.
## The helper functions (view_tables, get_schema, query_to_df, view_dataframe)
## read this notebook-level `client`, so this cell must run before they are called.
client = bigquery.Client.from_service_account_json(JSON_FILE_NAME)

### 2. Retrieve Data

In [3]:
## List the table IDs in the Hacker News public dataset (printed one per line)
dataset = 'bigquery-public-data.hacker_news'
view_tables(dataset)

comments
full
full_201510
stories


In [4]:
## View the schema of the 'comments' table
## (the bare last expression is displayed as the cell output)
table='bigquery-public-data.hacker_news.comments'
get_schema(table)

[SchemaField('id', 'INTEGER', 'NULLABLE', 'Unique comment ID', (), None),
 SchemaField('by', 'STRING', 'NULLABLE', 'Username of commenter', (), None),
 SchemaField('author', 'STRING', 'NULLABLE', 'Username of author', (), None),
 SchemaField('time', 'INTEGER', 'NULLABLE', 'Unix time', (), None),
 SchemaField('time_ts', 'TIMESTAMP', 'NULLABLE', 'Human readable time in UTC (format: YYYY-MM-DD hh:mm:ss)', (), None),
 SchemaField('text', 'STRING', 'NULLABLE', 'Comment text', (), None),
 SchemaField('parent', 'INTEGER', 'NULLABLE', 'Parent comment ID', (), None),
 SchemaField('deleted', 'BOOLEAN', 'NULLABLE', 'Is deleted?', (), None),
 SchemaField('dead', 'BOOLEAN', 'NULLABLE', 'Is dead?', (), None),
 SchemaField('ranking', 'INTEGER', 'NULLABLE', 'Comment ranking', (), None)]

In [5]:
## Preview the 'comments' table: request at most 5 rows and show them as a DataFrame
table='bigquery-public-data.hacker_news.comments'
view_dataframe(table, 5)

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,2701393,5l,5l,1309184881,2011-06-27 14:28:01+00:00,And the glazier who fixed all the broken windo...,2701243,,,0
1,5811403,99,99,1370234048,2013-06-03 04:34:08+00:00,Does canada have the equivalent of H1B/Green c...,5804452,,,0
2,21623,AF,AF,1178992400,2007-05-12 17:53:20+00:00,"Speaking of Rails, there are other options in ...",21611,,,0
3,10159727,EA,EA,1441206574,2015-09-02 15:09:34+00:00,Humans and large livestock (and maybe even pet...,10159396,,,0
4,2988424,Iv,Iv,1315853580,2011-09-12 18:53:00+00:00,I must say I reacted in the same way when I re...,2988179,,,0


### 3. GROUP BY, HAVING & COUNT

#### 3-1. Query: Select comments that received more than 10 replies

- parent: Parent comment ID
- id: Unique comment ID

In [6]:
## Select parent IDs that received more than 10 replies:
## group comments by `parent`, count the comment IDs in each group, and keep
## only groups whose count is strictly greater than 10 (HAVING COUNT(id) > 10).
## The count is returned under the alias `number_of_posts`.
query = """
        SELECT parent, COUNT(id) AS number_of_posts
        FROM `bigquery-public-data.hacker_news.comments`
        GROUP BY parent
        HAVING COUNT(id) > 10
        """

## Show only the first 3 rows of the result
query_to_df(query).head(3)

Unnamed: 0,parent,number_of_posts
0,7536283,45
1,4053076,242
2,2530963,59


In [7]:
## Total number of rows in the comments table.
## COUNT(*) has no alias here, so the result column gets an auto-generated
## name (shown as `f0_` in the output below).
query = """
        SELECT COUNT(*)
        FROM `bigquery-public-data.hacker_news.comments`
        """

query_to_df(query)

Unnamed: 0,f0_
0,8399417
