In [4]:
from google.cloud import bigquery

In [5]:
# Create the BigQuery client object; the cells that follow use it for every
# dataset, table, and query request.
client = bigquery.Client()

Using Kaggle's public dataset BigQuery integration.


In [6]:
# Build a reference to the public Hacker News dataset.
# `Client.dataset()` is deprecated in google-cloud-bigquery, so the reference
# is constructed directly (project first, then dataset id).
dataset_ref = bigquery.DatasetReference('bigquery-public-data', 'hacker_news')

# Fetch the dataset itself through the client.
dataset = client.get_dataset(dataset_ref)

In [7]:
# Materialise the dataset's table listing, then print one table id per line.
tables = list(client.list_tables(dataset))
for table_item in tables:
    print(table_item.table_id)

comments
full
full_201510
stories


In [8]:
# Two tables are needed for this analysis: `comments` and `stories`.
# Start by building a reference to `comments` and fetching that table.
table_ref = dataset_ref.table('comments')
table = client.get_table(table_ref)

In [9]:
# Preview the first five rows of the `comments` table as a DataFrame.
comments_preview = client.list_rows(table, max_results=5)
comments_preview.to_dataframe()

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,2701393,5l,5l,1309184881,2011-06-27 14:28:01+00:00,And the glazier who fixed all the broken windo...,2701243,,,0
1,5811403,99,99,1370234048,2013-06-03 04:34:08+00:00,Does canada have the equivalent of H1B/Green c...,5804452,,,0
2,21623,AF,AF,1178992400,2007-05-12 17:53:20+00:00,"Speaking of Rails, there are other options in ...",21611,,,0
3,10159727,EA,EA,1441206574,2015-09-02 15:09:34+00:00,Humans and large livestock (and maybe even pet...,10159396,,,0
4,2988424,Iv,Iv,1315853580,2011-09-12 18:53:00+00:00,I must say I reacted in the same way when I re...,2988179,,,0


In [10]:
# Repeat the same steps for the `stories` table: fetch it, then preview
# its first five rows as a DataFrame.
table_ref = dataset_ref.table('stories')
table = client.get_table(table_ref)
stories_preview = client.list_rows(table, max_results=5)
stories_preview.to_dataframe()

Unnamed: 0,id,by,score,time,time_ts,title,url,text,deleted,dead,descendants,author
0,6940813,sarath237,0,1387536270,2013-12-20 10:44:30+00:00,Sheryl Brindo Hot Pics,http://www.youtube.com/watch?v=ym1cyxneB0Y,Sheryl Brindo Hot Pics,,True,,sarath237
1,6991401,123123321321,0,1388508751,2013-12-31 16:52:31+00:00,Are you people also put off by the culture of ...,,They&#x27;re pretty explicitly &#x27;startup f...,,True,,123123321321
2,1531556,ssn,0,1279617234,2010-07-20 09:13:54+00:00,New UI for Google Image Search,http://googlesystem.blogspot.com/2010/07/googl...,Again following on Bing's lead.,,,0.0,ssn
3,5012398,hoju,0,1357387877,2013-01-05 12:11:17+00:00,Historic website screenshots,http://webscraping.com/blog/Generate-website-s...,Python script to generate historic screenshots...,,,0.0,hoju
4,7214182,kogir,0,1401561740,2014-05-31 18:42:20+00:00,Placeholder,,Mind the gap.,,,0.0,kogir


# Question

## [1]. Pull information from the stories and comments tables to create a table showing all stories posted on January 1, 2012, along with the corresponding number of comments. We use a LEFT JOIN so that the results include stories that didn't receive any comments.

In [11]:
### Approach
##### If someone has commented, it is certain that the comment was made on someone's story,
#### but it is also possible for a story to have no comments at all.
#### Therefore 'stories' will be our left table and 'comments' our right table.

In [18]:
# Performance tip from the author: pre-aggregate the right-hand table of the
# LEFT JOIN inside a CTE (here: comment counts per parent), then join the
# stories against that smaller result.

query = """
             WITH c AS
             (
             SELECT parent, COUNT(*) as num_comments
             FROM `bigquery-public-data.hacker_news.comments` 
             GROUP BY parent
             )
             
             SELECT s.id as story_id, s.by, s.title, c.num_comments
             FROM `bigquery-public-data.hacker_news.stories` AS s
             LEFT JOIN c
             ON s.id = c.parent
             WHERE EXTRACT(DATE FROM s.time_ts) = '2012-01-01'
             ORDER BY c.num_comments DESC
    
        """

# Submit the query, wait for it to finish, and load the rows into a DataFrame.
join_job = client.query(query)
join_result = join_job.result().to_dataframe()
join_result.head()

Unnamed: 0,story_id,by,title,num_comments
0,3412900,whoishiring,Ask HN: Who is Hiring? (January 2012),154.0
1,3412901,whoishiring,Ask HN: Freelancer? Seeking freelancer? (Janua...,97.0
2,3412643,jemeshsu,Avoid Apress,30.0
3,3414012,ramanujam,Impress.js - a Prezi like implementation using...,27.0
4,3412891,Brajeshwar,"There's no shame in code that is simply ""good ...",27.0


### Since the results are ordered by the num_comments column, stories without comments appear at the end of the DataFrame. (Remember that NaN stands for "not a number".)

In [19]:
join_result.tail()

Unnamed: 0,story_id,by,title,num_comments
439,3413432,unixroot,Chinese Government taking strong step against ...,
440,3412892,asjustas,Ask HN: What think about time planing?,
441,3412786,halleyrobinsons,GoldforCash,
442,3413524,daedalus2027,Compiling Doom3 in powerpc (in spanish),
443,3412297,clb22,AndesBeat first month metrics & best news of L...,


## [2]. Write a query to select all usernames corresponding to users who wrote stories or comments on January 1, 2014. We use UNION DISTINCT (instead of UNION ALL) to ensure that each user appears in the table at most once.

In [20]:
# Usernames of everyone who wrote a comment or a story on 2014-01-01;
# UNION DISTINCT keeps each username at most once.
union_query = """
              SELECT c.by
              FROM `bigquery-public-data.hacker_news.comments` AS c
              WHERE EXTRACT(DATE FROM c.time_ts) = '2014-01-01'
              UNION DISTINCT
              SELECT s.by
              FROM `bigquery-public-data.hacker_news.stories` AS s
              WHERE EXTRACT(DATE FROM s.time_ts) = '2014-01-01'
              """

# Submit the query, wait for completion, and convert the rows to a pandas DataFrame.
union_job = client.query(union_query)
union_result = union_job.result().to_dataframe()
union_result.head()

Unnamed: 0,by
0,vitd
1,gpvos
2,mbell
3,znowi
4,GSimon


In [21]:
# Each row of the DataFrame is one distinct user, so the row count is the
# number of users who posted on January 1, 2014.

union_result.shape[0]

2282