In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# SQL using BigQuery

In [2]:
from google.cloud import bigquery

### The first step in the workflow is to create a Client object. As you'll soon see, this Client object will play a central role in retrieving information from BigQuery datasets.

In [3]:
# Create the Client object; every later cell goes through it to talk to BigQuery.
client = bigquery.Client()

Using Kaggle's public dataset BigQuery integration.


In [4]:
# In BigQuery, every dataset is contained in some project.
# For example, the "hacker_news" dataset is contained in the "bigquery-public-data" project.
# Build a reference to that dataset with the DatasetReference constructor (project first,
# then dataset id). Client.dataset() did the same job but is deprecated in
# google-cloud-bigquery in favour of constructing the reference directly.
# The reference only identifies the dataset; it is resolved by the client in a later step.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

In [5]:
# dataset_ref is only a reference to the dataset inside its project; it is not the dataset itself.
# The client is idle at this point, so we ask it to fetch the dataset behind the reference
# using the get_dataset() method and store the result in the `dataset` variable.

dataset = client.get_dataset(dataset_ref)

In [6]:
# Now our dataset is loaded into the `dataset` variable.
# Every dataset is a collection of tables; think of it as a spreadsheet file containing multiple tables,
# each composed of rows and columns.

In [7]:
# List all the tables present in the dataset using the list_tables() method.
# The client is idle again, so we ask it to list the tables; list() collects the results into a Python list.
tables = list(client.list_tables(dataset))

In [8]:
# Print the ID of every table found in the dataset, one per line.
for tbl in tables:
    print(tbl.table_id)

comments
full
full_201510
stories


In [9]:
# Similar to how we fetched a dataset, we can fetch a table. In the code cell below, we fetch the table
# named 'full' in the hacker_news dataset.
# dataset_ref is the reference to the hacker_news dataset in the "bigquery-public-data" project,
# so the table reference is built from it.
table_ref = dataset_ref.table('full')

# Now ask the client to fetch the table behind the reference.
table = client.get_table(table_ref)

In [10]:
# View the schema of the 'full' table: a list with one SchemaField per column.
table.schema

[SchemaField('title', 'STRING', 'NULLABLE', 'Story title', ()),
 SchemaField('url', 'STRING', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'STRING', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('dead', 'BOOLEAN', 'NULLABLE', 'Is dead?', ()),
 SchemaField('by', 'STRING', 'NULLABLE', "The username of the item's author.", ()),
 SchemaField('score', 'INTEGER', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'INTEGER', 'NULLABLE', 'Unix time', ()),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp for the unix time', ()),
 SchemaField('type', 'STRING', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('id', 'INTEGER', 'NULLABLE', "The item's unique id.", ()),
 SchemaField('parent', 'INTEGER', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('descendants', 'INTEGER', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('ranking', 'INTEGER', 'NULLABLE', 'Comment ranking', ()),
 SchemaField(

*Each SchemaField tells us about a specific column (which we also refer to as a field). In order, the information is:*

* The name of the column
* The field type (or datatype) in the column
* The mode of the column ('NULLABLE' means that a column allows NULL values, and is the default)
* A description of the data in that column

For example, the `by` field has the SchemaField:

SchemaField('by', 'STRING', 'NULLABLE', "The username of the item's author.", ())

This tells us:

* the field (or column) is called by,
* the data in this field is strings,
* NULL values are allowed, and
* it contains the usernames corresponding to each item's author.

We can use the list_rows() method to check just the first five lines of the full table to make sure this is right. (Sometimes databases have outdated descriptions, so it's good to check.) This returns a BigQuery RowIterator object that can quickly be converted to a pandas DataFrame with the to_dataframe() method.

In [11]:
# We can use the list_rows() method to check just the first five lines of the full table to make sure this is right.
# (Sometimes databases have outdated descriptions, so it's good to check.)
# This returns a BigQuery RowIterator object that can quickly be converted to a pandas DataFrame with the to_dataframe() method.

In [12]:
# Fetch at most five rows of `table` and convert the resulting row iterator to a pandas DataFrame.
client.list_rows(table, max_results = 5).to_dataframe()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,"My personal red-flag phrase is ""You just need ...",,praptak,,1289079028,2010-11-06 21:30:28+00:00,comment,1877808,1877438.0,,,
1,Ask HN: Explain why apps like FB/Shopify are m...,,I have not worked for large internet companies...,,svrma,3.0,1592775021,2020-06-21 21:30:21+00:00,story,23595571,,2.0,,
2,,,Probably. It&#x27;s not like he is top rate ta...,,paulie_a,,1522785429,2018-04-03 19:57:09+00:00,comment,16748485,16747894.0,,,
3,,,I interpret mastery not by knowing its complet...,,tel,,1297450965,2011-02-11 19:02:45+00:00,comment,2207674,2207537.0,,,
4,,,"In context, he's saying that to anyone justify...",,bmelton,,1324177321,2011-12-18 03:02:01+00:00,comment,3365760,3365750.0,,,


In [13]:
# Preview the first five entries of a single column of the "full" table.
# list_rows() accepts selected_fields, so the preview can be restricted to particular columns.
# table.schema[:1] is a slice holding only the 0th schema field (the upper index is excluded in
# Python slicing); in the schema shown above that is the "title" column, not "by".
client.list_rows(table, selected_fields=table.schema[:1], max_results=5).to_dataframe()

Unnamed: 0,title
0,
1,Ask HN: Explain why apps like FB/Shopify are m...
2,
3,
4,


In [14]:
# Preview the first five entries of the first two columns of the "full" table.
# table.schema[:2] selects schema fields 0 and 1, which are "title" and "url" in the schema shown above.
client.list_rows(table, selected_fields=table.schema[:2], max_results=5).to_dataframe()

Unnamed: 0,title,url
0,,
1,Ask HN: Explain why apps like FB/Shopify are m...,
2,,
3,,
4,,


In [15]:
# Preview the first five entries of only the last column of the "full" table.
# table.schema[-1:] is a one-element slice holding the final schema field ("deleted" in the output below).
client.list_rows(table, selected_fields=table.schema[-1:], max_results=5).to_dataframe()

Unnamed: 0,deleted
0,
1,
2,
3,
4,
