In [1]:
from google.cloud import bigquery

In [3]:
client = bigquery.Client()
dataset_ref = client.dataset("github_repos", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)

license_ref = dataset_ref.table("licenses")
license_table = client.get_table(license_ref)

client.list_rows(license_table, max_results=5).to_dataframe()

Unnamed: 0,repo_name,license
0,azuredream/chat_server-client,artistic-2.0
1,Egyptian19/JemCraft,artistic-2.0
2,ZioRiP/cookie,artistic-2.0
3,ajs/perl6-log,artistic-2.0
4,JohanPotgieter/Internet,artistic-2.0


In [4]:
files_ref = dataset_ref.table("sample_files")
files_table = client.get_table(files_ref)

client.list_rows(files_table, max_results=5).to_dataframe()

Unnamed: 0,repo_name,ref,path,mode,id,symlink_target
0,git/git,refs/heads/master,RelNotes,40960,62615ffa4e97803da96aefbc798ab50f949a8db7,Documentation/RelNotes/2.10.0.txt
1,np/ling,refs/heads/master,tests/success/plug_compose.t/plug_compose.ll,40960,0c1605e4b447158085656487dc477f7670c4bac1,../../../fixtures/all/plug_compose.ll
2,np/ling,refs/heads/master,fixtures/strict-par-success/parallel_assoc_lef...,40960,b59bff84ec03d12fabd3b51a27ed7e39a180097e,../all/parallel_assoc_left.ll
3,np/ling,refs/heads/master,fixtures/sequence/parallel_assoc_2tensor2_left.ll,40960,f29523e3fb65702d99478e429eac6f801f32152b,../all/parallel_assoc_2tensor2_left.ll
4,np/ling,refs/heads/master,fixtures/success/my_dual.ll,40960,38a3af095088f90dfc956cb990e893909c3ab286,../all/my_dual.ll


In [7]:
query1 = """
        SELECT L.license, COUNT(1) AS number_of_files
        FROM `bigquery-public-data.github_repos.sample_files` AS sf
        INNER JOIN `bigquery-public-data.github_repos.licenses` AS L
                ON sf.repo_name = L.repo_name
        GROUP BY L.license
        ORDER BY number_of_files DESC
        """

In [6]:
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed = 10**10)

def get_query(my_query):
    my_query_job = client.query(my_query, job_config=safe_config)
    my_query_res = my_query_job.to_dataframe()
    return my_query_res

In [9]:
my_res = get_query(query1)
my_res

Unnamed: 0,license,number_of_files
0,mit,20432844
1,gpl-2.0,16867410
2,apache-2.0,7123968
3,gpl-3.0,4936531
4,bsd-3-clause,2943900
5,agpl-3.0,1293773
6,lgpl-2.1,793054
7,bsd-2-clause,694767
8,lgpl-3.0,564433
9,mpl-2.0,473078


## Exercise

In [10]:
dataset_ref = client.dataset("stackoverflow", project='bigquery-public-data')
dataset = client.get_dataset(dataset_ref)

In [12]:
table_list = [i.table_id for i in client.list_tables(dataset)]
table_list

['badges',
 'comments',
 'post_history',
 'post_links',
 'posts_answers',
 'posts_moderator_nomination',
 'posts_orphaned_tag_wiki',
 'posts_privilege_wiki',
 'posts_questions',
 'posts_tag_wiki',
 'posts_tag_wiki_excerpt',
 'posts_wiki_placeholder',
 'stackoverflow_posts',
 'tags',
 'users',
 'votes']

In [13]:
answers_table_ref = dataset_ref.table("posts_answers")
answers_table = client.get_table(answers_table_ref)
client.list_rows(answers_table, max_results=5).to_dataframe()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,58545647,,"<p>You can implement the <a href=""https://docs...",,,0,,2019-10-24 16:35:51.947000+00:00,,2019-10-24 16:35:51.947000+00:00,,,,,2541560,58545487,2,0,,
1,58545649,,"<p>You may be having an issue with the ""stage""...",,,0,,2019-10-24 16:35:59.377000+00:00,,2019-10-24 16:35:59.377000+00:00,,,,,4434749,56565949,2,0,,
2,58545664,,<p>I am not sure why you need that exactly but...,,,0,,2019-10-24 16:36:39.870000+00:00,,2019-10-24 16:36:39.870000+00:00,,,,,8343843,58545068,2,0,,
3,58545675,,<pre><code>Object delegateObj = readField(valu...,,,1,,2019-10-24 16:37:20.207000+00:00,,2019-10-24 16:37:20.207000+00:00,,,,,12269981,57195785,2,0,,
4,58545677,,<p>I had to remove the line</p>\n\n<pre><code>...,,,0,,2019-10-24 16:37:51.253000+00:00,,2019-10-24 16:37:51.253000+00:00,,,,,1775258,58428566,2,0,,


In [24]:
questions_table_ref = dataset_ref.table("posts_questions")
questions_table = client.get_table(questions_table_ref)
client.list_rows(questions_table, max_results=5).to_dataframe()

Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,31098630,IntelliJ IDEA - JPanel in JScrollpane no conte...,<p><br>\nI have a JScrollPane which needs to b...,,1,0,,2015-06-28 10:18:28.330000+00:00,,2015-06-28 11:21:40.713000+00:00,NaT,,,,4314870,,1,0,java|swing|intellij-idea|jframe,256
1,31102757,"SDL2: I Receive Segmentation Fault on Line 5, ...",<p>When I run the program I get a segmentation...,,1,0,,2015-06-28 17:40:02.090000+00:00,,2015-07-01 10:04:04.650000+00:00,2015-07-01 10:03:25.343000+00:00,,2564301.0,,4969570,,1,-2,c++|c|gcc|sdl|sdl-2,256
2,31104792,angularjs with nodejs the ng-repeat doesn´t work,"<p>This is my angular code <a href=""http://pas...",31104833.0,1,0,,2015-06-28 21:10:19.913000+00:00,,2015-06-28 21:15:17.693000+00:00,NaT,,,,5059050,,1,0,angularjs|node.js|socket.io|ng-repeat,256
3,31106810,How to know the cache line of A8 chip is 64 by...,<p>How the get the information of the apple ch...,31154387.0,1,0,,2015-06-29 02:11:34.657000+00:00,,2015-07-01 06:50:46.390000+00:00,2015-06-29 02:13:57.387000+00:00,,87197.0,,1468120,,1,0,ios|processor,256
4,31112648,Projection by interface,<p>I'm wondering what's the best way to perfor...,,1,0,,2015-06-29 09:53:48.253000+00:00,,2015-06-29 10:25:17.367000+00:00,2015-06-29 10:15:04.637000+00:00,,1501208.0,,498298,,1,2,c#|mongodb|mongodb-.net-driver,256


In [15]:
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed= 10**10)

In [23]:
questions_query = """
                SELECT id, title, owner_user_id
                FROM `bigquery-public-data.stackoverflow.posts_questions`
                WHERE tags LIKE '%bigquery%'
                """
questions_query_ans = get_query(questions_query)
questions_query_ans.head()

Unnamed: 0,id,title,owner_user_id
0,31255937,Bigquery Free Trial limitations,5084442.0
1,31464373,Trouble Looking For Events WITHIN a Session In...,2114472.0
2,31240582,bq show cames to return no Total Rows and Tota...,5084312.0
3,31526966,Semijoin expression must be a part of logical AND,2800623.0
4,31319016,How to get Day name in Google BigQuery,4961403.0


In [21]:
query_2 = """
        SELECT a.id, a.body, a.owner_user_id
        FROM `bigquery-public-data.stackoverflow.posts_questions` as q
        INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` as a
            ON q.id = a.parent_id
        WHERE q.tags LIKE '%bigquery%'
        """

query_ans_2 = get_query(query_2)
query_ans_2.head()

BadRequest: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/kaggle-project-271505/queries/27c983c6-c0be-437f-9b2a-ee411806fc27?maxResults=0&location=US: Query exceeded limit for bytes billed: 10000000000. 23881318400 or higher required.

(job ID: 27c983c6-c0be-437f-9b2a-ee411806fc27)

                        -----Query Job SQL Follows-----                        

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:        SELECT a.id, a.body, a.owner_user_id
   3:        FROM `bigquery-public-data.stackoverflow.posts_questions` as q
   4:        INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` as a
   5:            ON q.id = a.parent_id
   6:        WHERE q.tags LIKE '%bigquery%'
   7:        
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |

In [25]:
answers_query = """
                SELECT a.id, a.body, a.owner_user_id
                FROM `bigquery-public-data.stackoverflow.posts_questions` AS q 
                INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                    ON q.id = a.parent_id
                WHERE q.tags LIKE '%bigquery%'
                """
# Set up the query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
answers_query_job = client.query(answers_query, job_config=safe_config) # Your code goes here

# API request - run the query, and return a pandas DataFrame
answers_results = answers_query_job.to_dataframe()# Your code goes here

# Preview results
print(answers_results.head())

BadRequest: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/kaggle-project-271505/queries/2ed3e0fd-c230-481a-83ad-16537b3c37cc?maxResults=0&location=US: Query exceeded limit for bytes billed: 10000000000. 23881318400 or higher required.

(job ID: 2ed3e0fd-c230-481a-83ad-16537b3c37cc)

                            -----Query Job SQL Follows-----                            

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:                SELECT a.id, a.body, a.owner_user_id
   3:                FROM `bigquery-public-data.stackoverflow.posts_questions` AS q 
   4:                INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
   5:                    ON q.id = a.parent_id
   6:                WHERE q.tags LIKE '%bigquery%'
   7:                
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |

In [47]:
topic = 'bigquery'
bigquery_experts_query = """
                SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
                FROM `bigquery-public-data.stackoverflow.posts_questions` AS q 
                INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                    ON q.id = a.parent_id
                WHERE q.tags CONTAINS @topic
                GROUP BY user_id
                """

bigquery_experts_ans = get_query(bigquery_experts_query)
bigquery_experts_ans.head()

BadRequest: 400 Syntax error: Unexpected keyword CONTAINS at [6:30]

(job ID: 9d2b6acb-cc22-45b9-a49a-6debbeb6713c)

                            -----Query Job SQL Follows-----                            

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:                SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
   3:                FROM `bigquery-public-data.stackoverflow.posts_questions` AS q 
   4:                INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
   5:                    ON q.id = a.parent_id
   6:                WHERE q.tags CONTAINS @topic
   7:                GROUP BY user_id
   8:                
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |

In [44]:
def expert_finder(topic, my_client):
    my_expert_query = """
                    SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
                    FROM `bigquery-public-data.stackoverflow.posts_questions` AS q 
                    INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                        ON q.id = a.parent_id
                    WHERE q.tags LIKE '%@topic%'
                    GROUP BY a.owner_user_id
                    """    
    
    safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
    my_query_job = my_client.query(my_expert_query, job_config=safe_config)
    my_ans = my_query_job.to_dataframe()
    return my_ans

In [45]:
my_topic = "bigquery"
my_ans = expert_finder(my_topic, client)
my_ans.head()

Unnamed: 0,user_id,number_of_answers
