In [282]:
# imports
import requests
import json
import csv
import time
import datetime
from string import Template
from Config import *
import sqlite3
from sqlite3 import Error

In [283]:
# Variables: request headers and search thresholds shared by the cells below
headers = {"Authorization": API_TOKEN}  # API_TOKEN is presumably provided by the star-import from Config -- TODO confirm
min_stars = 237  # lower bound of the star range handed to query_filter
max_stars = 10000  # NOTE(review): this global is not passed to query_filter at its visible call site
last_activity = 90 # repository must have been pushed to within the last __ days
created = 364 * 4 # repository must have been created within the last __ days (NOTE(review): 364 per year, not 365 -- confirm)
min_pull_num = 0 # amount of pull requests a repository needs (not read by any visible cell)
watchers = 0 # amount of watchers a repository needs (not read by any visible cell)

In [284]:
# Builds the query filter string compatible with the GitHub search syntax
def query_filter( min_stars, last_activity, created, max_stars=1000 ):
    """Build a GitHub repository search filter string.

    :param min_stars: lower bound (inclusive) of the star range
    :param last_activity: only match repositories pushed to within this many days
    :param created: only match repositories created within this many days
    :param max_stars: upper bound (inclusive) of the star range; defaults to
        1000, the value that used to be hard-coded here
    :return: the search filter string, sorted by ascending star count
    """
    # Sample the clock once so both cut-off dates are relative to the same instant.
    now = datetime.datetime.now()
    pushed_since = now - datetime.timedelta( days=last_activity )
    created_since = now - datetime.timedelta( days=created )

    # %Y yields the full four-digit year, so no century prefix has to be glued on.
    return (f'is:public archived:false fork:false stars:{min_stars}..{max_stars} '
            f'pushed:{pushed_since:%Y-%m-%d}..* created:{created_since:%Y-%m-%d}..* sort:stars-asc')

In [285]:
# Function that uses requests.post to make the API call
def run_query(query, variables, max_retries=3):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    The module-level ``headers`` dict is sent with every request.  When the
    response carries a ``Retry-After`` header, the function waits for the
    announced number of seconds and then sends the request again, instead of
    handing the rate-limited response back to the caller.

    The HTTP status code is deliberately not checked: the body is returned
    as-is so that callers can inspect its ``errors`` / ``data`` keys themselves.

    :param query: GraphQL query text
    :param variables: dict of GraphQL variables sent along with the query
    :param max_retries: how many times the request is re-sent after a
        ``Retry-After`` wait; 0 reproduces the old "wait once, never retry"
        behaviour
    :return: the response body parsed by ``response.json()``
    """
    retries_left = max_retries
    while True:
        response = requests.post('https://api.github.com/graphql',
                                 json={'query': query, 'variables' : variables}, headers=headers)

        if 'Retry-After' not in response.headers:
            return response.json()

        # Rate limited: always honour the requested pause, even on the last attempt.
        print(f'[WAITING] for {response.headers["Retry-After"]} seconds before continuing...', end=' ')
        time.sleep(int(response.headers['Retry-After']))

        if retries_left <= 0:
            # Out of retries: fall back to returning the last response body.
            return response.json()
        retries_left -= 1

In [286]:
# Repository whose pull requests are collected.
owner = "astropy"
name = "astropy"

# An empty cursor means "start at the first page".
end_cursor = ""

query_string = query_filter( min_stars, last_activity, created)

# GraphQL variables shared by the paging loop.
variables = dict(
    owner=owner,
    name=name,
    end_cursor=end_cursor,
    pRsPerPage=1,  # from 1 to 100
)

In [287]:
# Fields requested for each page of pull requests.  Kept in one place so the
# first-page and follow-up-page queries cannot drift apart.  totalCount is
# requested because the paging loop reads it once the last page is reached.
_PULL_REQUEST_SELECTION = '''
          totalCount
          pageInfo {
            hasNextPage
            endCursor
          }
          nodes {
            author {
                login
            }
            id
            title
            number
            closed
            closedAt
            authorAssociation
            bodyText
            additions
            changedFiles
            deletions
            merged
            mergedAt
            state
            comments(first: 100) {
              edges {
                node {
                  id
                  author {
                    login
                  }
                  authorAssociation
                  bodyText
                  createdAt
                }
              }
            }
            reviewThreads(first:100) {
              edges {
                node {
                  comments(first:100){
                    nodes {
                      id
                      author {
                        login
                      }
                      authorAssociation
                      bodyText
                      createdAt
                    }
                  }
                }
              }
            }
          }'''

# Query skeleton; the three %s slots take the variable declarations, the
# pullRequests paging arguments and the selection set above.
_PULL_REQUEST_QUERY = '''
    query(%s) {
      repository(owner: $owner, name: $name) {
        name
        pullRequests (%s) {%s
        }
      }
    }'''


# sets up the github graphql queries
def setup_query( variables, end_cursor ) :
    """Build the pull-request query for one page.

    :param variables: dict of GraphQL variables; its ``end_cursor`` entry is
        overwritten in place with ``end_cursor`` and the same dict is returned
    :param end_cursor: cursor returned by the previous page, or ``""`` for the
        first page
    :return: ``(query_text, variables)``.  For an empty cursor the query has no
        ``after:`` argument; otherwise it declares ``$end_cursor`` and pages
        after it.
    """
    variables['end_cursor'] = end_cursor

    declarations = '$owner : String!, $name : String!, $pRsPerPage : Int'
    paging = 'first: $pRsPerPage'
    if end_cursor != "":
        # Follow-up page: declare the cursor variable and continue after it.
        declarations += ', $end_cursor : String!'
        paging += ', after: $end_cursor'

    query = _PULL_REQUEST_QUERY % (declarations, paging, _PULL_REQUEST_SELECTION)
    return (query, variables)

In [288]:
def setup_user_query() :
    """Return the GraphQL query text that looks up one user by login.

    The query takes a single ``$username`` variable and selects the user's id
    plus the ``totalCount`` of their pull requests, repositories and
    repositories contributed to.
    """
    # Spelled out line by line so the exact whitespace of the text is visible.
    query_lines = (
        "",
        "    query($username : String!) { ",
        "  user(login: $username) {",
        "    id",
        "    pullRequests(first:1) {",
        "      totalCount",
        "    }",
        "    repositories(first:1) {",
        "      totalCount",
        "    }",
        "    repositoriesContributedTo(first:1) {",
        "      totalCount",
        "    }",
        "  }",
        "}",
    )
    return "\n".join(query_lines)

In [289]:
# Runs the query and iterates through all pages of pull requests
def iterate_queries( conn, create_dict_method ):
    """Page through the pull requests of the configured repository.

    Each page is built with ``setup_query`` from the module-level ``variables``
    dict, fetched with ``run_query`` and handed to ``insert_users`` together
    with ``create_user_tuple``.  The page size (``variables['pRsPerPage']``) is
    adapted with an AIMD scheme: it grows after every successful page and is
    halved when the API reports a timeout.

    :param conn: open sqlite3 connection passed on to ``insert_users``
    :param create_dict_method: accepted for interface compatibility; it is not
        called here because the repository insert is currently disabled
    :raises RuntimeError: when the API reports an error other than a timeout
    """
    end_cursor = ""
    hasNextPage = True
    index = 0

    ai = 1 # additive increase step, slow start: 1, 2, 4, 8 (max)
    md = 0.5 # multiplicative decrease factor applied on timeout

    # NOTE(review): a response with neither 'errors' nor usable 'data' is simply
    # retried after the one-second pause below, without any upper bound.
    print("[WORKING] Running script to collect all repositories. ")
    while( hasNextPage ):
        print("[WORKING] On page " + str(index))
        query = setup_query( variables, end_cursor )
        result = run_query( query[0], query[1] )
        print(json.dumps(result))

        if 'errors' in result:
            if 'timeout' in result['errors'][0]['message']:  # reached timeout
                variables['pRsPerPage'] = int(max(1, variables['pRsPerPage'] * md))  # using AIMD
                ai = 1  # resetting slow start
                print('[WORKING] Timeout! - Ressting page size to : ' +
                      str(variables['pRsPerPage']))
            else:  # some unexpected error.
                print(result['errors'])
                # Raise instead of calling exit(): exit() is an interactive-shell
                # helper and is not a reliable way to abort inside a notebook.
                raise RuntimeError(f'GitHub API query failed: {result["errors"]}')

        if 'data' in result and result['data']:

            # insert_repositories( conn, result, create_dict_method )
            insert_users( conn, result, create_user_tuple )

            try:
                pull_requests = result["data"]["repository"]["pullRequests"]
                # if there is a next page, update the endcursor string and continue loop
                if( pull_requests["pageInfo"]["hasNextPage"] ):
                    end_cursor = pull_requests["pageInfo"]["endCursor"]

                    variables['pRsPerPage'] = min(100, variables['pRsPerPage'] + ai)  # using AIMD
                    print("[WORKING] New pRsPerPage set to : " + str(variables['pRsPerPage']))
                    ai = min(8, ai * 2)  # slow start
                else:
                    # Last page: both outcomes end the loop, only the message differs.
                    if(pull_requests['totalCount'] > 1000) :
                        print('[WORKING] We reached the limit of 1,000 repositories.')
                    else :
                        print(json.dumps(result))
                    hasNextPage = False
                    end_cursor = ""
            except KeyError:
                # Also reached when the response carries no totalCount field.
                print("[WORKING] No next page. ")
                break

            index += 1
        time.sleep(1)

In [290]:
def create_user_tuple( conn, username ) :
    """Look up one GitHub user and build the row tuple for the ``users`` table.

    :param conn: accepted for interface compatibility; not used here
    :param username: GitHub login to look up
    :return: ``(id, username, pull request total, repository total,
        repositories-contributed-to total)``
    :raises RuntimeError: when the API response contains an ``errors`` entry
    """
    print("[WORKING] Attempting to insert " + username + " into database. ")
    result = run_query( setup_user_query(), { 'username' : username } )

    if 'errors' in result:
        print(result['errors'])
        # Raise instead of calling exit(): exit() is an interactive-shell helper
        # and is not a reliable way to abort inside a notebook.
        raise RuntimeError(f'GitHub user query for {username!r} failed: {result["errors"]}')

    user = result['data']['user']
    return (
        user['id'],
        username,
        user['pullRequests']['totalCount'],
        user['repositories']['totalCount'],
        user['repositoriesContributedTo']['totalCount'],
    )

In [291]:
def _author_login( node ):
    """Return the login of ``node``'s author, or None when no author is present."""
    author = node.get('author')
    return author['login'] if author else None


# insert user tuples into the sqlite database
def insert_users( conn, result, create_user_method ):
    """Insert every user that appears on one page of pull requests.

    Logins are collected from the pull request authors, the authors of their
    comments and the authors of their review-thread comments.  Entries whose
    ``author`` is null are skipped instead of crashing the run.  Each distinct
    login is looked up once through ``create_user_method`` and the resulting
    rows are written with ``INSERT or IGNORE``.

    :param conn: open sqlite3 connection; the insert is committed on it
    :param result: decoded response holding
        ``data.repository.pullRequests.nodes``
    :param create_user_method: callable ``(conn, login) -> 5-tuple`` matching
        the column list of the INSERT below
    :return: the cursor's ``lastrowid``; the sqlite3 documentation states that
        ``executemany`` leaves it unchanged, so it should not be relied upon
    """
    logins = set()
    for pr_node in result['data']['repository']['pullRequests']['nodes']:
        # pull request author
        logins.add( _author_login( pr_node ) )

        # comment authors
        for comm_edge in pr_node['comments']['edges']:
            logins.add( _author_login( comm_edge['node'] ) )

        # review comment authors
        for review_edge in pr_node['reviewThreads']['edges']:
            for review_comm_node in review_edge['node']['comments']['nodes']:
                logins.add( _author_login( review_comm_node ) )

    # Entries without an author contributed None; there is nothing to look up for them.
    logins.discard( None )

    # Sorted so that the lookups happen in a reproducible order.
    data_list = [ create_user_method( conn, login ) for login in sorted( logins ) ]

    sql = """INSERT or IGNORE INTO users (id, name, pr_total, repo_total, repo_contributed_total) VALUES
                     (?, ?, ?, ?, ?)"""

    cur = conn.cursor()
    try:
        cur.executemany(sql, data_list)
        conn.commit()
        last_row_id = cur.lastrowid
    finally:
        # Release the cursor even when the insert fails.
        cur.close()
    print("[SUCCESS] Inserted users into the database. ")
    return last_row_id

In [292]:
def create_repo_tuple( node ) :
    """Flatten one repository node into a 20-field row tuple.

    Missing optional data is replaced by placeholders: a commit total of
    ``None`` becomes 0, a missing primary language becomes the string
    ``"null"``, and a missing license becomes ``("0", "null", False)``.

    :param node: dict describing one repository
    :return: tuple of 20 values, ending with the three license fields and the url
    """
    history_total = node["commits"]["target"]["history"]["totalCount"]
    commit_count = history_total if history_total is not None else 0

    language = node["primaryLanguage"]
    language_name = language["name"] if language is not None else "null"

    license_info = node["licenseInfo"]
    if license_info is None:
        license_fields = ("0", "null", False)
    else:
        license_fields = (
            license_info["id"],
            license_info["name"],
            license_info["pseudoLicense"],
        )

    leading_fields = (
        node["id"],
        node["name"],
        node["owner"]["login"],
        node["createdAt"],
        node["isMirror"],
        node["isFork"],
        node["diskUsage"],
        language_name,
        node["contributors"]["totalCount"],
        node["watchers"]["totalCount"],
        node["stargazers"]["totalCount"],
        node["forkCount"],
        node["issues"]["totalCount"],
        commit_count,
        node["pullRequests"]["totalCount"],
        node["releases"]["totalCount"],
    )
    return leading_fields + license_fields + (node["url"],)

In [293]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored in ``db_file``.

    A sqlite3 error while connecting is printed rather than raised.

    :param db_file: database file
    :return: Connection object, or None when connecting failed
    """
    try:
        connection = sqlite3.connect(db_file)
    except Error as err:
        print(err)
        connection = None
    return connection

In [294]:
def create_table(conn, create_table_sql):
    """Run a CREATE TABLE statement on the given connection.

    A sqlite3 error raised while doing so is printed rather than propagated.

    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return: None
    """
    try:
        conn.cursor().execute(create_table_sql)
    except Error as err:
        print(err)

In [295]:
# insert repository tuples into the sqlite database
def insert_repositories( conn, result, create_dict_method ):
    """Write the repositories found in one search result page to the database.

    :param conn: open sqlite3 connection; the insert is committed on it
    :param result: decoded response holding ``data.search.nodes``
    :param create_dict_method: callable turning one node into the 20-value row
        expected by the INSERT below
    :return: the cursor's ``lastrowid``
    """
    rows = [ create_dict_method( repo_node ) for repo_node in result["data"]["search"]["nodes"] ]

    # Rows whose insert would violate a constraint are skipped by "or IGNORE".
    insert_sql = """INSERT or IGNORE INTO repositories (id, name, owner, start_date, isMirror, isFork, diskUsage, 
                     primaryLanguage, numContributors, watchers, stars, forks, issues, commits, 
                     pullRequests, releases, license_id, license_name, pseudoLicense, url) VALUES
                     (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""

    cursor = conn.cursor()
    cursor.executemany(insert_sql, rows)
    conn.commit()
    print("[SUCCESS] Inserted repos into the database. ")
    cursor.close()
    return cursor.lastrowid

In [296]:
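# name of the sqlite file
#database = r".\databases\github_data.db"

# create a database connection
# NOTE: the cells below use `conn`. With these two assignments commented out, a
# fresh kernel fails with NameError, so re-enable them (adjusting the path as
# needed) before running the notebook from top to bottom.
#conn = create_connection(database)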

In [297]:
# sql for creating the pull request, comment and user tables
#
# Foreign keys are declared with a column-level REFERENCES clause.  The earlier
# misspelled "FORIEGN KEY" was accepted by SQLite only as part of the column's
# type name, so it declared no relationship at all.  SQLite does not enforce
# foreign keys unless "PRAGMA foreign_keys = ON" is issued on the connection.
#
# mergedAt is nullable because a pull request that has not been merged has no
# merge timestamp.
sql_create_prs_table = """CREATE TABLE IF NOT EXISTS pull_requests (
                                        id text PRIMARY KEY NOT NULL,
                                        repo_id text NOT NULL REFERENCES repositories (id),
                                        user_id text NOT NULL REFERENCES users (id),
                                        author text NOT NULL,
                                        number text NOT NULL,
                                        closed boolean NOT NULL,
                                        authorAssoc text NOT NULL,
                                        bodyText text NOT NULL,
                                        additions int NOT NULL,
                                        deletions int NOT NULL,
                                        changedFiles int NOT NULL,
                                        merged boolean NOT NULL,
                                        mergedAt text,
                                        state text NOT NULL
                                    ); """

sql_create_comments_table = """CREATE TABLE IF NOT EXISTS comments (
                                        id text PRIMARY KEY NOT NULL,
                                        pr_id text NOT NULL REFERENCES pull_requests (id),
                                        user_id text NOT NULL REFERENCES users (id),
                                        author text NOT NULL,
                                        authorAssoc text NOT NULL,
                                        bodyText text NOT NULL,
                                        createdAt text NOT NULL
                                    ); """

sql_create_users_table = """CREATE TABLE IF NOT EXISTS users (
                                        id text PRIMARY KEY NOT NULL,
                                        name text NOT NULL,
                                        pr_total int NOT NULL,
                                        repo_total int NOT NULL,
                                        repo_contributed_total int NOT NULL
                                    ); """

# `conn` must already hold an open sqlite3 connection (see create_connection).
# Because of IF NOT EXISTS, tables that already exist keep their old definition.
with conn:
    create_table(conn, sql_create_prs_table)
    create_table(conn, sql_create_comments_table)
    create_table(conn, sql_create_users_table)

In [298]:
# when conn is valid
with conn:
    # Page through the pull requests of the configured repository and store the
    # users found on each page.
    # NOTE(review): create_repo_tuple is passed as the tuple builder, but the
    # visible iterate_queries never calls its create_dict_method argument.
    iterate_queries( conn, create_repo_tuple )

[WORKING] Running script to collect all repositories. 
[WORKING] On page 0


UnboundLocalError: local variable 'review_comm_node' referenced before assignment